In [None]:
from pathlib import Path
import pandas as pd

# All input files are resolved relative to the "data" folder that sits next
# to the current working directory's parent.
DATA_DIR = Path.cwd().parent / "data"


def read_csv(rel_path: str, **kwargs):
    """Read a CSV file located under ``DATA_DIR``.

    Args:
        rel_path: Path of the file relative to ``DATA_DIR``.
        **kwargs: Passed through unchanged to ``pandas.read_csv``.

    Returns:
        Whatever ``pandas.read_csv`` returns for the resolved path.
    """
    csv_path = DATA_DIR.joinpath(rel_path)
    return pd.read_csv(csv_path, **kwargs)

# Tables read from the 20221102 folder under DATA_DIR.
Company = read_csv('20221102/Company.csv')
Deal = read_csv('20221102/Deal.csv')
DealInvestorRelation = read_csv('20221102/DealInvestorRelation.csv')
EntityTeamRelation = read_csv('20221102/EntityTeamRelation.csv')
Investor = read_csv('20221102/Investor.csv')
Person = read_csv('20221102/Person.csv')
PersonEducationRelation = read_csv('20221102/PersonEducationRelation.csv')

# Tables read from the SHANGHAIUNIVERSITY_20221102 folder under DATA_DIR.
# NOTE(review): `new_exit` is loaded from new_deal.csv (not an exit file) --
# confirm that the variable name matches the intended content.
new_person = read_csv('SHANGHAIUNIVERSITY_20221102/new_person.csv')
new_exit   = read_csv('SHANGHAIUNIVERSITY_20221102/new_deal.csv')

In [None]:
import pandas as pd
from datetime import datetime

# Parse the deal dates so the two round dates can be subtracted below.
new_exit['DealDate'] = pd.to_datetime(new_exit['DealDate'])

# One (CompanyID, date) table per round flag.
round_cols = ['CompanyID', 'DealDate']
is_first_round = new_exit['first'] == True
is_second_round = new_exit['second'] == True
first_round = new_exit.loc[is_first_round, round_cols].rename(
    columns={'DealDate': 'FirstRoundDate'})
second_round = new_exit.loc[is_second_round, round_cols].rename(
    columns={'DealDate': 'SecondRoundDate'})

# Keep only companies that appear in both tables.
merged = first_round.merge(second_round, on='CompanyID', how='inner')

# Gap between the two rounds, in whole days and in (365.25-day) years.
round_gap = merged['SecondRoundDate'] - merged['FirstRoundDate']
merged['TimeDiff_Days'] = round_gap.dt.days
merged['TimeDiff_Years'] = merged['TimeDiff_Days'] / 365.25

report_cols = ['CompanyID', 'FirstRoundDate', 'SecondRoundDate',
               'TimeDiff_Days', 'TimeDiff_Years']
result = merged[report_cols]
print(result.head(10))


In [None]:
# Reduce the relation table to unique (deal, lead partner) pairs, discarding
# rows that have no lead partner.
DealInvestorRelation = (
    DealInvestorRelation[['DealID', 'LeadPartnerID']]
    .dropna(subset=['LeadPartnerID'])
    .drop_duplicates()
)

In [None]:
# Keep team relations whose entity is a company present in Deal and that have
# a start date, parse both date columns, then trim to the columns used later.
team_columns = ['EntityID', 'PersonID', 'FullTitle', 'Location', 'StartDate', 'EndDate']
EntityTeamRelation = (
    EntityTeamRelation
    .loc[lambda frame: frame['EntityID'].isin(Deal['CompanyID'])]
    .dropna(subset=['StartDate'])
    .assign(
        StartDate=lambda frame: pd.to_datetime(frame['StartDate']),
        EndDate=lambda frame: pd.to_datetime(frame['EndDate']),
    )
)
EntityTeamRelation = EntityTeamRelation[team_columns]

In [None]:
# Attach each deal to its lead partner(s): one row per (lead partner, deal).
# Deal is read from CSV without date parsing, so DealDate is converted to
# datetime here; the graph-building cells compare it against
# datetime.datetime bounds, which fails on plain strings.
PeopleAffiliatedDealRelation = (
    pd.merge(Deal, DealInvestorRelation, on='DealID')
    [['LeadPartnerID', 'DealID', 'CompanyID', 'CompanyName',
      'DealDate', 'DealType', 'DealSize']]
    .assign(DealDate=lambda frame: pd.to_datetime(frame['DealDate']))
    .dropna(subset=['LeadPartnerID'])
    .drop_duplicates()
)

In [None]:
# Build Graph
import pickle
import tqdm
import datetime
print('*'*10+'Building Graph'+'*'*10)

# Give every company / lead partner / team member / entity ID a dense integer
# index. dict.fromkeys de-duplicates while keeping first-appearance order, so
# the mapping is the same on every run (iterating a set of string IDs is not,
# because of hash randomisation).
all_ids = list(dict.fromkeys(
    PeopleAffiliatedDealRelation['CompanyID'].tolist()
    + PeopleAffiliatedDealRelation['LeadPartnerID'].tolist()
    + EntityTeamRelation['PersonID'].tolist()
    + EntityTeamRelation['EntityID'].tolist()
))
ID2index = {raw_id: position for position, raw_id in enumerate(all_ids)}

# Add integer node indices to both relation tables; rows whose person cannot
# be mapped are dropped before the cast to int.
PeopleAffiliatedDealRelation = (
    PeopleAffiliatedDealRelation
    .assign(PersonIndex=lambda frame: frame['LeadPartnerID'].map(ID2index))
    .dropna(subset=["PersonIndex"])
    .assign(
        PersonIndex=lambda frame: frame['PersonIndex'].astype(int),
        CompanyIndex=lambda frame: frame['CompanyID'].map(ID2index).astype(int),
    )
)
EntityTeamRelation = (
    EntityTeamRelation
    .assign(PersonIndex=lambda frame: frame['PersonID'].map(ID2index))
    .dropna(subset=["PersonIndex"])
    .assign(
        PersonIndex=lambda frame: frame['PersonIndex'].astype(int),
        CompanyIndex=lambda frame: frame['EntityID'].map(ID2index).astype(int),
    )
)

# Timestep 0 holds everything dated before 2007-01-01.
timestep = 0
edges = []
all_nodes = []
graph_nodes = []
deal_edge = []  # was extended with += without ever being created (NameError)
start_date = datetime.datetime(2007, 1, 1)
# NOTE(review): this comparison needs DealDate to be datetime-typed; Deal is
# read without date parsing -- confirm it is converted before this cell.
temp_PADR = PeopleAffiliatedDealRelation.loc[PeopleAffiliatedDealRelation['DealDate'] < start_date]
temp_CTR = EntityTeamRelation.loc[EntityTeamRelation['StartDate'] < start_date]

# Edge strings are 'source+target+relation+timestep'. Deal edges use relation
# '0', team edges use the person's FullTitle (which must be a string), and
# every edge is stored in both directions. Plain comprehensions are used
# instead of DataFrame.apply(axis=1) so an empty selection yields [].
step = str(timestep)
deal_links = list(zip(temp_PADR['PersonIndex'].tolist(),
                      temp_PADR['CompanyIndex'].tolist()))
team_links = list(zip(temp_CTR['PersonIndex'].tolist(),
                      temp_CTR['CompanyIndex'].tolist(),
                      temp_CTR['FullTitle'].tolist()))
edges += [str(person)+'+'+str(company)+'+0+'+step
          for person, company in deal_links]
edges += [str(person)+'+'+str(company)+'+'+title+'+'+step
          for person, company, title in team_links]
edges += [str(company)+'+'+str(person)+'+0+'+step
          for person, company in deal_links]
edges += [str(company)+'+'+str(person)+'+'+title+'+'+step
          for person, company, title in team_links]
deal_edge += [str(company)+'+'+step for _, company in deal_links]

# Independent copies: later cells extend `edges` / `all_nodes` in place, which
# must not leak into the stored timestep-0 snapshot.
edges = list(set(edges))
graph_edges = list(edges)
new_edges = [list(edges)]
graph_nodes = [list(set(
    temp_PADR['PersonIndex'].tolist() + temp_CTR['PersonIndex'].tolist()
    + temp_PADR['CompanyIndex'].tolist() + temp_CTR['CompanyIndex'].tolist()
))]
all_nodes = list(graph_nodes[-1])


In [None]:
new_companies = []
labels = []
new_nodes_list=[]
# Grow the graph one calendar month per timestep (2007-01 .. 2022-12). For
# every month this records the newly seen nodes, the newly seen edges, the
# companies selected as "new" and a 0/1 label per selected company.
for year in range(2007, 2023):
    for month in tqdm.tqdm(range(1, 13)):
        timestep += 1
        # [start_date, end_date) is the current calendar month.
        start_date = datetime.datetime(year, month, 1)
        if month == 12:
            end_date = datetime.datetime(year + 1, 1, 1)
        else:
            end_date = datetime.datetime(year, month + 1, 1)
        # Labels look at everything dated up to one year after the month ends.
        exit_cutoff = datetime.datetime(end_date.year + 1, end_date.month, 1)
        # NOTE(review): `Exit` is not defined in any visible cell (the loading
        # cell creates `new_exit`) -- confirm which frame is meant here.
        exit_temp = Exit.loc[Exit['DealDate'] < exit_cutoff]

        deal_dates = PeopleAffiliatedDealRelation['DealDate']
        temp_PADR = PeopleAffiliatedDealRelation.loc[
            (deal_dates >= start_date) & (deal_dates < end_date)]
        team_starts = EntityTeamRelation['StartDate']
        temp_CTR = EntityTeamRelation.loc[
            (team_starts >= start_date) & (team_starts < end_date)]

        # One row per company with a deal this month, kept only when flagged
        # as 'first'.
        # NOTE(review): the column list selected for
        # PeopleAffiliatedDealRelation in the visible cells contains no 'first'
        # column -- confirm where this flag is supposed to come from.
        new_company = temp_PADR.drop_duplicates('CompanyID')
        new_company = new_company.loc[new_company['first'] == True]
        label = new_company['CompanyID'].isin(exit_temp['CompanyID'])
        new_company = new_company['CompanyIndex']
        new_companies.append(new_company.values)
        labels.append([1 if flag else 0 for flag in label])

        # Check the entry just appended (index 0 would only ever re-check the
        # first month).
        assert len(new_companies[-1]) == len(labels[-1])

        # new nodes
        nodes = temp_PADR['CompanyIndex'].tolist()+temp_CTR['CompanyIndex'].tolist(
        )+temp_PADR['PersonIndex'].tolist()+temp_CTR['PersonIndex'].tolist()
        nodes = list(set(nodes))
        all_nodes_set = set(all_nodes)
        new_nodes = [node for node in nodes if node not in all_nodes_set]
        # Rebind instead of += so the list stored in graph_nodes[0] (which
        # all_nodes may alias) is not extended in place.
        all_nodes = all_nodes + new_nodes
        graph_nodes.append(new_nodes)

        # new edges -- built into a fresh list every month. Extending `edges`
        # in place would also extend the list already stored in new_edges for
        # the previous timestep. Comprehensions (rather than
        # DataFrame.apply(axis=1).tolist()) also work for months with no rows.
        # FullTitle must be a string; a missing title raises TypeError.
        step = str(timestep)
        deal_links = list(zip(temp_PADR['PersonIndex'].tolist(),
                              temp_PADR['CompanyIndex'].tolist()))
        team_links = list(zip(temp_CTR['PersonIndex'].tolist(),
                              temp_CTR['CompanyIndex'].tolist(),
                              temp_CTR['FullTitle'].tolist()))
        step_edges = [str(person)+'+'+str(company)+'+0+'+step
                      for person, company in deal_links]
        step_edges += [str(person)+'+'+str(company)+'+'+title+'+'+step
                       for person, company, title in team_links]
        step_edges += [str(company)+'+'+str(person)+'+0+'+step
                       for person, company in deal_links]
        step_edges += [str(company)+'+'+str(person)+'+'+title+'+'+step
                       for person, company, title in team_links]

        # NOTE(review): deal_edge is not created in this cell; it has to exist
        # before the loop starts.
        deal_edge += [str(company)+'+'+step for _, company in deal_links]

        graph_edges_set = set(graph_edges)
        edges = list(set(step_edges) - graph_edges_set)
        graph_edges = list(graph_edges_set.union(edges))
        new_edges.append(edges)
print('*'*10+'Saving Data'+'*'*10)
