In [None]:
# Execution environment selector: 'local' or 'colab'. Later cells branch on this
# value to decide whether to pip-install packages and mount Google Drive.
#run_in = 'local'
run_in = 'colab'

In [None]:
if run_in == 'local':
  import json
  import matplotlib.pyplot as plt
  import IPython
  import os
  import networkx as nx
  import matplotlib.pyplot as plt
  import pprint
  pp = pprint.PrettyPrinter(indent=4)
  import datetime
  import time
  import collections
  import pandas as pd
  import tweepy
  print('Local runtime has packages')
  
if run_in == 'colab':
  !pip install networkx[all]
  !pip install matplotlib
  !pip install tweepy
  import json
  import matplotlib.pyplot as plt
  import IPython
  import os
  import networkx as nx
  import matplotlib.pyplot as plt
  import pprint
  pp = pprint.PrettyPrinter(indent=4)
  import datetime
  import time
  import collections
  import pandas as pd
  import tweepy
  import sys
  sys.setrecursionlimit(3000)

In [None]:
# Run configuration: version tag (prefixes the output sheet names), the base
# path for the selected runtime, and a sanity check of the input workbook.
version_id = "V14"

if run_in == 'local':
  base_path = "/users/USERNAME/Google Drive/USERNAME/"
elif run_in == 'colab':
  base_path = "/content/drive/My Drive/USERNAME/"
  from google.colab import drive
  drive.mount('/content/drive')
  print()
else:
  # Fail here rather than with a NameError on base_path further down.
  raise ValueError("run_in must be 'local' or 'colab', got " + repr(run_in))

print('\033[1m' +  'base_path set to:' + '\033[0m')
print(base_path)
print()

investors_filepath = base_path + "Analysis/SNA/company_investor_edgelist_attributes.xlsx"

print('\033[1m' +  'investors_filepath set to:' + '\033[0m')
print(investors_filepath)
investors_file_exists = os.path.isfile(investors_filepath)
print("investors_filepath exists:", investors_file_exists)
if not investors_file_exists:
  # Same exception type the size lookup would have raised, with a clearer hint.
  raise FileNotFoundError(
      "Input workbook not found at " + investors_filepath
      + " - check base_path and that the drive is mounted/synced")
print(os.path.getsize(investors_filepath)/1000000, "MB")
# NOTE: getctime is the creation time on Windows but the last metadata change
# on Unix-like systems, so "Created at" is only approximate there.
timestamp = os.path.getctime(investors_filepath) # seconds since the epoch
creation_time = datetime.datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
print("Created at:",creation_time)
print()

# Graph import

In [None]:
# Load the edge list: columns A and B of the 'Connections' sheet. Reuses
# investors_filepath (the same workbook that was checked above) so the path is
# defined in exactly one place.
df_company2investor = pd.read_excel(investors_filepath, sheet_name='Connections', usecols="A,B")
df_company2investor.head()

In [None]:
# Spot check: list every edge whose 'From' value is "ai corporation".
# (Swap the literal, e.g. for "3co", to inspect a different source node.)
is_selected = df_company2investor['From'] == "ai corporation"
df_show = df_company2investor.loc[is_selected]
df_show

In [None]:
# Turn the edge DataFrame into row-wise lists, then into 2-tuples that
# networkx accepts as edges (first column -> second column).
company2investor_list = df_company2investor.values.tolist()
edge_list = [(edge[0], edge[1]) for edge in company2investor_list]

# Show the size and a small sample instead of echoing the whole list.
print(len(edge_list), "edges")
edge_list[:10]

In [None]:
# Build an undirected graph from the company/investor edge list.
company2investors_graph = nx.Graph(name = "company2investors_graph")
company2investors_graph.add_edges_from(edge_list)
print('Successfully created graph of investors around the startups')
# nx.info() was removed in NetworkX 3.0, and the unpinned install can pull that
# version, so report the basic summary explicitly instead.
print("Name:", company2investors_graph.name)
print("Number of nodes:", company2investors_graph.number_of_nodes())
print("Number of edges:", company2investors_graph.number_of_edges())
print(datetime.datetime.now())

## Decide if we want to remove nodes under a certain degree

show removable nodes

In [None]:
# Nodes whose degree is below min_degree are candidates for removal. With the
# threshold at 0 nothing qualifies (a degree is never negative), so pruning is
# effectively switched off; raise min_degree to enable it.
min_degree = 0
remove = [node for node, degree in company2investors_graph.degree() if degree < min_degree]
print(len(remove))
print(remove[:10])

remove nodes from graph

In [None]:
# Drop the nodes selected for removal (nothing happens when `remove` is empty).
company2investors_graph.remove_nodes_from(remove)
# Preview the ten highest degrees rather than echoing the degree of every node.
sorted(company2investors_graph.degree(), key=lambda pair: pair[1], reverse=True)[:10]

show current nodes

In [None]:
# Report how many nodes remain in the graph after the pruning step.
# (The former bare `nodes()` call was not the cell's last expression, so its
# result was discarded; it has been dropped.)
print("we have " + str(company2investors_graph.number_of_nodes()) + " nodes left for analysis")

# Analyse

 Create company statsdict

In [None]:
# Unique first-column values of the edge rows (the company side), sorted.
company_node_list = sorted({edge[0] for edge in company2investor_list})
print(company_node_list[:10])

In [None]:
# One stats record per company, seeded with its node-type tag; later cells add
# the degree and centrality values to these records.
company_statsdict = {node: {"tag": "company"} for node in company_node_list}

In [None]:
# Inspect the freshly seeded company records (prints every entry in the dict).
pp.pprint(company_statsdict)

 Create investor statsdict

In [None]:
# Unique second-column values of the edge rows (the investor side), sorted.
investor_node_list = sorted({edge[1] for edge in company2investor_list})
print(investor_node_list[:10])

In [None]:
# One stats record per investor, seeded with its node-type tag; later cells add
# the degree and centrality values to these records.
investor_statsdict = {node: {"tag": "investor"} for node in investor_node_list}

In [None]:
# Inspect the freshly seeded investor records (prints every entry in the dict).
pp.pprint(investor_statsdict)

## Degree

Total degree

In [None]:
# Store each node's total degree on its stats record, highest degree first.
degree = sorted(company2investors_graph.degree, key=lambda x: x[1], reverse=True)

for name, node_degree in degree:
    # The two checks are independent: a name that occurs both as a company and
    # as an investor gets its degree written to both dicts, so the log lines
    # state only what was actually matched.
    if name in investor_statsdict:
        print(name, "is in investors_statsdict")
        investor_statsdict[name]["c_and_i_degree"] = node_degree
    if name in company_statsdict:
        print(name, "is in company_statsdict")
        company_statsdict[name]["c_and_i_degree"] = node_degree

## Centrality metrics
Reference: https://networkx.org/documentation/stable/reference/algorithms/centrality.html

### Degree Centrality

In [None]:
# Degree centrality for every node, copied onto whichever stats record(s)
# carry that node's name.
d_centrality = nx.degree_centrality(company2investors_graph)

for node, score in d_centrality.items():
  for statsdict in (investor_statsdict, company_statsdict):
    if node in statsdict:
      statsdict[node]["c_and_i_degree_centrality"] = float(score)

### Betweenness Centrality

In [None]:
# Inspect the company records as populated so far (prints every entry).
pp.pprint(company_statsdict)

In [None]:
# Betweenness centrality for every node; the timestamps printed before and
# after bracket the computation so its duration can be read off.
print(datetime.datetime.now())
b_centrality = nx.betweenness_centrality(company2investors_graph)
print(datetime.datetime.now())

for node, score in b_centrality.items():
  for statsdict in (investor_statsdict, company_statsdict):
    if node in statsdict:
      statsdict[node]["c_and_i_betweenness_centrality"] = float(score)

### Eigenvector Centrality

In [None]:
# Eigenvector centrality for every node; the timestamps printed before and
# after bracket the computation so its duration can be read off.
print(datetime.datetime.now())
e_centrality = nx.eigenvector_centrality(company2investors_graph)
print(datetime.datetime.now())

for node, score in e_centrality.items():
  for statsdict in (investor_statsdict, company_statsdict):
    if node in statsdict:
      statsdict[node]["c_and_i_eigenvector_centrality"] = float(score)

In [None]:
# Inspect the investor records with all metrics attached (prints every entry).
pp.pprint(investor_statsdict)

### Turn into dataframe

In [None]:
# Investor records -> DataFrame: one row per investor, the name moved out of
# the index into an "investor_name" column, ordered by degree (highest first).
df_investor = (
    pd.DataFrame.from_dict(investor_statsdict, orient='index')
    .reset_index(level=0)
    .sort_values(by=['c_and_i_degree'], ascending=False)
    .rename(index=str, columns={"index" : "investor_name"})
)
df_investor.head()

In [None]:
#df_companyandinvestor = pd.DataFrame.from_dict(investors_statsdict)
df_company = pd.DataFrame.from_dict(company_statsdict, orient='index')
df_company.reset_index(level=0, inplace=True)
df_company = df_company.sort_values(by=['c_and_i_degree'],ascending=False)
df_company = df_company.rename(index=str, columns={"index" : "company_name"})
df_company.head()

# Save results

In [None]:
#write the analysis dataframe to .xlsx file    
with pd.ExcelWriter(base_path + "Analysis/SNA/SNA investors and companies.xlsx") as writer:
    df_investor.to_excel(writer, sheet_name= version_id +"df_investor_SNA", index=False)
    df_company.to_excel(writer, sheet_name= version_id +"df_company_SNA", index=False)