In [1]:
import pandas as pd
import datetime as dt
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import operator
import numpy as np



In [2]:
def loadEdgelist(filename):
	# Readin graph
	g = nx.read_edgelist(filename, 
	                     delimiter="\t", 
	                     create_using  = nx.DiGraph(), 
	                     data=True)

	# Convert Weight To Float
	temp = map(lambda (x, y): (x, float(y)), 
			   nx.get_edge_attributes(g, "weight").items())
	nx.set_edge_attributes(g, "weight", dict(temp))

	# Convert Date to datetime
	temp = map(lambda (x, y): (x, dt.datetime.strptime(y, "%Y-%m-%d")), 
			   nx.get_edge_attributes(g, "date").items())
	nx.set_edge_attributes(g, "date", dict(temp))

	return g

def writeEdgelist(g, filename):
    # Convert date to string
    temp = map(lambda (x, y): (x, y.strftime('%Y-%m-%d')), 
               nx.get_edge_attributes(g, "date").items())
    nx.set_edge_attributes(g, "date", dict(temp))
    
    # Write to file
    nx.write_edgelist(g,filename, delimiter="\t", data=True)


In [2]:
G = nx.read_edgelist("../0_cleaning/max_wcc_graph.edgelist", create_using=nx.DiGraph(), delimiter="\t", data=True)

In [None]:
# post_2005_edges = pd.read_csv("../0_cleaning/max_wcc_graph.edgelist", low_memory=False)

In [40]:
dates = nx.get_edge_attributes(G, 'date')
# weights = nx.get_edge_attributes(G, 'weight')

flat_table = map(lambda x: [x[0], x[1], dates[x]] , dates)
wcc_df = pd.DataFrame(flat_table)
wcc_df.columns = ["investor", "company", "invest_date"]
wcc_df["invest_date"].apply(pd.to_datetime)
post_2005_edges = wcc_df
post_2005_edges["invest_date"] = post_2005_edges["invest_date"].apply(pd.to_datetime)
post_2005_edges = post_2005_edges.sort_values(['invest_date'])

In [42]:
total_periods = 132
rng = pd.date_range('1/1/2005', periods= total_periods, freq='M')
months_per_period = 3
print rng
for i in range(1,total_periods/months_per_period):
    period = i * months_per_period
    curr_rng = rng[:period]
    curr_start = curr_rng[0]
    curr_end = curr_rng[-1]
    mask = (post_2005_edges['invest_date'] >= curr_start) & (post_2005_edges['invest_date'] <= curr_end)
    period_snapshot = post_2005_edges[mask]
    rtn_string = curr_end.strftime('%Y%m%d')
    file_name = "../../df_wccsnapshots/cumulative/enddate" + rtn_string[:10] + ".csv"
#     file_name = file_name.encode('utf-8').strip().replace("-","")
    period_snapshot.to_csv(file_name, index='false', encoding='utf-8')

DatetimeIndex(['2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
               '2005-05-31', '2005-06-30', '2005-07-31', '2005-08-31',
               '2005-09-30', '2005-10-31',
               ...
               '2015-03-31', '2015-04-30', '2015-05-31', '2015-06-30',
               '2015-07-31', '2015-08-31', '2015-09-30', '2015-10-31',
               '2015-11-30', '2015-12-31'],
              dtype='datetime64[ns]', length=132, freq='M')


In [44]:
total_periods = 132
rng = pd.date_range('1/1/2005', periods= total_periods, freq='M')
months_per_period = 3
print rng
for i in range(1,total_periods/months_per_period):
    period = i * months_per_period
    curr_rng = rng[period - months_per_period:period]
    curr_start = curr_rng[0]
    curr_end = curr_rng[-1]
    mask = (post_2005_edges['invest_date'] >= curr_start) & (post_2005_edges['invest_date'] <= curr_end)
    period_snapshot = post_2005_edges[mask]
    rtn_string = curr_end.strftime('%Y%m%d')
    file_name = "../../df_wccsnapshots/diff/enddate" + rtn_string[:10] + ".csv"
#     file_name = file_name.encode('utf-8').strip().replace("-","")
    period_snapshot.to_csv(file_name, index='false', encoding='utf-8')

DatetimeIndex(['2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
               '2005-05-31', '2005-06-30', '2005-07-31', '2005-08-31',
               '2005-09-30', '2005-10-31',
               ...
               '2015-03-31', '2015-04-30', '2015-05-31', '2015-06-30',
               '2015-07-31', '2015-08-31', '2015-09-30', '2015-10-31',
               '2015-11-30', '2015-12-31'],
              dtype='datetime64[ns]', length=132, freq='M')


In [23]:
#create graph edge lists from the csv's generated above.

def customGraphGenerator(filename, attribute_key):
    g = nx.DiGraph()
    
    with open(filename, 'r') as f:
        for line in f.readlines():
            
            # Parse Line
            edge = line.strip().split(",")
            
            # Create Attributes
            attributes = dict(zip(attribute_key, edge[2:]))
            
            # Add Edge
            g.add_edge(edge[0], edge[1], attr_dict=attributes)

    return g

total_periods = 132
rng = pd.date_range('1/1/2005', periods= total_periods, freq='M')
months_per_period = 3
print rng
for i in range(1,total_periods/months_per_period):
    period = i * months_per_period
    curr_rng = rng[:period]
    curr_start = curr_rng[0]
    curr_end = curr_rng[-1]
    rtn_string = curr_end.strftime('%Y%m%d')
    input_file_name = "../../df_wccsnapshots/cumulative/enddate" + rtn_string[:10] + ".csv"
    output_file_name = "../../graph_edgelist_wccsnapshots/cumulative/enddate" + rtn_string[:10] + ".csv"

    attributes = ["weight", "date"] # All additiona attributes, order matters
    
    
    g = customGraphGenerator(input_file_name, attribute_key = attributes)

    # Write the edge to a NetworkX format; delimiter='\t'
    nx.write_edgelist(g, output_file_name, delimiter="\t", data=True)



DatetimeIndex(['2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
               '2005-05-31', '2005-06-30', '2005-07-31', '2005-08-31',
               '2005-09-30', '2005-10-31',
               ...
               '2015-03-31', '2015-04-30', '2015-05-31', '2015-06-30',
               '2015-07-31', '2015-08-31', '2015-09-30', '2015-10-31',
               '2015-11-30', '2015-12-31'],
              dtype='datetime64[ns]', length=132, freq='M')


In [25]:
#create graph edge lists from the csv's generated above for the differences

def customGraphGenerator(filename, attribute_key):
    g = nx.DiGraph()
    
    with open(filename, 'r') as f:
        for line in f.readlines():
            
            # Parse Line
            edge = line.strip().split(",")
            
            # Create Attributes
            attributes = dict(zip(attribute_key, edge[2:]))
            
            # Add Edge
            g.add_edge(edge[0], edge[1], attr_dict=attributes)

    return g

total_periods = 132
rng = pd.date_range('1/1/2005', periods= total_periods, freq='M')
months_per_period = 3
print rng
for i in range(1,total_periods/months_per_period):
    period = i * months_per_period
    curr_rng = rng[:period]
    curr_start = curr_rng[0]
    curr_end = curr_rng[-1]
    rtn_string = curr_end.strftime('%Y%m%d')
    input_file_name = "../../df_wccsnapshots/diff/enddate" + rtn_string[:10] + ".csv"
    output_file_name = "../../graph_edgelist_wccsnapshots/diff/enddate" + rtn_string[:10] + ".csv"

    attributes = ["weight", "date"] # All additiona attributes, order matters
    
    
    g = customGraphGenerator(input_file_name, attribute_key = attributes)

    # Write the edge to a NetworkX format; delimiter='\t'
    nx.write_edgelist(g, output_file_name, delimiter="\t", data=True)



DatetimeIndex(['2005-01-31', '2005-02-28', '2005-03-31', '2005-04-30',
               '2005-05-31', '2005-06-30', '2005-07-31', '2005-08-31',
               '2005-09-30', '2005-10-31',
               ...
               '2015-03-31', '2015-04-30', '2015-05-31', '2015-06-30',
               '2015-07-31', '2015-08-31', '2015-09-30', '2015-10-31',
               '2015-11-30', '2015-12-31'],
              dtype='datetime64[ns]', length=132, freq='M')


In [4]:
g1 = loadEdgelist("/Users/yishh/Box Sync/CS 224W/siliconvalleyanalysis/code/1_snapshotting/cumulative_snapshots/enddate_20120531.edgelist")

In [7]:
for node in g1.nodes_iter():
    print node
    print node
    break
    
g1.edges(data=True)

/organization/rsj-private-equity
/organization/rsj-private-equity


[(u'/organization/rsj-private-equity',
  u'/organization/cognitive-security',
  {'date': datetime.datetime(2011, 4, 25, 0, 0), 'weight': 350000.0}),
 (u'/organization/rsj-private-equity',
  u'/organization/beepl',
  {'date': datetime.datetime(2011, 7, 20, 0, 0), 'weight': 133333.0}),
 (u'/organization/petra-partners',
  u'/organization/new-century-hospice',
  {'date': datetime.datetime(2010, 10, 13, 0, 0), 'weight': 10000000.0}),
 (u'/organization/inventures',
  u'/organization/foruforever',
  {'date': datetime.datetime(2011, 4, 1, 0, 0), 'weight': 0.0}),
 (u'/organization/healthbox',
  u'/organization/carehubs',
  {'date': datetime.datetime(2012, 4, 4, 0, 0), 'weight': 0.0}),
 (u'/organization/healthbox',
  u'/organization/push-wellness',
  {'date': datetime.datetime(2012, 4, 4, 0, 0), 'weight': 0.0}),
 (u'/organization/healthbox',
  u'/organization/cara-health',
  {'date': datetime.datetime(2012, 4, 4, 0, 0), 'weight': 0.0}),
 (u'/organization/healthbox',
  u'/organization/united-pre

In [None]:
#calculate all shortest path lengths
# G = nx.path_graph(5)
all_path_lengths = nx.all_pairs_shortest_path_length(nx.Graph(g1))
all_path_lengths_sorted = []


all_path_lengths
# all_path_lengths
# for key in all_path_lengths:
#     p = all_path_lengths[key]
#     curr_distances =  sorted(list(zip(p.keys(),p.values())), key = lambda x: x[1], reverse = True)
#     all_path_lengths_sorted.append(curr_distances)
    
# all_path_lengths_sorted

In [None]:
#calculate number of paths between every other node
# G = nx.path_graph(5)
g_size = len(G)

all_number_paths = np.zeros((g_size,g_size))

for i in range(g_size):
    for j in range(i,g_size):
        if i == j:
            continue
        all_paths = nx.all_simple_paths(G, source=i, target=j)
        num = len(list(all_paths))
        all_number_paths[i,j] = num
        all_number_paths[j,i] = num

all_number_paths


In [None]:
#calculate number of paths between every other node
# G = nx.path_graph(5)
g_size = len(G)

all_number_paths = np.zeros((g_size,g_size))

adamic_index = nx.adamic_adar_index(G)
adamic_index_list = [p for p in adamic_index]
adamic_index_list


In [None]:
#once i have generated every graph
#for each snapshot
    #generate a graph
    #for each graph
        #get all shortest paths lengths
        #get number paths between every node
        #get adamic index
        #store it properly
        
#random link predictor
