In [129]:
import pandas as pd
import glob
import datetime as dt
import networkx as nx
import operator
import random

In [80]:
def loadEdgelist(filename):
	# Readin graph
	g = nx.read_edgelist(filename, 
	                     delimiter="\t", 
	                     create_using  = nx.DiGraph(), 
	                     data=True)

	# Convert Weight To Float
	temp = map(lambda (x, y): (x, float(y)), 
			   nx.get_edge_attributes(g, "weight").items())
	nx.set_edge_attributes(g, "weight", dict(temp))

	# Convert Date to datetime
	temp = map(lambda (x, y): (x, dt.datetime.strptime(y, "%Y-%m-%d")), 
			   nx.get_edge_attributes(g, "date").items())
	nx.set_edge_attributes(g, "date", dict(temp))

	return g

def writeEdgelist(g, filename):
    # Convert date to string
    temp = map(lambda (x, y): (x, y.strftime('%Y-%m-%d')), 
               nx.get_edge_attributes(g, "date").items())
    nx.set_edge_attributes(g, "date", dict(temp))
    
    # Write to file
    nx.write_edgelist(g,filename, delimiter="\t", data=True)


In [14]:
df = pd.DataFrame()
for file in glob.glob("/home/charles/siliconvalleyanalysis/code/2_prediction/temp/*"):
    df = pd.concat([df, pd.read_csv(file)], ignore_index=True)

In [98]:
df[df.company == "/organization/uber"]

Unnamed: 0,investor,company,weight,date
6654,/organization/kleiner-perkins-caufield-byers,/organization/uber,150000000,2014-06-06
6668,/organization/blackrock,/organization/uber,150000000,2014-06-06
6671,/organization/summit-partners,/organization/uber,150000000,2014-06-06
6690,/organization/menlo-ventures,/organization/uber,150000000,2014-06-06
6692,/organization/wellington-management,/organization/uber,150000000,2014-06-06
6695,/organization/google-ventures,/organization/uber,150000000,2014-06-06
6708,/organization/fidelity-investments,/organization/uber,150000000,2014-06-06
21248,/organization/microsoft,/organization/uber,500000000,2015-07-31
21254,/organization/bennett-coleman-and-co-ltd,/organization/uber,500000000,2015-07-31
22287,/organization/tata-opportunities-fund,/organization/uber,50000000,2015-08-19


In [31]:
# new_edges = {}
# for i in df[["investor", "company"]].values.tolist():
#     start_node = i[0]
#     end_node = i[1]
    
#     if start_node not in new_edges:
#         new_edges[start_node] = set([end_node])
#     else:
#         new_edges[start_node].add(end_node)

In [99]:
new_edges_s = {}
for i in df[["investor", "company"]].values.tolist():
    start_node = i[0]
    end_node = i[1]

    if end_node not in new_edges_s:
        new_edges_s[end_node] = set([start_node])
    else:
        new_edges_s[end_node].add(start_node)

In [100]:
new_edges_s["/organization/uber"]

{'/organization/accelerate-it-ventures',
 '/organization/baidu',
 '/organization/bennett-coleman-and-co-ltd',
 '/organization/blackrock',
 '/organization/fidelity-investments',
 '/organization/foundation-capital',
 '/organization/goldman-sachs',
 '/organization/google-ventures',
 '/organization/kleiner-perkins-caufield-byers',
 '/organization/lone-pine-capital',
 '/organization/menlo-ventures',
 '/organization/microsoft',
 '/organization/new-enterprise-associates',
 '/organization/qatar-investment-authority',
 '/organization/sherpa-ventures',
 '/organization/summit-partners',
 '/organization/tata-capital',
 '/organization/tata-opportunities-fund',
 '/organization/times-internet',
 '/organization/valiant-capital-partners',
 '/organization/wellington-management'}

In [101]:
m = {}
for key in new_edges_s:
    print key, len(new_edges_s[key])
    m[key] = len(new_edges_s[key])


/organization/intendime 1
/organization/loop-commerce 13
/organization/drizly 6
/organization/cyrus-biotechnology 2
/organization/iron-io 3
/organization/sinbads-supply-chain 1
/organization/akippa 1
/organization/qzzr 4
/organization/framer 2
/organization/medlanes 2
/organization/solar-tower-technologies 1
/organization/smart-vision-labs 4
/organization/tongbanjie 3
/organization/peachme 1
/organization/eatlo 3
/organization/booker-software 4
/organization/form-devices 1
/organization/qdiscovery 2
/organization/basharsoft 4
/organization/telefactor-robotics-com 1
/organization/satvacart 1
/organization/cape-wind 3
/organization/avuba 2
/organization/blackwood-seven 3
/organization/the-vr-company 2
/organization/arthesis-covers 1
/organization/rant-media-network-llc 6
/organization/prevoty 2
/organization/healthexpense-inc- 6
/organization/brainquake 1
/organization/vetpronto 1
/organization/etaskr 1
/organization/transenterix 2
/organization/gigwalk 6
/organization/sokrati 1
/organiz

In [102]:
preG = loadEdgelist("../1_snapshotting/cumulative_snapshots/enddate_20140228.edgelist")
# Set node attributes. 0 is an investor, 1 is a company.
node_attributes =dict(map(lambda (x,y): (x,1) if y > 0 else (x,0), 
                      preG.in_degree().items()))
nx.set_node_attributes(preG, "bipartite", node_attributes)

In [57]:
# startup_degree = filter(lambda (n, d): node_attributes[n] == 1,
#                         preG.degree().items())
# sorted_degree_list = sorted(startup_degree, 
#                             key=operator.itemgetter(1), reverse=True)

In [168]:
investor_degree = filter(lambda (n, d): node_attributes[n] == 0,
                        preG.out_degree().items())
sorted_investor_degree_list = sorted(investor_degree, 
                            key=operator.itemgetter(1), reverse=True)

In [106]:
sorted_investor_degree_list[0:254]

[(u'/organization/goldman-sachs', 3259178984.0),
 (u'/organization/u-s-department-of-energy', 2496298333.0),
 (u'/organization/sequoia-capital', 2030354338.0),
 (u'/organization/kleiner-perkins-caufield-byers', 2002657884.0),
 (u'/organization/new-enterprise-associates', 1849830928.0),
 (u'/organization/google', 1794319046.0),
 (u'/organization/intel-capital', 1738903618.0),
 (u'/organization/accel-partners', 1509631569.0),
 (u'/organization/ta-associates', 1499729666.0),
 (u'/organization/warburg-pincus', 1284074626.0),
 (u'/organization/insight-venture-partners', 1183010778.0),
 (u'/organization/digital-sky-technologies-fo', 1127877199.0),
 (u'/organization/summit-partners', 1094343293.0),
 (u'/organization/venrock', 1039630234.0),
 (u'/organization/oak-investment-partners', 1020506703.0),
 (u'/organization/saif-partners', 939267102.0),
 (u'/organization/microsoft', 909056748.0),
 (u'/organization/ge-capital', 906828559.0),
 (u'/organization/index-ventures', 891035851.0),
 (u'/organi

In [59]:
sorted(m.items(), key=operator.itemgetter(1), reverse=True)

[('/organization/wayra', 254),
 ('/organization/500-startups', 252),
 ('/organization/y-combinator', 220),
 ('/organization/sequoia-capital', 174),
 ('/organization/techstars', 167),
 ('/organization/new-enterprise-associates', 146),
 ('/organization/accel-partners', 135),
 ('/organization/cyberport-hong-kong', 124),
 ('/organization/kima-ventures', 118),
 ('/organization/intel-capital', 115),
 ('/organization/microsoft-ventures', 112),
 ('/organization/start-up-chile', 110),
 ('/organization/plug-and-play-ventures', 105),
 ('/organization/startupbootcamp', 102),
 ('/organization/andreessen-horowitz', 101),
 ('/organization/kleiner-perkins-caufield-byers', 100),
 ('/organization/silicon-valley-bank', 99),
 ('/organization/queensbridge-venture-partners', 95),
 ('/organization/google-ventures', 95),
 ('/organization/khosla-ventures', 93),
 ('/organization/slow-ventures', 87),
 ('/organization/sv-angel', 86),
 ('/organization/first-round-capital', 85),
 ('/organization/great-oaks-venture-

In [107]:
# for investor in new_edges.keys():
#     for company in sorted_degree_list[0:m[investor]]:    
#         if company[0] in new_edges[investor]:
#             print m[investor], investor, company[0]

In [169]:
a = 0
total_m = 0
for company in new_edges_s.keys():
    total_m += m[company]
    for investor in sorted_investor_degree_list[0:m[company]]:
        if investor[0] in new_edges_s[company]:
            #print m[company], company, investor[0]
            a += 1
a

678

In [138]:
new_edges_s

{'/organization/intendime': {'/organization/telecom-italia'},
 '/organization/loop-commerce': {'/organization/paypal',
  '/organization/wicklow-capital',
  '/person/andrew-fine',
  '/person/chuck-geiger',
  '/person/dan-rose',
  '/person/dovi-frances',
  '/person/ken-seiff',
  '/person/mark-carges',
  '/person/michael-scharff',
  '/person/mohan-gyani',
  '/person/oren-zeev',
  '/person/roy-rubin',
  '/person/silas-chou'},
 '/organization/drizly': {'/organization/continental-investors',
  '/organization/fairhaven-capital-partners-2',
  '/organization/first-beverage-group',
  '/organization/polaris-partners',
  '/organization/suffolk-equity-partners',
  '/organization/vayner-rse'},
 '/organization/cyrus-biotechnology': {'/organization/the-w-fund',
  '/organization/wings'},
 '/organization/iron-io': {'/organization/bain-capital-ventures',
  '/organization/baseline-ventures',
  '/organization/divergent-ventures'},
 '/organization/sinbads-supply-chain': {'/organization/kleiner-perkins-caufi

In [157]:
def predict(company, new_edges, investors):
    predict = [random.choice(investors) for i in range(0, len(new_edges))]
    return sum(map(lambda x: 1 if x in new_edges else 0, predict))


investors = dict(investor_degree).keys()
i = 0
total_correct = 0
for company in new_edges_s.keys():
    total_correct += predict(company, new_edges_s[company], investors)
    i += 1
    if i % 1000 == 0:
        print i

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000


In [158]:
total_correct

6

In [160]:
len(df)

34128

In [163]:
6.0/34128

0.00017580872011251758

In [170]:
400.0/34128

0.011720581340834505

In [172]:
678.0/34128

0.019866385372714488

In [175]:
postG = loadEdgelist("../1_snapshotting/cumulative_snapshots/enddate_20150831.edgelist")

In [177]:
# Set node attributes. 0 is an investor, 1 is a company.
node_attributes =dict(map(lambda (x,y): (x,1) if y > 0 else (x,0), 
                      postG.in_degree().items()))
nx.set_node_attributes(postG, "bipartite", node_attributes)

In [181]:
len(filter(lambda (x,y): y == 0, node_attributes.items()))

20858

In [183]:
len(filter(lambda (x,y): y == 1, node_attributes.items()))

34101

In [189]:
investor_degree = filter(lambda (n, d): node_attributes[n] == 0,
                        postG.out_degree(weight="weight").items())
sorted_investor_degree_list = sorted(investor_degree, 
                            key=operator.itemgetter(1), reverse=True)
sorted_investor_degree_list

[(u'/organization/goldman-sachs', 6351100008.0),
 (u'/organization/alibaba', 5252092061.0),
 (u'/organization/sequoia-capital', 4419774196.0),
 (u'/organization/warburg-pincus', 3853103460.0),
 (u'/organization/kleiner-perkins-caufield-byers', 3184299724.0),
 (u'/organization/intel-capital', 3041073120.0),
 (u'/organization/accel-partners', 3012476080.0),
 (u'/organization/new-enterprise-associates', 2999719304.0),
 (u'/organization/u-s-department-of-energy', 2508098333.0),
 (u'/organization/tiger-global', 2399512480.0),
 (u'/organization/carlyle-group', 2390931891.0),
 (u'/organization/temasek', 2378122876.0),
 (u'/organization/insight-venture-partners', 2364647819.0),
 (u'/organization/digital-sky-technologies-fo', 2107724023.0),
 (u'/organization/google', 2101282379.0),
 (u'/organization/ta-associates', 1999729666.0),
 (u'/organization/kkr', 1980386480.0),
 (u'/organization/summit-partners', 1920447459.0),
 (u'/organization/softbank', 1816094783.0),
 (u'/organization/technology-cros