In [1]:
import networkx as nx
import matplotlib.pyplot as plt
from scipy import stats
import pandas as pd

In [2]:
def display_graph(g):
    """Draw ``g`` with heavy edges emphasised and show the figure.

    Edges whose ``'weight'`` attribute is greater than 5 are drawn thick;
    edges with weight less than or equal to 5 are drawn thin, blue and
    semi-transparent. Every edge must carry a ``'weight'`` attribute
    (``KeyError`` otherwise). Node positions come from ``nx.spring_layout``.

    Returns ``None``; the only effect is the displayed matplotlib figure.
    """
    heavy_edges, light_edges = [], []
    for src, dst, attrs in g.edges(data=True):
        w = attrs['weight']
        if w > 5:
            heavy_edges.append((src, dst))
        elif w <= 5:
            # Explicit test (not a bare else) so that weights which satisfy
            # neither comparison, such as NaN, are left undrawn.
            light_edges.append((src, dst))

    layout = nx.spring_layout(g)  # positions for all nodes

    plt.figure(figsize=(10, 10))

    # nodes
    nx.draw_networkx_nodes(g, layout, node_size=700)

    # edges: heavy ones first, then the light ones with a softer style
    edge_styles = (
        (heavy_edges, {'width': 6}),
        (light_edges, {'width': 2, 'alpha': 0.5, 'edge_color': 'b'}),
    )
    for edge_group, style in edge_styles:
        nx.draw_networkx_edges(g, layout, edgelist=edge_group, **style)

    # labels
    nx.draw_networkx_labels(g, layout, font_size=10, font_family='sans-serif')

    plt.axis('off')
    plt.show()

    return None

In [3]:
def standardize_names(g):
    """Return a copy of ``g`` with known spelling variants of node names unified.

    Only aliases that actually occur as nodes in ``g`` are handed to
    ``nx.relabel_nodes``; all other nodes keep their names.
    """
    aliases = {
        'BlackWidow': 'Black_Widow',
        'Captin_America': 'Captain_America',
        'CaptainAmerica': 'Captain_America',
        'Clint_Barton': 'Hawkeye',
        'Iron_Man': 'IronMan',
        'NickFury': 'Nick_Fury',
        'ScarletWitch': 'Scarlet_Witch',
        'WarMachine': 'War_Machine',
    }

    present = set(g.nodes())
    # Keep only the aliases that are present in this graph.
    mapping = {old: new for old, new in aliases.items() if old in present}

    return nx.relabel_nodes(g, mapping)

In [4]:
def combine_weighted_graphs(g,h):
    '''Merge two undirected weighted graphs, summing the weights of shared edges.

    Parameters
    ----------
    g, h : networkx.Graph
        Undirected graphs whose edges all carry a numeric ``'weight'``
        attribute (``KeyError`` is raised for an edge without one).

    Returns
    -------
    networkx.Graph
        A new graph containing every node of ``g`` and ``h`` (isolated nodes
        included) and every edge of either graph. An edge present in both
        graphs gets the sum of the two weights; any other edge keeps its
        original weight. Only the ``'weight'`` attribute is carried over.
        ``g`` and ``h`` are not modified.

    Nodes and edges are inserted in the iteration order of ``g`` followed by
    ``h``, so the result is identical from run to run and node labels do not
    need to be orderable.
    '''
    combined = nx.Graph()

    # Union of the node sets first, so nodes without edges are not lost.
    combined.add_nodes_from(g.nodes)
    combined.add_nodes_from(h.nodes)

    for source in (g, h):
        for u, v, data in source.edges(data=True):
            weight = data['weight']
            if combined.has_edge(u, v):
                # Edge already contributed by the other graph: accumulate.
                combined[u][v]['weight'] += weight
            else:
                combined.add_edge(u, v, weight=weight)

    return combined

In [5]:
# Load the first edge list (third column parsed as a float 'weight') and
# normalise the node names in one step.
a1 = standardize_names(
    nx.read_edgelist('Avengers1_edge_list.txt', data=(('weight', float),))
)

In [6]:
# Load the second edge list (third column parsed as a float 'weight') and
# normalise the node names in one step.
a2 = standardize_names(
    nx.read_edgelist('Avengers2_edge_list.txt', data=(('weight', float),))
)

In [7]:
# Merge the two graphs into one; edges present in both have their weights added.
combined = combine_weighted_graphs(a1,a2)

In [8]:
# Display the node labels of the merged graph.
combined.nodes

NodeView(('Hulk', 'IronMan', 'Black_Widow', 'Nick_Fury', 'Captain_America', 'Hawkeye', 'Thor', 'Loki', 'Professor_Erik_Selvig', 'Agent_Hill', 'Phil_Coulson', 'Jarvis', 'Jane', 'Pepper_Potts', 'War_Machine', 'Vision', 'Scarlet_Witch', 'Falcon'))

In [13]:
def common_neighbor_distance (g, x, y) :
    """Return the Common Neighbor Distance between nodes ``x`` and ``y`` of ``g``.

    With ``cn`` the number of common neighbours:

    * ``cn > 0``  -> ``(1 + cn) / 2``
    * ``cn == 0`` -> ``1 / d(x, y)``, where ``d`` is the shortest-path length

    If there is no path between ``x`` and ``y`` the distance is infinite, so
    the score is ``0.0`` (previously ``NetworkXNoPath`` propagated to the
    caller).

    Raises ``ZeroDivisionError`` when the shortest-path length is 0, i.e. when
    ``x == y`` and that node has no neighbours.
    """
    cn = len(list(nx.common_neighbors(g, x, y)))
    if cn > 0:
        return (1 + cn)/2

    try:
        distance = nx.shortest_path_length(g, source = x, target = y)
    except nx.NetworkXNoPath:
        # Different connected components: 1/infinity.
        return 0.0
    return 1/distance
    
def sorenson_index (g, x, y) :
    """Return the Sorenson Index of nodes ``x`` and ``y`` in ``g``.

    Computed as ``2 * cn / (deg(x) + deg(y))`` where ``cn`` is the number of
    common neighbours. When both nodes have degree 0 the ratio is undefined;
    ``0.0`` is returned in that case instead of raising ``ZeroDivisionError``.
    """
    cn = len(list(nx.common_neighbors(g, x, y)))
    total_degree = g.degree(x) + g.degree(y)
    if total_degree == 0:
        # Two isolated nodes share nothing.
        return 0.0
    return 2*cn/total_degree

def hpi(G,x,y):
    """Return the Hub Promoted Index of nodes ``x`` and ``y`` in ``G``.

    Computed as ``cn / min(deg(x), deg(y))`` where ``cn`` is the number of
    common neighbours. When either node has degree 0 the ratio is undefined;
    ``0.0`` is returned in that case instead of raising ``ZeroDivisionError``.
    """
    smaller_degree = min(G.degree(x), G.degree(y))
    if smaller_degree == 0:
        # A node without neighbours cannot share any.
        return 0.0
    return len(list(nx.common_neighbors(G, x, y)))/smaller_degree

In [14]:
def metric_iterator (g) :
    """Score every unordered pair of distinct nodes in ``g`` with eight link predictors.

    Returns a DataFrame with one row per pair, indexed by ``"<x>,<y>"`` (nodes
    in the iteration order of ``g``), and the columns

    ``CND`` common neighbor distance, ``PA`` preferential attachment,
    ``AA`` Adamic-Adar, ``CN`` number of common neighbours,
    ``SI`` Sorenson index, ``JI`` Jaccard coefficient,
    ``RA`` resource allocation, ``HPI`` hub promoted index.

    ``PA`` and ``CN`` are integers; every other score is rounded to three
    decimals.

    The rows are collected in a list and the frame is built once:
    ``DataFrame.append`` no longer exists in pandas 2.x, and appending row by
    row copies the whole frame on every iteration.
    """
    columns = ['CND', 'PA', 'AA', 'CN', 'SI', 'JI', 'RA', 'HPI']
    nodes = list(g)

    def pair_score(index_function, pair):
        # networkx link predictors yield (u, v, score) triples for the pairs given
        return list(index_function(g, [pair]))[0][2]

    labels = []
    records = []
    for position, u in enumerate(nodes):
        for v in nodes[position + 1:]:
            pair = (u, v)
            records.append({
                'CND': round(common_neighbor_distance(g, u, v), 3),
                'PA': int(pair_score(nx.preferential_attachment, pair)),
                'AA': round(pair_score(nx.adamic_adar_index, pair), 3),
                'CN': len(list(nx.common_neighbors(g, u, v))),
                'SI': round(sorenson_index(g, u, v), 3),
                'JI': round(pair_score(nx.jaccard_coefficient, pair), 3),
                'RA': round(pair_score(nx.resource_allocation_index, pair), 3),
                'HPI': round(hpi(g, u, v), 3),
            })
            # format() keeps the "x,y" label while also accepting non-string nodes
            labels.append("{},{}".format(u, v))

    return pd.DataFrame(records, index=labels, columns=columns)

In [15]:
# Build the table of link-prediction scores for every node pair of the merged graph.
# NOTE(review): `g` is a DataFrame here, whereas the functions above use `g` for a graph;
# later cells read this name, so it is left unchanged.
g = metric_iterator(combined)
print(g)

                             CND    PA     AA   CN     SI     JI     RA    HPI
Hulk,IronMan               3.000  78.0  2.234  5.0  0.526  0.357  0.533  0.833
Hulk,Black_Widow           3.000  54.0  2.169  5.0  0.667  0.500  0.499  0.833
Hulk,Nick_Fury             3.000  60.0  2.190  5.0  0.625  0.455  0.510  0.833
Hulk,Captain_America       3.000  54.0  2.169  5.0  0.667  0.500  0.499  0.833
Hulk,Hawkeye               3.000  42.0  2.190  5.0  0.769  0.625  0.510  0.833
...                          ...   ...    ...  ...    ...    ...    ...    ...
War_Machine,Scarlet_Witch  0.333   1.0  0.000  0.0  0.000  0.000  0.000  0.000
War_Machine,Falcon         0.333   1.0  0.000  0.0  0.000  0.000  0.000  0.000
Vision,Scarlet_Witch       0.250   1.0  0.000  0.0  0.000  0.000  0.000  0.000
Vision,Falcon              0.333   1.0  0.000  0.0  0.000  0.000  0.000  0.000
Scarlet_Witch,Falcon       0.333   1.0  0.000  0.0  0.000  0.000  0.000  0.000

[153 rows x 8 columns]


In [16]:
# Spearman rank correlation (and its p-value) between the CND and HPI columns.
stats.spearmanr(g['CND'], g['HPI'])

SpearmanrResult(correlation=0.22013721170850004, pvalue=0.006252418888523385)

In [17]:
#Correlations between each link predictor
def correlations(g):
    """Correlate every ordered pair of columns of the DataFrame ``g``.

    For each pair ``(x, y)`` of columns (including ``x == y`` and both
    orderings) the Spearman, Kendall and Pearson correlation coefficients and
    their p-values are computed with ``scipy.stats``.

    Returns a DataFrame indexed by ``"<x>,<y>"`` with the columns
    ``correlation_spearman``, ``correlation_kendall``, ``correlation_pearson``,
    ``pvalue_spearman``, ``pvalue_kendall`` and ``pvalue_pearson``.

    Each statistic is evaluated once per pair and the frame is built in one
    go: ``DataFrame.append`` no longer exists in pandas 2.x, and appending row
    by row copies the whole frame on every iteration.
    """
    columns = ['correlation_spearman', 'correlation_kendall', 'correlation_pearson',
               'pvalue_spearman', 'pvalue_kendall', 'pvalue_pearson']
    metrics = list(g.columns)

    labels = []
    records = []
    for x in metrics:
        for y in metrics:
            # Each result is indexable as (coefficient, p-value).
            spearman = stats.spearmanr(g[x], g[y])
            kendall = stats.kendalltau(g[x], g[y])
            pearson = stats.pearsonr(g[x], g[y])
            records.append({
                'correlation_spearman': spearman[0],
                'correlation_kendall': kendall[0],
                'correlation_pearson': pearson[0],
                'pvalue_spearman': spearman[1],
                'pvalue_kendall': kendall[1],
                'pvalue_pearson': pearson[1],
            })
            labels.append("{},{}".format(x, y))

    return pd.DataFrame(records, index=labels, columns=columns)

In [18]:
# Spearman, Kendall and Pearson correlations (with p-values) for every ordered pair of metric columns.
correlations(g)

Unnamed: 0,correlation_kendall,correlation_pearson,correlation_spearman,pvalue_kendall,pvalue_pearson,pvalue_spearman
"CND,CND",1.000000,1.000000,1.000000,4.598835e-53,0.000000e+00,0.000000e+00
"CND,PA",0.820363,0.955294,0.915277,2.582880e-41,8.328882e-82,1.667763e-61
"CND,AA",0.897500,0.988054,0.950640,3.570823e-47,1.508059e-124,1.235557e-78
"CND,CN",0.968432,0.995151,0.977851,6.298543e-50,5.375183e-154,1.813472e-104
"CND,SI",0.804709,0.857684,0.889999,6.454215e-39,1.800178e-45,2.290695e-53
...,...,...,...,...,...,...
"HPI,CN",0.192765,0.345200,0.280699,3.160035e-03,1.242639e-05,4.405745e-04
"HPI,SI",0.243088,0.537764,0.333155,8.336026e-05,7.617334e-13,2.576836e-05
"HPI,JI",0.243088,0.452812,0.333155,8.336026e-05,4.183675e-09,2.576836e-05
"HPI,RA",0.185502,0.304372,0.287202,2.917017e-03,1.306852e-04,3.188304e-04
