In [1]:
import pickle
import copy
import warnings
warnings.filterwarnings('ignore') # It allows you to ignore warnings from specified module

import pandas as pd
import numpy as np
import networkx as nx

In [21]:
fp = open('./data/twitch_missing_edges_final.p', "rb")
missing_edges = pickle.load(fp)
type(missing_edges)

set([(20811, 24935), (12309, 34516), (46724, 62691), (29626, 22061), (77499, 13481), (31267, 11118), (73909, 55061), (56234, 2449), (7448, 69692), (37447, 35032), (21773, 35046), (40068, 79801), (34136, 9793), (49632, 20444), (38016, 3803), (64490, 10923), (14631, 34243), (52506, 78896), (73921, 5234), (42185, 25779), (77552, 73170), (78609, 80772), (32811, 46093), (23965, 11249), (34650, 60436), (37734, 77119), (40546, 49721), (50754, 42831), (73589, 20081), (30407, 46717), (61339, 30994), (17622, 40323), (37469, 30578), (17975, 20202), (53022, 31822), (56985, 43000), (38542, 45932), (72928, 13579), (15244, 3236), (76216, 11117), (4867, 52349), (18885, 25793), (39278, 52221), (10630, 58563), (48769, 19834), (64114, 3888), (26963, 49581), (20260, 55989), (76376, 73526), (16016, 10540), (39057, 47026), (46815, 68358), (76746, 52102), (5963, 55386), (42362, 42439), (31237, 55268), (15812, 31141), (22725, 33676), (23825, 37812), (4239, 67325), (58525, 5827), (36851, 76672), (5114, 34637),

In [22]:
#df_neg - missing edges
#df_pos - present edges

df_neg = pd.DataFrame(list(missing_edges), columns=['Source', 'Destination']) # stores as a table format 
print df_neg.shape
df_neg.head(1)

(35324, 2)


Unnamed: 0,Source,Destination
0,20811,24935


In [23]:
df_pos = pd.read_csv('./data/musae_ENGB_edges.csv')
df_pos = df_pos.rename(columns = {'from':'Source', 'to':'Destination'})
df_pos = df_pos.drop_duplicates()
print df_pos.shape
print type(df_pos)
df_pos.head(1)

(35324, 2)
<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Source,Destination
0,6194,255


In [5]:
df_pos['Class'] = 1
df_pos.head(5)

Unnamed: 0,Source,Destination,Class
0,6194,255,1
1,6194,980,1
2,6194,2992,1
3,6194,2507,1
4,6194,986,1


In [6]:
df_neg['Class'] = 0
df_neg.head(5)

Unnamed: 0,Source,Destination,Class
0,34851,11554,0
1,41718,27960,0
2,68334,63109,0
3,37046,47985,0
4,28557,43173,0


In [7]:
frames = [df_pos, df_neg]
df = pd.concat(frames)
print(df.shape)
df.head(5)

(70648, 3)


Unnamed: 0,Source,Destination,Class
0,6194,255,1
1,6194,980,1
2,6194,2992,1
3,6194,2507,1
4,6194,986,1


In [8]:
df.to_csv('./data/twitch_subset.csv')

In [9]:
df_copy = copy.deepcopy(df)

## Feature Extraction

In [24]:
g = nx.from_pandas_edgelist(df[['Source','Destination']], source='Source', target='Destination',create_using=nx.DiGraph())
print nx.info(g)


Name: 
Type: DiGraph
Number of nodes: 50173
Number of edges: 70648
Average in degree:   1.4081
Average out degree:   1.4081


In [29]:
# PageRank computes a ranking of the nodes in the graph G based on the structure of the incoming links
pr = nx.pagerank(g)
df['Page_Rank_Src'] = df.Source.apply(lambda row: pr.get(row))
df['Page_Rank_Dst'] = df.Destination.apply(lambda row: pr.get(row))
print df

       Source  Destination  Class  Page_Rank_Src  Page_Rank_Dst  \
0        6194          255      1       0.000010       0.000022   
1        6194          980      1       0.000010       0.000014   
2        6194         2992      1       0.000010       0.000014   
3        6194         2507      1       0.000010       0.000023   
4        6194          986      1       0.000010       0.000013   
5        6194         4003      1       0.000010       0.000021   
6           0           82      1       0.000010       0.000019   
7          15          343      1       0.000027       0.000014   
8          15         4282      1       0.000027       0.000040   
9          15         5442      1       0.000027       0.000035   
10         15         1162      1       0.000027       0.000030   
11         15         3401      1       0.000027       0.000727   
12       7106         6211      1       0.000010       0.000012   
13       7106         6611      1       0.000010       0.00029

In [12]:
# Shortest Path
def get_shortest_path(x, y):
    d = -1
    try:
        if g.has_edge(x, y):
            g.remove_edge(x, y)
            d = nx.shortest_path_length(g, source=x, target=y)
            g.add_edge(x, y)
        else:
            d = nx.shortest_path_length(g, source=x, target=y)
    except:
        d = -1
    return d

df['Shortest_Path'] = df.apply(lambda row: get_shortest_path(row['Source'], row['Destination']), axis = 1)

In [20]:
# Follows Back
def get_follows_back(x, y):
    return 1 if g.has_edge(y, x) else 0

df['Follows_Back'] = df.apply(lambda row: get_follows_back(row['Source'], row['Destination']), axis = 1)

In [37]:
# Follow Features
followers_src, followers_dst, followees_src, followees_dst, int_followers, int_followees = [], [], [], [], [], []

for i, r in df.iterrows():
    # A predecessor of n is a node m such that there exists a directed edge from m to n ( returns an iterator )
    pre_src = set(g.predecessors(r['Source'])) if set(g.predecessors(r['Source'])) else set()
    suc_src = set(g.successors(r['Source'])) if set(g.successors(r['Source'])) else set()

    pre_dst = set(g.predecessors(r['Destination'])) if set(g.predecessors(r['Destination'])) else set()
    suc_dst = set(g.successors(r['Destination'])) if set(g.successors(r['Destination'])) else set()

    followers_src.append(len(pre_src))
    followees_src.append(len(suc_src))

    followers_dst.append(len(pre_dst))
    followees_dst.append(len(suc_dst))

    int_followers.append(len(pre_src.intersection(pre_dst)))
    int_followees.append(len(suc_src.intersection(suc_dst)))
        
df['Followers_Src'] = followers_src
df['Followees_Src'] = followees_src
df['Followers_Dst'] = followers_dst
df['Followees_Dst'] = followees_dst
df['Int_Followers'] = int_followers
df['Int_Followees'] = int_followees




In [32]:
df

Unnamed: 0,Source,Destination,Class,Page_Rank_Src,Page_Rank_Dst,Shortest_Path,Follows_Back,Followers_Src,Followees_Src,Followers_Dst,Followees_Dst,Int_Followers,Int_Followees
0,6194,255,1,0.000010,0.000022,-1,0,0,5,0,3,0,0
1,6194,980,1,0.000010,0.000014,4,0,0,5,2,15,0,1
2,6194,2992,1,0.000010,0.000014,4,0,0,5,3,0,0,0
3,6194,2507,1,0.000010,0.000023,2,0,0,5,8,14,0,0
4,6194,986,1,0.000010,0.000013,24,0,0,5,1,21,0,0
5,6194,4003,1,0.000010,0.000021,4,0,0,5,4,3,0,0
6,0,82,1,0.000010,0.000019,-1,0,0,0,0,1,0,0
7,15,343,1,0.000027,0.000014,-1,0,0,3,0,17,0,1
8,15,4282,1,0.000027,0.000040,4,0,0,3,16,15,0,0
9,15,5442,1,0.000027,0.000035,5,0,0,3,5,0,0,0


In [16]:
df.to_csv('./data/twitch_final_dataset.csv')