In [None]:
# Mount Google Drive at /content/drive so the CSV inputs stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt

In [None]:
# Load the group-group and member-member edge lists from the mounted Drive folder.
df_group_edges = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/IS353/data/group-edges.csv'
)
df_member_edges = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/IS353/data/member_edges_25k.csv'
)

### Group

In [None]:
# Undirected graph over groups; the 'weight' column is carried as an edge attribute.
G = nx.from_pandas_edgelist(
    df_group_edges,
    source='group1',
    target='group2',
    edge_attr='weight',
    create_using=nx.Graph(),
)

In [None]:
# List of every distinct group id that appears on either end of an edge
group_ids_left = df_group_edges["group1"].tolist()
group_ids_right = df_group_edges["group2"].tolist()
all_group = list(set(group_ids_left + group_ids_right))

In [None]:
import itertools

# Enumerate every unordered pair of distinct groups found in all_group
# and store them as an int64 two-column frame.
pairs = list(itertools.combinations(all_group, 2))
df_all_group = pd.DataFrame(pairs, columns=['group1', 'group2']).astype(
    {'group1': 'int64', 'group2': 'int64'}
)

In [None]:
# Keep only the candidate pairs that are not already edges in the input data.
# The graph built from df_group_edges is undirected, so an edge stored as (a, b)
# must also exclude the candidate (b, a). Comparing frozensets makes the check
# orientation-independent; comparing ordered tuples let reversed edges slip through.
existing_group_pairs = {
    frozenset(pair)
    for pair in zip(df_group_edges['group1'], df_group_edges['group2'])
}
is_new_group_pair = pd.Series(
    [
        frozenset(pair) not in existing_group_pairs
        for pair in zip(df_all_group['group1'], df_all_group['group2'])
    ],
    index=df_all_group.index,  # align the mask with the frame it filters
    dtype=bool,                # keeps the mask boolean even when there are no candidates
)
df_predict = df_all_group[is_new_group_pair]

In [None]:
# Show the candidate group pairs that remain after filtering.
df_predict

Unnamed: 0,group1,group2
0,19292162,21182471
1,19292162,18288648
2,19292162,20064276
3,19292162,19275797
4,19292162,20135961
...,...,...
103735,8968172,47094
103736,8968172,19654655
103737,6121452,47094
103738,6121452,19654655


In [None]:
# Score every candidate group pair with three link-prediction indices and
# collect the results in a list of tuples.
# Each networkx scorer takes the full collection of pairs and yields
# (u, v, score) in input order, so one call per index replaces building three
# generators for every single row.
# The pairs are materialised as an array (not a one-shot generator) because the
# same collection is handed to all three scorers.
candidate_group_pairs = df_predict[['group1', 'group2']].to_numpy()

jaccard_scores = nx.jaccard_coefficient(G, candidate_group_pairs)
adamic_adar_scores = nx.adamic_adar_index(G, candidate_group_pairs)
preferential_scores = nx.preferential_attachment(G, candidate_group_pairs)

# One record per pair: (group1, group2, jaccard, adamic_adar, preferential_attachment)
predict = [
    (group1, group2, jaccard, adamic_adar[2], preferential[2])
    for (group1, group2, jaccard), adamic_adar, preferential
    in zip(jaccard_scores, adamic_adar_scores, preferential_scores)
]

In [None]:
# Collect the scored group pairs into a frame, one column per index.
prediction_columns = ['group1', 'group2', 'jaccard', 'adamic_adar', 'preferential_attachment']
df_prediction = pd.DataFrame(data=predict, columns=prediction_columns)

In [None]:
# Order the pairs by Jaccard coefficient (highest first), then renumber the rows 0..n-1.
df_prediction = df_prediction.sort_values(by='jaccard', ascending=False)
df_prediction.index = np.arange(len(df_prediction))

In [None]:
# Top 10 group pairs by Jaccard coefficient
(
    df_prediction
    .sort_values(by='jaccard', ascending=False)
    .head(n=10)
)

Unnamed: 0,group1,group2,jaccard,adamic_adar,preferential_attachment
0,25327081,18849449,1.0,0.205769,1
1,25326528,25482608,1.0,0.267546,1
2,22817838,18779992,0.527273,6.612215,1760
3,21533726,16447162,0.517241,6.934214,1932
4,18729267,18549827,0.5,0.22756,2
12,25717708,21651228,0.5,0.207112,2
17,25934999,24925734,0.5,0.624002,18
16,25810092,19680612,0.5,0.352956,2
15,5618532,16447162,0.5,6.924781,2016
5,18658376,25327081,0.5,0.205769,2


In [None]:
# Top 10 group pairs by Adamic-Adar index
(
    df_prediction
    .sort_values(by='adamic_adar', ascending=False)
    .head(n=10)
)

Unnamed: 0,group1,group2,jaccard,adamic_adar,preferential_attachment
20678,11077852,1776274,0.358696,15.689848,15625
25226,18243826,18562307,0.323864,14.739374,13416
92274,1728035,18955830,0.272321,14.392966,19184
35392,18506072,18589616,0.303191,13.658258,14500
95319,19728145,18589616,0.231441,13.073634,18200
91398,10016242,19728145,0.235043,13.069599,19474
35197,18506072,13560402,0.307692,13.062314,13485
91418,10016242,1187715,0.25,13.019278,17976
35397,18506072,18616278,0.293478,12.802463,13485
8247,20947040,18616278,0.447368,12.033592,6696


In [None]:
# Top 10 group pairs by preferential attachment
(
    df_prediction
    .sort_values(by='preferential_attachment', ascending=False)
    .head(n=10)
)

Unnamed: 0,group1,group2,jaccard,adamic_adar,preferential_attachment
91398,10016242,19728145,0.235043,13.069599,19474
92274,1728035,18955830,0.272321,14.392966,19184
95319,19728145,18589616,0.231441,13.073634,18200
91418,10016242,1187715,0.25,13.019278,17976
95322,19728145,18616278,0.200873,10.750388,16926
57171,6707902,18955830,0.191111,9.688431,16192
25120,18243826,1776274,0.192488,10.145415,16125
20678,11077852,1776274,0.358696,15.689848,15625
96407,1187715,18616278,0.236967,11.82904,15624
79028,7130232,18955830,0.21659,11.422958,15488


### Member

In [None]:
# Undirected graph over members; the 'weight' column is carried as an edge attribute.
# Note: this rebinds G, replacing the group graph built earlier in the notebook.
G = nx.from_pandas_edgelist(
    df_member_edges,
    source='member1',
    target='member2',
    edge_attr='weight',
    create_using=nx.Graph(),
)

In [None]:
# List of every distinct member id that appears on either end of an edge
member_ids_left = df_member_edges["member1"].tolist()
member_ids_right = df_member_edges["member2"].tolist()
all_member = list(set(member_ids_left + member_ids_right))

In [None]:
import itertools

# Enumerate every unordered pair of distinct members found in all_member
# and store them as an int64 two-column frame.
pairs_member = list(itertools.combinations(all_member, 2))
df_all_member = pd.DataFrame(pairs_member, columns=['member1', 'member2']).astype(
    {'member1': 'int64', 'member2': 'int64'}
)

In [None]:
# Keep only the candidate member pairs that are not already edges in the input data.
# The graph built from df_member_edges is undirected, so an edge stored as (a, b)
# must also exclude the candidate (b, a). Comparing frozensets makes the check
# orientation-independent; comparing ordered tuples let reversed edges slip through.
# Set lookups also avoid the row-wise apply(tuple, axis=1) over every candidate.
existing_member_pairs = {
    frozenset(pair)
    for pair in zip(df_member_edges['member1'], df_member_edges['member2'])
}
is_new_member_pair = pd.Series(
    [
        frozenset(pair) not in existing_member_pairs
        for pair in zip(df_all_member['member1'], df_all_member['member2'])
    ],
    index=df_all_member.index,  # align the mask with the frame it filters
    dtype=bool,                 # keeps the mask boolean even when there are no candidates
)
df_predict_member = df_all_member[is_new_member_pair]

In [None]:
# Show the candidate member pairs that remain after filtering.
df_predict_member

Unnamed: 0,member1,member2
0,198737924,73498632
1,198737924,182943766
2,198737924,216072216
3,198737924,183566364
4,198737924,234684445
...,...,...
22221106,29720532,198639587
22221107,29720532,207028216
22221108,221609952,198639587
22221109,221609952,207028216


In [None]:
# Score every candidate member pair with three link-prediction indices and
# collect the results in a list of tuples.
# Each networkx scorer takes the full collection of pairs and yields
# (u, v, score) in input order, so one call per index replaces building three
# generators for every single row.
# The pairs are kept as a compact 2-column array (not a one-shot generator)
# because the same collection is handed to all three scorers.
candidate_member_pairs = df_predict_member[['member1', 'member2']].to_numpy()

jaccard_scores_member = nx.jaccard_coefficient(G, candidate_member_pairs)
adamic_adar_scores_member = nx.adamic_adar_index(G, candidate_member_pairs)
preferential_scores_member = nx.preferential_attachment(G, candidate_member_pairs)

# One record per pair: (member1, member2, jaccard, adamic_adar, preferential_attachment)
predict_member = [
    (member1, member2, jaccard, adamic_adar[2], preferential[2])
    for (member1, member2, jaccard), adamic_adar, preferential
    in zip(jaccard_scores_member, adamic_adar_scores_member, preferential_scores_member)
]

In [None]:
# Collect the scored member pairs into a frame, one column per index.
member_prediction_columns = ['member1', 'member2', 'jaccard', 'adamic_adar', 'preferential_attachment']
df_prediction_member = pd.DataFrame(data=predict_member, columns=member_prediction_columns)

In [None]:
# Write the scored member pairs to predict.csv, omitting the index column.
df_prediction_member.to_csv(path_or_buf='predict.csv', index=False)

In [None]:
# Top 10 member pairs by Jaccard coefficient
(
    df_prediction_member
    .sort_values(by='jaccard', ascending=False)
    .head(n=10)
)

Unnamed: 0,member1,member2,jaccard,adamic_adar,preferential_attachment
11567529,183871346,45382252,1.0,0.223347,1
17509531,222643307,215317302,1.0,0.144849,1
5087140,223154086,196076811,1.0,0.171214,1
5087143,223154086,227796243,1.0,0.171214,1
5087160,223154086,220226916,1.0,0.171214,1
17509462,222643307,94861782,1.0,0.144849,1
17509456,222643307,75233722,1.0,0.144849,1
5087186,223154086,229959116,1.0,0.171214,1
5087189,223154086,151283182,1.0,0.171214,1
17509436,222643307,154433872,1.0,0.144849,1


In [None]:
# Top 10 member pairs by Adamic-Adar index
(
    df_prediction_member
    .sort_values(by='adamic_adar', ascending=False)
    .head(n=10)
)

Unnamed: 0,member1,member2,jaccard,adamic_adar,preferential_attachment
32961,234684445,6160486,0.089228,40.513492,357564
258744,6160486,5900662,0.086705,35.876661,318720
259044,6160486,53121132,0.079279,28.873096,201192
516983,207061281,211585840,0.271505,28.793302,55062
329597,85557392,73895512,0.107311,27.537759,135458
3274284,5900662,211585840,0.193483,26.492438,85120
329111,85557392,214729620,0.108802,25.92971,111106
258457,6160486,208569099,0.074718,25.308733,240036
258378,6160486,207061281,0.072193,24.322764,206172
1310673,214729620,203818557,0.233516,24.226227,44238


In [None]:
# Top 10 member pairs by preferential attachment
(
    df_prediction_member
    .sort_values(by='preferential_attachment', ascending=False)
    .head(n=10)
)

Unnamed: 0,member1,member2,jaccard,adamic_adar,preferential_attachment
233359,195657825,6160486,0.007267,5.671007,802776
233370,195657825,85557392,0.007069,3.510327,613366
220869,204669023,6160486,0.00447,2.801066,574692
258354,6160486,190939281,0.003856,2.153961,563736
258362,6160486,191758521,0.004129,2.153961,461148
233371,195657825,190939281,0.032355,17.588391,456196
258365,6160486,28573892,0.004193,2.384034,439236
71894,205193250,6160486,0.004193,2.384034,439236
220879,204669023,85557392,0.00753,3.374503,439097
328973,85557392,190939281,0.0,0.0,430726
