In [1]:
import sys
sys.path.insert(0, ".")
import pandas as pd
import numpy as np
import pickle
import networkx as nx
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from utils.load_numpy_dataset import load_numpy_data
import itertools


In [2]:
path = "datasets_pp/persona/CA-HepTh"
index, train, train_neg, test, test_neg = load_numpy_data(path)

original_num_nodes for CA-HepTh is: 17429
largest_cc_num_nodes for CA-HepTh is: 3348
num_pos_test_edges for CA-HepTh is: 5331
num_neg_test_edges for CA-HepTh is: 5331
num_pos_train_edges for CA-HepTh is: 5330
num_neg_train_edges for CA-HepTh is: 5330


In [4]:
print(train.shape,test.shape)

(5330, 2) (5331, 2)


In [5]:
data = np.concatenate([train,test],axis=0)

In [8]:
arr, uniq_cnt = np.unique(data, axis=0, return_counts=True)
uniq_arr = arr[uniq_cnt==1]

In [11]:
np.savetxt("CA-HepTH.csv", uniq_arr.astype(int),  fmt='%i',delimiter=",")

In [None]:
!mv CA-HepTH.csv ../Splitter/input/
!python src/main.py --edge-path ../Splitter/input/CA-HepTh_edges.csv --embedding-output-path ../Splitter/output/CA-HepTh_embedding.csv --persona-output-path ../Splitter/output/CA-HepTh_personas.json
!cp ../Splitter/output/CA-HepTH_embedding.csv datasets_pp/
!cp ../Splitter/output/CA-HepTH_personas.json datasets_pp/

In [12]:
random_state = 42

In [13]:
def graph_edge(data_train,data_train_ng,if_neg=True):
    df = pd.DataFrame(data_train,columns=['source','target'])
    df['weight'] = np.ones(shape=(data_train.shape[0],))
    df2=pd.DataFrame(data_train_ng,columns=['source','target'])
    if if_neg:
        df2['weight'] = np.zeros(shape=(data_train_ng.shape[0],))
    else:
        df2['weight'] = np.ones(shape=(data_train_ng.shape[0],))
    df = df.append(df2)
    # shuffle the DataFrame rows
    df = df.sample(frac = 1,random_state=random_state)
    # reset the index
    df = df.reset_index(drop=True)
    G = nx.from_pandas_edgelist(df, create_using=nx.Graph(),edge_attr=True)
    return G,df

In [14]:
G_train,df_train =graph_edge(train,train_neg)
G_test,df_test =graph_edge(test,test_neg)

In [15]:
print(len(G_train.nodes()),np.max(G_train.nodes()),np.min(G_train.nodes()))
print(len(G_test.nodes()),np.max(G_test.nodes()),np.min(G_test.nodes()))
print(len(G_train.edges()))
print(len(G_test.edges()))

3348 3347 0
3336 3347 0
10660
10662


In [17]:
df_emb = pd.read_csv("datasets_pp/CA-HepTH_embedding.csv")

In [18]:
df_emb.head()

Unnamed: 0,id,x_0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,...,x_118,x_119,x_120,x_121,x_122,x_123,x_124,x_125,x_126,x_127
0,0.0,0.24203,-0.16187,0.011191,-0.096601,-0.105429,0.040039,-0.021148,0.440244,0.030784,...,-0.108199,-0.135126,-0.215997,-0.158687,0.02343,0.406916,0.694778,0.547517,-0.469263,0.02667
1,1.0,-5.225602,7.858293,2.477211,-1.159189,-11.123,-8.715339,5.556805,8.773579,-11.482063,...,-3.514436,-3.441469,-3.403715,0.653053,3.103514,-0.506251,2.727175,5.450474,-0.258366,0.892216
2,2.0,-5.16332,7.787142,2.399453,-1.354598,-12.443939,-7.809681,4.734442,6.584645,-12.353623,...,-1.800072,-3.878049,-6.624661,1.172072,2.566189,-1.187808,0.975061,5.822483,-0.027944,0.540707
3,3.0,2.159623,7.134664,0.751619,-5.498718,-4.598803,-6.120093,-4.683443,7.638022,-10.368925,...,-1.288053,-1.616205,6.114311,1.54293,12.154725,-1.134041,-2.324757,1.924501,5.173053,-1.418079
4,4.0,-5.186575,9.238568,4.257716,-1.772381,-13.102413,-8.396495,3.760475,7.695497,-13.527689,...,-2.073896,-3.304463,-5.754214,0.512231,4.384974,-2.878014,-0.582051,3.250685,3.260856,-0.563235


In [19]:
df_emb.shape

(3349, 129)

In [20]:
import json
with open("datasets_pp/CA-HepTH_personas.json") as f:
    data = json.loads(f.read())

In [23]:
print(data['0'],data['1'])
# print(data)

1 2690


In [24]:
map_og = {}
temp_og_map=[]
# temp_map_og=[]
for k,v in data.items():
    map_og[int(k)] = int(v)
    # og_map [int(v)] = int(k)
    temp_og_map.append(int(k))
    # temp_map_og.append(int(v))

In [25]:
print(len(temp_og_map))
print(len(np.unique(temp_og_map)))

3349
3349


In [26]:
map_og

{0: 1,
 1: 2690,
 2: 2696,
 3: 2,
 4: 1391,
 5: 1927,
 6: 2360,
 7: 2590,
 8: 2695,
 9: 3,
 10: 1686,
 11: 6,
 12: 2205,
 13: 7,
 14: 627,
 15: 955,
 16: 1283,
 17: 1285,
 18: 1286,
 19: 1288,
 20: 1290,
 21: 1291,
 22: 1292,
 23: 1294,
 24: 1295,
 25: 1296,
 26: 1297,
 27: 1298,
 28: 1948,
 29: 3187,
 30: 3339,
 31: 8,
 32: 3225,
 33: 9,
 34: 13,
 35: 341,
 36: 404,
 37: 799,
 38: 1255,
 39: 2290,
 40: 2805,
 41: 10,
 42: 3327,
 43: 12,
 44: 729,
 45: 2340,
 46: 11,
 47: 207,
 48: 1062,
 49: 2293,
 50: 14,
 51: 376,
 52: 455,
 53: 456,
 54: 19,
 55: 60,
 56: 67,
 57: 69,
 58: 76,
 59: 22,
 60: 21,
 61: 24,
 62: 18,
 63: 87,
 64: 2452,
 65: 25,
 66: 146,
 67: 430,
 68: 1174,
 69: 1790,
 70: 2952,
 71: 3127,
 72: 26,
 73: 1849,
 74: 1857,
 75: 27,
 76: 28,
 77: 29,
 78: 1125,
 79: 1337,
 80: 2195,
 81: 30,
 82: 68,
 83: 213,
 84: 1158,
 85: 1369,
 86: 2326,
 87: 2720,
 88: 2721,
 89: 3015,
 90: 3132,
 91: 33,
 92: 34,
 93: 521,
 94: 2154,
 95: 3105,
 96: 3226,
 97: 38,
 98: 769,
 99: 32

In [27]:
# it containts all original nodes
org_nodes = set(map_og.values())

# use a list comprehension, iterating through keys and checking the values match each n
mappedNodes_fromOrg = {}
for n in org_nodes:
    mappedNodes_fromOrg[n] = [k for k in map_og.keys() if map_og[k] == n]

In [28]:
mappedNodes_fromOrg[0]

[1647]

In [29]:
print(np.max(temp_og_map))

3348


In [36]:
def aggregate(u,v):
    try:
        res = np.dot(u,v)/(np.linalg.norm(u) * np.linalg.norm(v))
    except:
        res = 0.0
    return res
def edge_embed(u,v):
    # first converting it into original nodes
    n1 = u
    n2 = v
    # now we have all mapped nodes which refered to orignial nodes
    nodes1 = set(mappedNodes_fromOrg[n1])
    nodes2 = set(mappedNodes_fromOrg[n2])
    all_nodes = list(nodes1.union(nodes2))
    all_prod = []
    nodes_comb = []
    for u1,v1 in itertools.combinations(all_nodes, 2):
        # df_emb containts the original nodes
        e1 = df_emb.iloc[map_og[u1],1:]
        e2 = df_emb.iloc[map_og[v1],1:]
        all_prod.append(aggregate(e1,e2))
        nodes_comb.append((u1,v1))
    idx = np.argmax(all_prod)
    u2,v2 = nodes_comb[idx]
    return np.maximum(u2,v2)
    
def splitter_edgeEmbedding(G):
    X=[]
    y = []
    for u,v in G.edges():
        lbl = G[u][v]["weight"]
        y.append(lbl)
        
        X.append(edge_embed(u,v))
    X = np.asarray(X)
    return X,y

In [37]:
X_train,y_train = splitter_edgeEmbedding(G_train)
X_test,y_test = splitter_edgeEmbedding(G_test)

In [38]:
try:
    X_train.shape[1]
except:
    X_train = np.expand_dims(X_train,axis=1)
    X_test = np.expand_dims(X_test,axis=1)
print(X_train.shape,X_test.shape)

(10660, 1) (10662, 1)


In [39]:
print(np.max(X_train),np.min(X_train))

3348 1


In [40]:
clf = LogisticRegression(random_state=random_state).fit(X_train, y_train)

In [41]:
# train score
score_train = roc_auc_score(y_true=y_train, y_score=clf.predict(X_train),  average='micro')
# test score
score_test = roc_auc_score(y_true=y_test, y_score=clf.predict(X_test),  average='micro')

print(f"\n ROC-Score (train): {score_train}\n ROC-Score (test): {score_test}")


 ROC-Score (train): 0.5924015009380863
 ROC-Score (test): 0.5949165259801164
