In [3]:
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid

from gensim.models import Word2Vec

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import sklearn.linear_model as LinearModels

import seaborn as sns
import matplotlib.pyplot as plt

import os
import pandas as pd
from tqdm import *
from scipy import stats

from pathlib import Path

# Machine-specific project roots; only the last assignment is effective.
# The Windows path is written as a raw string so that "\D" and "\I" are not
# parsed as (invalid) escape sequences.
NathanPath = r"d:\Documents\Info\INF554\INF554_Kaggle_Project"
NathanPath = "/users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project"

In [4]:
# The project directory is taken to be the parent of the current working directory.
project_path = str(Path(os.getcwd()).parent.absolute())
print("".join(["Current directory : ", os.getcwd(), ", Project directory : ", project_path]))

Current directory : /users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project/DeepWalk, Project directory : /users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project


In [5]:
# Work from the project root so that the relative "data/", "DeepWalk/" and
# "submissions/" paths used by the following cells resolve.
os.chdir(project_path)
# NathanPath is a hard-coded, machine-specific location: only switch to it when
# it exists, so the notebook does not crash on another machine.
if os.path.isdir(NathanPath):
    os.chdir(NathanPath)

In [6]:

# Training set: "author" ids together with their "hindex" target
df_train = pd.read_csv(
    'data/train.csv',
    dtype={'author': np.int64, 'hindex': np.float32},
)
n_train = len(df_train.index)

# Test set: "author" ids only
df_test = pd.read_csv(
    'data/test.csv',
    dtype={'author': np.int64},
)
n_test = len(df_test.index)

In [7]:
# Co-authorship graph: space-separated edge list with integer node ids
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n = G.number_of_nodes()
print("Number of nodes : " + str(n))

# Two-way mapping between a node id and its position in G.nodes iteration order
abs_nodeID_Graph = {position: node_id for position, node_id in enumerate(G.nodes)}
nodeID_abs_Graph = {node_id: position for position, node_id in enumerate(G.nodes)}

Number of nodes : 217801


In [8]:
# Train set: row index -> author id, row index -> h-index, and author id -> row index
abs_nodeID_Train = dict(df_train["author"])
abs_hindex_Train = dict(df_train["hindex"])
nodeID_abs_Train = {author: row for row, author in abs_nodeID_Train.items()}

# Test set: row index -> author id and the reverse lookup
abs_nodeID_Test = dict(df_test["author"])
nodeID_abs_Test = {author: row for row, author in abs_nodeID_Test.items()}

In [9]:
def random_walk(G, node, walk_length):
    """Simulate a uniform random walk on ``G`` starting from ``node``.

    Args:
        G: graph object exposing ``neighbors(node)``.
        node: starting node.
        walk_length: maximum number of steps to take.

    Returns:
        List of visited nodes, starting node first. It holds
        ``walk_length + 1`` entries unless a node without neighbors is
        reached, in which case the walk stops at that node.
    """
    walk = [node]
    for _ in range(walk_length):
        neighbors = list(G.neighbors(node))
        if not neighbors:
            # Dead end: np.random.choice raises ValueError on an empty list.
            break
        node = np.random.choice(neighbors)
        walk.append(node)
    return walk

In [10]:
def generate_walks(G, num_walks, walk_length):
    """Start ``num_walks`` random walks from every node of ``G``.

    Each walk is produced by ``random_walk`` with ``walk_length`` steps. The
    resulting list of walks is shuffled in place with ``np.random.shuffle``
    before being returned.
    """
    walks = [
        random_walk(G, start, walk_length)
        for start in G.nodes()
        for _ in range(num_walks)
    ]
    np.random.shuffle(walks)
    return walks

In [188]:
def deepwalk(G, num_walks, walk_length, n_dim, window=8, workers=8, epochs=5):
    """Learn node embeddings by training Word2Vec on random walks over ``G``.

    Args:
        G: graph to embed.
        num_walks: number of walks started from each node.
        walk_length: number of steps per walk.
        n_dim: size of the embedding vectors (``vector_size``).
        window: Word2Vec context window size (default 8, the previous fixed value).
        workers: number of Word2Vec worker threads (default 8, as before).
        epochs: number of training passes over the walks (default 5, as before).

    Returns:
        The trained ``Word2Vec`` model.
    """
    print("Generating walks")
    walks = generate_walks(G, num_walks, walk_length)
    print("Training word2vec")
    # sg=1 / hs=1 / min_count=0 are kept exactly as in the original configuration.
    model = Word2Vec(vector_size=n_dim, window=window, min_count=0, sg=1, workers=workers, hs=1)
    model.build_vocab(walks)
    model.train(walks, total_examples=model.corpus_count, epochs=epochs)
    return model

# DeepWalk hyper-parameters: embedding size, walks started per node, steps per walk
n_dim, n_walks, walk_length = 128, 50, 10
# Training and saving are disabled in this cell:
#model = deepwalk(G, n_walks, walk_length, n_dim)
#model.save("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))

Generating walks
Training word2vec


KeyboardInterrupt: 

In [215]:
# Load a previously saved Word2Vec model from a hard-coded file name.
# NOTE(review): "model_30_10_128" is not the name the commented-out line below
# would build while n_walks is 50 — confirm which model is intended.
#model=Word2Vec.load("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))
model=Word2Vec.load("DeepWalk/Models/model_30_10_128")

In [216]:
# Compare the number of learned vectors with the number of graph nodes
print(len(model.wv), G.number_of_nodes())
# Preview a few vectors only: listing every vector floods the notebook output
model.wv.vectors[:3]

217801 217801


[array([ 0.01217775,  0.36482286, -0.11519226, -0.1372537 , -0.14410423,
        -0.23477064, -0.01063164, -0.08708626, -0.16003214,  0.2111922 ,
         0.37898564, -0.06617349,  0.02182047, -0.23205146,  0.14751744,
         0.14604399,  0.26978302,  0.0786909 ,  0.0747596 , -0.21657915,
        -0.41824594, -0.15726748,  0.33606747, -0.22219485,  0.00862307,
        -0.47990733,  0.1389106 , -0.3571496 ,  0.15610634,  0.03858445,
         0.11382578, -0.01316798, -0.32820693, -0.16619101, -0.16536109,
         0.05011184,  0.0948635 , -0.38553488, -0.15778014, -0.19517452,
        -0.20050393, -0.05832153, -0.37455174,  0.10498916,  0.3085475 ,
        -0.18332407,  0.00548472,  0.15115702, -0.26101154,  0.19902681,
        -0.03988563, -0.09886429, -0.32636693,  0.11999419, -0.03426258,
        -0.24258652,  0.45441094,  0.06539854,  0.17724663,  0.16606304,
         0.02945892, -0.10808288, -0.24899364, -0.36005914,  0.2012129 ,
        -0.18443975,  0.09798025, -0.0275696 , -0.3

In [217]:
n = G.number_of_nodes()
# One row per node; the row index of a node is given by nodeID_abs_Graph
DeepWalk_embeddings = np.empty(shape=(n, n_dim))

print("Filling embeddings")
for node, row in nodeID_abs_Graph.items():
    DeepWalk_embeddings[row] = model.wv.get_vector(node)


Filling embeddings


In [35]:
def MSE(X,Y):
    """Mean squared error between two equally sized sequences.

    Args:
        X, Y: sequences of numbers (lists, NumPy arrays, ...). They are
            converted to float64 arrays, so plain Python lists are accepted.

    Returns:
        ``sum((X - Y) ** 2) / len(X)``, or ``-1`` (after printing a message)
        when the two inputs do not have the same length.
    """
    if len(X) != len(Y):
        print("Sizes not identical")
        return -1
    # list - list is not defined in Python, hence the explicit conversion.
    diff = np.asarray(X, dtype=np.float64) - np.asarray(Y, dtype=np.float64)
    return diff @ diff / len(X)

In [219]:
# Random 80/20 split of the labelled authors, then a linear regression on the
# DeepWalk embeddings, scored on the held-out 20 %.
n=len(abs_nodeID_Train)

idx=np.random.permutation(n)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
idx_train=idx[:int(0.8*n)]
idx_val=idx[int(0.8*n):]

# Train-set row index -> author id
nodes_train=[abs_nodeID_Train[i] for i in idx_train]
nodes_val=[abs_nodeID_Train[i] for i in idx_val]

# Author id -> graph row index -> embedding
X_train_x = [DeepWalk_embeddings[nodeID_abs_Graph[i]] for i in nodes_train]
X_val_x = [DeepWalk_embeddings[nodeID_abs_Graph[i]] for i in nodes_val]

hindex_train_x=[abs_hindex_Train[i] for i in idx_train]
hindex_val_x=[abs_hindex_Train[i] for i in idx_val]

lin_reg=LinearModels.LinearRegression()
lin_reg.fit(X_train_x,hindex_train_x)
_pred=lin_reg.predict(X_val_x)
score=MSE(hindex_val_x,_pred)

# The score is a mean squared error (lower is better), not an accuracy
print("Validation MSE using DeepWalk embeddings ", score)

Accuracy using DeepWalk embeddings  134.2629017381004


## Saving model in proper format

In [214]:
# Export as a (number of nodes, 1 + n_dim) array: column 0 holds the node id,
# the remaining columns hold its embedding.
M=np.zeros((G.number_of_nodes(), n_dim + 1), dtype=np.float64)
for i in range(G.number_of_nodes()):
    node_id=abs_nodeID_Graph[i]
    M[i, 0]=node_id
    # Look the vector up with the original integer id, not with the float64
    # copy stored in M (which is a different key type).
    M[i, 1:]=model.wv.get_vector(node_id)
np.save("DeepWalk/Models/output_proper.npy", M)

## Generating submission

In [180]:
# Embeddings and targets of every labelled author, plus the embeddings and ids
# of the test authors.
X_train_glob = [DeepWalk_embeddings[nodeID_abs_Graph[author]] for author in nodeID_abs_Train]
hindex_train_glob = [abs_hindex_Train[row] for row in nodeID_abs_Train.values()]
X_test = [DeepWalk_embeddings[nodeID_abs_Graph[author]] for author in nodeID_abs_Test]
nodes_test = list(nodeID_abs_Test)

In [181]:
# Fit the linear model on all labelled authors, then predict the test authors
lin_reg = LinearModels.LinearRegression().fit(X_train_glob, hindex_train_glob)
_pred = lin_reg.predict(X_test)

In [182]:
# Test author id -> predicted h-index
submission = {nodes_test[i]: _pred[i] for i in range(len(X_test))}

In [183]:
# Write one "author,hindex" row per prediction. The with-block closes the file
# on exit, so no explicit close() is needed.
with open("submissions/deepwalk_lin_submission.csv", 'w') as f:
    f.write("author,hindex\n")
    for k,h in submission.items():
        f.write(str(k)+","+str(h)+"\n")

## Comparing

In [184]:
# Read both submission files first ...
df_baseline = pd.read_csv('submissions/baseline_submission.csv', dtype={'author': np.float64})
df_mine = pd.read_csv('submissions/deepwalk_lin_submission.csv', dtype={'author': np.float64})
# ... then turn their (author, hindex) rows into author -> hindex lookups
baseline_dict = dict(np.array(df_baseline))
mine_dict = dict(np.array(df_mine))

In [185]:
# Number of authors in the baseline submission
len(baseline_dict)

43560

In [186]:
# Put both prediction lists in the baseline's author order
base = list(baseline_dict.values())
mine_reordered = [mine_dict.get(author) for author in baseline_dict]

In [187]:
# base and mine_reordered are plain lists, and list - list is undefined:
# convert them to float arrays before computing the error.
MSE(np.asarray(base, dtype=np.float64), np.asarray(mine_reordered, dtype=np.float64))

26.80904155594524

## Testing improved regressor

In [278]:
# Reload the Word2Vec model and compute several per-node graph statistics with
# networkx. A progress message is printed after each computation.
model=Word2Vec.load("DeepWalk/Models/model_30_10_128")
n=G.number_of_nodes()

print("Starting")
# k-core number of every node
core_n=nx.core_number(G)
print("Core number OK")
# Degree of every node
degrees=nx.degree(G)
print("Degree OK")
# Average degree of each node's neighbors
surr_mean_deg=nx.average_neighbor_degree(G)
print("Neighb degree OK")
# Clustering coefficient of every node
coef_clust=nx.clustering(G)
print("Clustering OK")
# Degree centrality of every node
deg_cent=nx.degree_centrality(G)
print("Degree centrality OK")

Starting
Core number OK
Degree OK
Neighb degree OK
Clustering OK


NameError: name 'degree_centrality' is not defined

In [None]:
# nx.voterank returns a list of nodes ordered from most to least influential
# (only nodes that received votes), not a per-node mapping. Convert it to
# node -> rank position so it can be indexed by node id like the other
# features; nodes absent from the ranking get the worst rank.
_ranked_nodes = nx.voterank(G)
voterank = dict.fromkeys(G.nodes, len(_ranked_nodes))
voterank.update((node_id, position) for position, node_id in enumerate(_ranked_nodes))

In [281]:
# Column layout: 0 core number, 1 degree, 2 average neighbor degree,
# 3 clustering coefficient, 4 degree centrality, 5 voterank entry, 6+ embedding
DeepWalk_embeddings_i = np.empty(shape=(n, n_dim+6))
print("Filling embeddings")
for node in tqdm(nodeID_abs_Graph.keys()):
    # Integer indexing returns a view, so writes go straight into the matrix
    features = DeepWalk_embeddings_i[nodeID_abs_Graph[node]]
    features[0] = core_n[node]
    features[1] = degrees[node]
    features[2] = surr_mean_deg[node]
    features[3] = coef_clust[node]
    features[4] = deg_cent[node]
    features[5] = voterank[node]
    features[6:] = model.wv.get_vector(node)

Filling embeddings


100%|██████████| 217801/217801 [00:01<00:00, 195094.68it/s]


In [282]:
# Store the extended feature matrix on disk; later cells reload it with np.load
np.save("DeepWalk/embeddings_improved.npy", DeepWalk_embeddings_i)

In [283]:
# Random 80/20 split of the labelled authors, then a linear regression on the
# extended features (graph statistics + embedding), scored on the held-out 20 %.
n_train=len(abs_nodeID_Train)

idx=np.random.permutation(n_train)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
idx_train=idx[:int(0.8*n_train)]
idx_val=idx[int(0.8*n_train):]

nodes_train=[abs_nodeID_Train[i] for i in idx_train]
nodes_val=[abs_nodeID_Train[i] for i in idx_val]
# Recomputed here as in the original cell; neither value is read below
core_n=nx.core_number(G)
degrees=nx.degree(G)

X_train_x = [DeepWalk_embeddings_i[nodeID_abs_Graph[i]] for i in nodes_train]
X_val_x = [DeepWalk_embeddings_i[nodeID_abs_Graph[i]] for i in nodes_val]

hindex_train_x=[abs_hindex_Train[i] for i in idx_train]
hindex_val_x=[abs_hindex_Train[i] for i in idx_val]

lin_reg=LinearModels.LinearRegression()
lin_reg.fit(X_train_x,hindex_train_x)
_pred=lin_reg.predict(X_val_x)
score=MSE(hindex_val_x,_pred)


# The score is a mean squared error (lower is better), not an accuracy
print("Validation MSE using DeepWalk embeddings++ ", score)

Accuracy using DeepWalk embeddings++  118.70438055719114


## MLP

In [28]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Perceptron with two hidden ReLU layers and a single scalar output."""

    def __init__(self, n_feat, n_hidden_1, n_hidden_2, dropout):
        """Create the three linear layers, the dropout and the activation.

        Args:
            n_feat: number of input features.
            n_hidden_1: width of the first hidden layer.
            n_hidden_2: width of the second hidden layer.
            dropout: dropout probability applied after the first hidden layer.
        """
        super().__init__()

        self.fc1 = nn.Linear(n_feat, n_hidden_1)
        self.fc2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.fc3 = nn.Linear(n_hidden_2, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one value per input row (last dimension of size 1)."""
        hidden_1 = self.dropout(self.relu(self.fc1(x)))
        hidden_2 = self.relu(self.fc2(hidden_1))
        return self.fc3(hidden_2)

In [29]:
# Random 80/20 split of the labelled authors, converted to float32 tensors
n=len(abs_nodeID_Train)
DeepWalk_embeddings_i=np.load("DeepWalk/embeddings_improved.npy")

n_feat=DeepWalk_embeddings_i.shape[1]

idx=np.random.permutation(n)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
idx_train=idx[:int(0.8*n)]
idx_val=idx[int(0.8*n):]

nodes_train=[abs_nodeID_Train[i] for i in idx_train]
nodes_val=[abs_nodeID_Train[i] for i in idx_val]

# Gather the rows with a single fancy-indexing call: building a tensor from a
# Python list of NumPy arrays is very slow.
X_train_x = torch.tensor(DeepWalk_embeddings_i[[nodeID_abs_Graph[i] for i in nodes_train]], dtype=torch.float32)
X_val_x = torch.tensor(DeepWalk_embeddings_i[[nodeID_abs_Graph[i] for i in nodes_val]], dtype=torch.float32)

hindex_train_x=torch.tensor([abs_hindex_Train[i] for i in idx_train], dtype=torch.float32)
hindex_val_x=torch.tensor([abs_hindex_Train[i] for i in idx_val], dtype=torch.float32)

In [30]:
# Display the shape of the training feature tensor
X_train_x.shape

torch.Size([139392, 133])

In [41]:
# Full-batch training of the MLP with Adam and an MSE loss; train and
# validation losses are printed after every epoch.
n_dim=X_train_x.shape[1]
model=MLP(n_dim,256,64,0.4)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss = nn.MSELoss()

for epoch in range(1000):
    # Training step (dropout active)
    model.train()
    optimizer.zero_grad()
    output = model(X_train_x)
    loss_train = loss(output.reshape(-1), hindex_train_x)
    loss_train.backward()
    optimizer.step()

    # Validation pass: dropout disabled, and no autograd graph is needed
    model.eval()
    with torch.no_grad():
        output= model(X_val_x)
        loss_val = loss(output.reshape(-1), hindex_val_x)
    print('Epoch: {:03d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(loss_train.item()),
            'loss_val: {:.4f}'.format(loss_val.item()))

Epoch: 001 loss_train: 248.7923 loss_val: 330.8441
Epoch: 002 loss_train: 337.7948 loss_val: 185.4569
Epoch: 003 loss_train: 189.1593 loss_val: 193.3616
Epoch: 004 loss_train: 193.0517 loss_val: 185.8450
Epoch: 005 loss_train: 185.8117 loss_val: 159.0692
Epoch: 006 loss_train: 161.4124 loss_val: 166.6325
Epoch: 007 loss_train: 177.1102 loss_val: 141.9621
Epoch: 008 loss_train: 148.0261 loss_val: 139.7579
Epoch: 009 loss_train: 143.4741 loss_val: 140.3967
Epoch: 010 loss_train: 143.8020 loss_val: 136.5768
Epoch: 011 loss_train: 141.1854 loss_val: 135.7345
Epoch: 012 loss_train: 143.2215 loss_val: 132.9178
Epoch: 013 loss_train: 138.3583 loss_val: 134.4293
Epoch: 014 loss_train: 138.5604 loss_val: 133.6492
Epoch: 015 loss_train: 137.8179 loss_val: 130.1240
Epoch: 016 loss_train: 135.2461 loss_val: 128.9188
Epoch: 017 loss_train: 136.9529 loss_val: 127.6148
Epoch: 018 loss_train: 133.6950 loss_val: 127.5572
Epoch: 019 loss_train: 132.3150 loss_val: 127.5951
Epoch: 020 loss_train: 131.9941

In [42]:
# Save the MLP parameters (state_dict only) to a hard-coded file.
# NOTE(review): the file name mentions "0.1dr" and "best"; confirm it matches
# the settings and the training state of the model being saved.
torch.save(model.state_dict(), "DW_MLP_Models/1000ep-2lr0.1dr_best.pt")

## MLP Submission

In [53]:
# Features and targets of every labelled author, and features of the test
# authors, as float32 tensors.
DeepWalk_embeddings_i=np.load("DeepWalk/embeddings_improved.npy")
# Rows are gathered with fancy indexing first: building a tensor from a Python
# list of NumPy arrays is very slow.
X_train_glob = torch.tensor(DeepWalk_embeddings_i[[nodeID_abs_Graph[node] for node in nodeID_abs_Train.keys()]], dtype=torch.float32)
hindex_train_glob=torch.tensor([abs_hindex_Train[nodeID_abs_Train[node]] for node in nodeID_abs_Train.keys()], dtype=torch.float32)
X_test=torch.tensor(DeepWalk_embeddings_i[[nodeID_abs_Graph[node] for node in nodeID_abs_Test.keys()]], dtype=torch.float32)

In [54]:
# Train a fresh MLP on all labelled authors (no validation split).
# The input width is read from the tensor actually used for training here,
# instead of from X_train_x, which belongs to the validation experiment.
n_dim=X_train_glob.shape[1]
model=MLP(n_dim,256,64,0.4)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss = nn.MSELoss()

for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_glob)
    loss_train = loss(output.reshape(-1), hindex_train_glob)
    loss_train.backward()
    optimizer.step()

    print('Epoch: {:03d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(loss_train.item()))

Epoch: 001 loss_train: 292.8006
Epoch: 002 loss_train: 215.1341
Epoch: 003 loss_train: 190.4556
Epoch: 004 loss_train: 181.2618
Epoch: 005 loss_train: 164.9641
Epoch: 006 loss_train: 162.6019
Epoch: 007 loss_train: 146.1072
Epoch: 008 loss_train: 145.5275
Epoch: 009 loss_train: 144.4196
Epoch: 010 loss_train: 142.7182
Epoch: 011 loss_train: 138.8270
Epoch: 012 loss_train: 138.2556
Epoch: 013 loss_train: 137.6770
Epoch: 014 loss_train: 136.2206
Epoch: 015 loss_train: 135.3170
Epoch: 016 loss_train: 133.1581
Epoch: 017 loss_train: 132.0508
Epoch: 018 loss_train: 131.1693
Epoch: 019 loss_train: 130.5230
Epoch: 020 loss_train: 129.0853
Epoch: 021 loss_train: 128.5503
Epoch: 022 loss_train: 127.7811
Epoch: 023 loss_train: 127.2714
Epoch: 024 loss_train: 126.6769
Epoch: 025 loss_train: 125.9616
Epoch: 026 loss_train: 125.0136
Epoch: 027 loss_train: 123.5989
Epoch: 028 loss_train: 122.9080
Epoch: 029 loss_train: 121.8904
Epoch: 030 loss_train: 121.3194
Epoch: 031 loss_train: 120.1321
Epoch: 0

In [None]:
# Predict in evaluation mode (dropout disabled) and without autograd, then
# flatten the (N, 1) output to a 1-D NumPy array of plain floats.
model.eval()
with torch.no_grad():
    _pred=model(X_test).reshape(-1).numpy()

In [None]:
# Test author id -> predicted h-index. float() turns a one-element tensor (or
# NumPy scalar) into a plain number, so the CSV holds numbers and not tensor
# reprs.
# NOTE(review): this writes to the same file as the linear-regression
# submission — confirm the overwrite is intended.
submission=dict([(nodes_test[i], float(_pred[i])) for i in range(len(X_test))])
# The with-block closes the file on exit, so no explicit close() is needed.
with open("submissions/deepwalk_lin_submission.csv", 'w') as f:
    f.write("author,hindex\n")
    for k,h in submission.items():
        f.write(str(k)+","+str(h)+"\n")