In [4]:
import numpy as np
import networkx as nx

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric.datasets import Planetoid

from gensim.models import Word2Vec

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
import sklearn.linear_model as LinearModels

import seaborn as sns
import matplotlib.pyplot as plt

import os
import pandas as pd
from tqdm import *
from scipy import stats

from pathlib import Path

# Machine-specific checkouts of the project (the second assignment wins).
# The Windows path is a raw string: "\D" and "\I" are invalid escape
# sequences in a normal string literal.
NathanPath = r"d:\Documents\Info\INF554\INF554_Kaggle_Project"
NathanPath = "/users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project"

# The project root is taken to be the parent of the current working directory.
project_path = str(Path(os.getcwd()).parent.absolute())
print("Current directory : " + os.getcwd() + ", Project directory : " + project_path)

os.chdir(project_path)
# Only switch to the hard-coded checkout when it exists on this machine;
# otherwise stay in project_path instead of raising FileNotFoundError.
if os.path.isdir(NathanPath):
    os.chdir(NathanPath)

Current directory : /users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project/DeepWalk, Project directory : /users/eleves-a/2019/nathan.peluso/INF554/INF554_Kaggle_Project


In [15]:

# Training data: the author and hindex columns are read with explicit dtypes
df_train = pd.read_csv(
    'data/train.csv',
    dtype={'author': np.int64, 'hindex': np.float32},
)
n_train = len(df_train)

# Test data: only the author column gets an explicit dtype
df_test = pd.read_csv('data/test.csv', dtype={'author': np.int64})
n_test = len(df_test)

In [16]:
# Graph read from a space-separated edge list with integer node IDs
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n = len(G)
print("Number of nodes : " + str(n))

# Two-way mapping between a node's position in G.nodes ("abs" index) and its node ID
abs_nodeID_Graph = {position: node_id for position, node_id in enumerate(G.nodes)}
nodeID_abs_Graph = {node_id: position for position, node_id in enumerate(G.nodes)}

Number of nodes : 217801


In [17]:
# Row index -> author ID and row index -> h-index for the training frame,
# plus the reverse lookup author ID -> row index
abs_nodeID_Train = dict(df_train["author"])
abs_hindex_Train = dict(df_train["hindex"])
nodeID_abs_Train = {author: row for row, author in abs_nodeID_Train.items()}

# Same pair of lookups for the test frame
abs_nodeID_Test = dict(df_test["author"])
nodeID_abs_Test = {author: row for row, author in abs_nodeID_Test.items()}

In [None]:
def random_walk(G, node, walk_length):
    """Simulate a uniform random walk on ``G`` starting from ``node``.

    Parameters
    ----------
    G : graph-like
        Any object exposing ``neighbors(node)`` (e.g. a networkx graph).
    node : hashable
        Starting node.
    walk_length : int
        Maximum number of steps to take.

    Returns
    -------
    list
        The visited nodes, starting node included: ``walk_length + 1``
        entries, or fewer when the walk reaches a node without neighbours.
    """
    walk = [node]
    for _ in range(walk_length):
        neighbors = list(G.neighbors(node))
        if not neighbors:
            # Dead end (e.g. isolated node): there is nothing to sample from
            break
        # Draw an index instead of calling np.random.choice on the list, so
        # the node keeps its original type and non-scalar nodes are supported
        node = neighbors[np.random.randint(len(neighbors))]
        walk.append(node)
    return walk

In [None]:
def generate_walks(G, num_walks, walk_length):
    """Start ``num_walks`` random walks from every node of ``G``.

    Returns the list of walks after shuffling it in place with
    ``np.random.shuffle``.
    """
    walks = [
        random_walk(G, start, walk_length)
        for start in G.nodes()
        for _ in range(num_walks)
    ]
    np.random.shuffle(walks)
    return walks

In [None]:
def deepwalk(G, num_walks, walk_length, n_dim, window=8, workers=8, epochs=5):
    """Learn node embeddings with DeepWalk (random walks + skip-gram).

    Parameters
    ----------
    G : graph
        Graph to embed; passed to ``generate_walks``.
    num_walks : int
        Number of walks started from each node.
    walk_length : int
        Number of steps per walk.
    n_dim : int
        Dimension of the learned vectors.
    window : int, optional
        Skip-gram context window (default 8).
    workers : int, optional
        Number of Word2Vec worker threads (default 8).
    epochs : int, optional
        Number of training passes over the walks (default 5).

    Returns
    -------
    Word2Vec
        The trained model; vectors are available through ``model.wv``.
    """
    print("Generating walks")
    walks = generate_walks(G, num_walks, walk_length)
    print("Training word2vec")
    # sg=1 selects skip-gram, hs=1 enables hierarchical softmax, and
    # min_count=0 keeps every node in the vocabulary
    model = Word2Vec(vector_size=n_dim, window=window, min_count=0, sg=1, workers=workers, hs=1)
    model.build_vocab(walks)
    model.train(walks, total_examples=model.corpus_count, epochs=epochs)
    return model

# DeepWalk hyper-parameters: embedding size, walks started per node, steps per walk
n_dim = 128
n_walks = 50
walk_length = 10
# Training and saving are disabled here; a saved model is loaded in the next cell instead.
#model = deepwalk(G, n_walks, walk_length, n_dim) 
#model.save("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))

In [None]:
#model=Word2Vec.load("DeepWalk/Models/model_"+str(n_walks)+'_'+str(walk_length)+'_'+str(n_dim))
# NOTE(review): the file name suggests 30 walks per node whereas n_walks is set to 50 — confirm which model is intended.
model=Word2Vec.load("DeepWalk/Models/model_30_10_128")

In [None]:
# Sanity check: number of learned vectors next to the number of graph nodes
print(len(model.wv), G.number_of_nodes())
# Show the matrix shape rather than dumping every vector as cell output
model.wv.vectors.shape

In [None]:
n = G.number_of_nodes()
DeepWalk_embeddings = np.empty(shape=(n, n_dim))

print("Filling embeddings")
# Row i receives the vector of the node whose position in G.nodes is i
for node_id, row in nodeID_abs_Graph.items():
    DeepWalk_embeddings[row] = model.wv.get_vector(node_id)


In [None]:
def MSE(X, Y):
    """Mean squared error between two equally sized 1-D sequences.

    Parameters
    ----------
    X, Y : array-like
        Lists or numpy arrays of numbers. Both are converted to float64
        arrays so that plain Python lists are accepted as well.

    Returns
    -------
    float or int
        ``sum((X - Y) ** 2) / len(X)``, or ``-1`` (after printing a message)
        when the two inputs do not have the same length.
    """
    X = np.asarray(X, dtype=np.float64)
    Y = np.asarray(Y, dtype=np.float64)
    if len(X) != len(Y):
        print("Sizes not identical")
        return -1
    diff = X - Y
    return diff @ diff / len(X)

In [None]:
# 80/20 train/validation split over the labelled authors
n = len(abs_nodeID_Train)

idx = np.random.permutation(n)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
split = int(0.8 * n)
idx_train = idx[:split]
idx_val = idx[split:]

nodes_train = [abs_nodeID_Train[i] for i in idx_train]
nodes_val = [abs_nodeID_Train[i] for i in idx_val]

# Features: DeepWalk vector of each author, looked up through the graph indexing
X_train_x = [DeepWalk_embeddings[nodeID_abs_Graph[i]] for i in nodes_train]
X_val_x = [DeepWalk_embeddings[nodeID_abs_Graph[i]] for i in nodes_val]

# Targets: h-index, looked up through the train-set indexing
hindex_train_x = [abs_hindex_Train[i] for i in idx_train]
hindex_val_x = [abs_hindex_Train[i] for i in idx_val]

lin_reg = LinearModels.LinearRegression()
lin_reg.fit(X_train_x, hindex_train_x)
_pred = lin_reg.predict(X_val_x)
score = MSE(hindex_val_x, _pred)

# The score is a mean squared error (lower is better), not an accuracy
print("Validation MSE using DeepWalk embeddings ", score)

## Saving model in proper format

In [None]:
# Matrix layout: column 0 holds the node ID, columns 1.. hold its embedding
n_nodes = G.number_of_nodes()
M = np.zeros((n_nodes, n_dim + 1), dtype=np.float64)
for i in range(n_nodes):
    node_id = abs_nodeID_Graph[i]
    M[i, 0] = node_id
    # Look the vector up with the original node ID, not with the float64
    # copy stored in M (a float key only matches through hash equality)
    M[i, 1:] = model.wv.get_vector(node_id)
np.save("DeepWalk/Models/output_proper.npy", M)

## Generating submission

In [None]:
# Full training set: one embedding row and one h-index per distinct train author
X_train_glob = [DeepWalk_embeddings[nodeID_abs_Graph[author]] for author in nodeID_abs_Train]
hindex_train_glob = [abs_hindex_Train[row] for row in nodeID_abs_Train.values()]
# Test set: embedding rows and the matching author IDs, in the same order
X_test = [DeepWalk_embeddings[nodeID_abs_Graph[author]] for author in nodeID_abs_Test]
nodes_test = list(nodeID_abs_Test)

In [None]:
# Fit on every labelled author, then predict for the test authors
lin_reg = LinearModels.LinearRegression().fit(X_train_glob, hindex_train_glob)
_pred = lin_reg.predict(X_test)

In [None]:
# author ID -> predicted h-index
submission = {nodes_test[i]: _pred[i] for i in range(len(X_test))}

In [None]:
# Write the predictions as a two-column CSV with an "author,hindex" header
with open("submissions/deepwalk_lin_submission.csv", 'w') as f:
    f.write("author,hindex\n")
    for k, h in submission.items():
        f.write(str(k) + "," + str(h) + "\n")
    # No explicit f.close(): the with-block closes the file on exit

## Comparing

In [None]:
# Load both submissions and turn each frame's rows into a dict
# (the author column is read as float64 in both files)
df_baseline = pd.read_csv('submissions/baseline_submission.csv', dtype={'author': np.float64})
baseline_dict = dict(df_baseline.to_numpy())

df_mine = pd.read_csv('submissions/deepwalk_lin_submission.csv', dtype={'author': np.float64})
mine_dict = dict(df_mine.to_numpy())

In [None]:
# Number of entries in the baseline submission
len(baseline_dict)

In [None]:
# Baseline predictions in dict order, and my predictions aligned on the same keys
base = list(baseline_dict.values())
mine_reordered = [mine_dict.get(author) for author in baseline_dict]

In [None]:
# Convert to arrays first: subtracting two plain Python lists raises TypeError
MSE(np.asarray(base, dtype=np.float64), np.asarray(mine_reordered, dtype=np.float64))

## Testing improved regressor

In [7]:
# Reload the embedding model and the graph
model = Word2Vec.load("DeepWalk/Models/model_30_10_128")
G = nx.read_edgelist('data/coauthorship.edgelist', delimiter=' ', nodetype=int)
n = len(G)

# Per-node structural features, each computed over the whole graph;
# a progress message is printed after every step
print("Starting")
core_n = nx.core_number(G)
print("Core number OK")
degrees = G.degree
print("Degree OK")
surr_mean_deg = nx.average_neighbor_degree(G)
print("Neighb degree OK")
coef_clust = nx.clustering(G)
print("Clustering OK")
deg_cent = nx.degree_centrality(G)
print("Degree centrality OK")

Starting
Core number OK
Degree OK
Neighb degree OK
Clustering OK
Degree centrality OK


In [13]:
# Embedding dimension, taken from the loaded model
n_dim = model.wv.vector_size

In [18]:
# Columns: 0 node ID, 1 core number, 2 degree, 3 squared degree,
# 4 average neighbour degree, 5 clustering coefficient, 6 degree centrality,
# 7.. DeepWalk vector
DeepWalk_embeddings_i = np.empty(shape=(n, n_dim+7))
print("Filling embeddings")
for node in tqdm(G.nodes):
    row = DeepWalk_embeddings_i[nodeID_abs_Graph[node]]  # view on this node's row
    deg = degrees[node]
    row[0] = node
    row[1] = core_n[node]
    row[2] = deg
    row[3] = deg * deg
    row[4] = surr_mean_deg[node]
    row[5] = coef_clust[node]
    row[6] = deg_cent[node]
    row[7:] = model.wv.get_vector(node)

Filling embeddings


100%|██████████| 217801/217801 [00:01<00:00, 155375.61it/s]


In [19]:
# Persist the matrix (node ID, structural features, DeepWalk vector per row) for later cells
np.save("DeepWalk/embeddings_improved.npy", DeepWalk_embeddings_i)

In [None]:
# 80/20 train/validation split over the labelled authors
n_train = len(abs_nodeID_Train)

idx = np.random.permutation(n_train)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
split = int(0.8 * n_train)
idx_train = idx[:split]
idx_val = idx[split:]

nodes_train = [abs_nodeID_Train[i] for i in idx_train]
nodes_val = [abs_nodeID_Train[i] for i in idx_val]
# The structural features are already stored in DeepWalk_embeddings_i,
# so nothing is recomputed on the graph here.

# Column 0 of DeepWalk_embeddings_i is the raw node ID, an arbitrary
# identifier: it is kept out of the regressors.
X_train_x = [DeepWalk_embeddings_i[nodeID_abs_Graph[i]][1:] for i in nodes_train]
X_val_x = [DeepWalk_embeddings_i[nodeID_abs_Graph[i]][1:] for i in nodes_val]

hindex_train_x = [abs_hindex_Train[i] for i in idx_train]
hindex_val_x = [abs_hindex_Train[i] for i in idx_val]

lin_reg = LinearModels.LinearRegression()
lin_reg.fit(X_train_x, hindex_train_x)
_pred = lin_reg.predict(X_val_x)
score = MSE(hindex_val_x, _pred)


# The score is a mean squared error (lower is better), not an accuracy
print("Validation MSE using DeepWalk embeddings++ ", score)

## MLP

In [None]:
import torch
import torch.nn as nn

class MLP(nn.Module):
    """Perceptron with two hidden layers and a single output unit.

    Parameters
    ----------
    n_feat : int
        Number of input features.
    n_hidden_1, n_hidden_2 : int
        Sizes of the first and second hidden layers.
    dropout : float
        Dropout probability applied after the first hidden layer.
    """

    def __init__(self, n_feat, n_hidden_1, n_hidden_2, dropout):
        super().__init__()

        # Three linear layers: n_feat -> n_hidden_1 -> n_hidden_2 -> 1
        self.fc1 = nn.Linear(n_feat, n_hidden_1)
        self.fc2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.fc3 = nn.Linear(n_hidden_2, 1)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of the last linear layer, one value per input row."""
        hidden_1 = self.dropout(self.relu(self.fc1(x)))
        hidden_2 = self.relu(self.fc2(hidden_1))
        return self.fc3(hidden_2)

In [None]:
n = len(abs_nodeID_Train)
#DeepWalk_embeddings_i=np.load("DeepWalk/embeddings_improved.npy")
DeepWalk_embeddings_i = np.load("Global/full_embedding_matrix.npy")

n_feat = DeepWalk_embeddings_i.shape[1]

idx = np.random.permutation(n)
#Careful, those indexes are related to the TRAIN set, not to the global graph indexing
idx_train = idx[:int(0.8*n)]
idx_val = idx[int(0.8*n):]

nodes_train = [abs_nodeID_Train[i] for i in idx_train]
nodes_val = [abs_nodeID_Train[i] for i in idx_val]

# Gather the rows with a single fancy-indexing call: building a tensor from
# a Python list of ndarrays is a known slow path in torch.tensor.
rows_train = [nodeID_abs_Graph[i] for i in nodes_train]
rows_val = [nodeID_abs_Graph[i] for i in nodes_val]
X_train_x = torch.tensor(DeepWalk_embeddings_i[rows_train], dtype=torch.float32)
X_val_x = torch.tensor(DeepWalk_embeddings_i[rows_val], dtype=torch.float32)

hindex_train_x = torch.tensor([abs_hindex_Train[i] for i in idx_train], dtype=torch.float32)
hindex_val_x = torch.tensor([abs_hindex_Train[i] for i in idx_val], dtype=torch.float32)

In [None]:
# Display the shape of the training feature tensor
X_train_x.shape

In [None]:
n_dim = X_train_x.shape[1]
model = MLP(n_dim, 256, 64, 0.4)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss = nn.MSELoss()

# Full-batch training: one optimizer step per epoch on the whole training split
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_x)
    # The model returns shape (N, 1); flatten to match the target vector
    loss_train = loss(output.reshape(-1), hindex_train_x)
    loss_train.backward()
    optimizer.step()

    # Validation: dropout disabled, and no autograd graph is built since
    # nothing is back-propagated from this pass
    model.eval()
    with torch.no_grad():
        output = model(X_val_x)
        loss_val = loss(output.reshape(-1), hindex_val_x)
    print('Epoch: {:03d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(loss_train.item()),
            'loss_val: {:.4f}'.format(loss_val.item()))

In [None]:
# Save only the learned parameters (state_dict) of the MLP.
# NOTE(review): the file name mentions "0.1dr" while the model is built with dropout 0.4 — confirm the naming.
torch.save(model.state_dict(), "DW_MLP_Models/1000ep-2lr0.1dr_best.pt")

## MLP Submission

In [None]:
DeepWalk_embeddings_i = np.load("DeepWalk/embeddings_improved.npy")
# Gather the rows with a single fancy-indexing call: building a tensor from
# a Python list of ndarrays is a known slow path in torch.tensor.
rows_train_glob = [nodeID_abs_Graph[node] for node in nodeID_abs_Train.keys()]
rows_test = [nodeID_abs_Graph[node] for node in nodeID_abs_Test.keys()]
X_train_glob = torch.tensor(DeepWalk_embeddings_i[rows_train_glob], dtype=torch.float32)
hindex_train_glob = torch.tensor([abs_hindex_Train[nodeID_abs_Train[node]] for node in nodeID_abs_Train.keys()], dtype=torch.float32)
X_test = torch.tensor(DeepWalk_embeddings_i[rows_test], dtype=torch.float32)

In [None]:
# Size the input layer from the tensor that is actually trained on
# (X_train_glob); X_train_x is built from a different matrix and may have a
# different number of columns.
n_dim = X_train_glob.shape[1]
model = MLP(n_dim, 256, 64, 0.4)
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss = nn.MSELoss()

# Full-batch training on every labelled author, no validation split
for epoch in range(1000):
    model.train()
    optimizer.zero_grad()
    output = model(X_train_glob)
    # The model returns shape (N, 1); flatten to match the target vector
    loss_train = loss(output.reshape(-1), hindex_train_glob)
    loss_train.backward()
    optimizer.step()

    print('Epoch: {:03d}'.format(epoch+1),
            'loss_train: {:.4f}'.format(loss_train.item()))

In [None]:
# Switch to eval mode so dropout is disabled at inference, and skip autograd
# tracking since the predictions are only written out
model.eval()
with torch.no_grad():
    _pred = model(X_test)

In [None]:
# author ID -> predicted h-index (each prediction is a one-element tensor)
submission = dict([(nodes_test[i], _pred[i]) for i in range(len(X_test))])
with open("submissions/deepwalk_MLP_submission.csv", 'w') as f:
    f.write("author,hindex\n")
    for k, h in submission.items():
        f.write(str(k) + "," + str(h.item()) + "\n")
    # No explicit f.close(): the with-block closes the file on exit

In [None]:
# Preview a few predictions instead of dumping every value as cell output
list(submission.values())[:5]