In [None]:
import pathpy as pp
import numpy as np
import pandas as pd
import torch
import torch_geometric
from sklearn.model_selection import cross_validate
import json
from utils import network_to_pyg
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
def load_data():
    """Build the character network with labels, embeddings and edge weights.

    Reads the edge list (``new_data.csv``), the book labels
    (``book_labels.json``) and five embedding tables, attaches everything to
    an undirected pathpy network and converts that network with
    ``network_to_pyg``.

    Returns
    -------
    tuple
        ``(data, one)`` where ``data`` is whatever ``network_to_pyg`` returns
        for the network and ``one`` is the pathpy network itself.
    """
    # edge list; read once and reused below for the edge weights
    edges = pd.read_csv('new_data.csv')

    # load the labels
    with open('book_labels.json') as f:
        labelsGB = json.load(f)

    # encode the book as an integer class: lotr -> 0, hobbit -> 1, anything else -> 2
    labelsGB = {k: 0 if v == 'lotr' else 1 if v == 'hobbit' else 2 for k, v in labelsGB.items()}

    # start from an empty undirected network
    one = pp.Network(directed=False)

    # add one edge per row of the edge list
    for source, target in zip(edges['v'], edges['w']):
        one.add_edge(source, target)

    # add the classes as node features
    for v in one.nodes:
        v['y'] = torch.tensor([labelsGB[v.uid]])

    # (node attribute, csv file, column holding the node id); in every file the
    # last column is dropped and the remaining columns form the embedding vector
    embedding_sources = (
        ('node2vec-p1q1', 'node2vec-p1q1.csv', 'characters'),
        ('node2vec-p1q4', 'node2vec-p1q4.csv', 'characters'),
        ('node2vec-p4q1', 'node2vec-p4q1.csv', 'characters'),
        ('word2vec', 'words_and_vectors.csv', 'words'),
        ('le', 'LE_embedding.csv', 'characters'),
    )
    for attribute, filename, id_column in embedding_sources:
        table = pd.read_csv(filename)
        for v in one.nodes:
            # look the row up by node id so the file's row order does not matter
            row = table[table[id_column] == v.uid].iloc[:, :-1].values
            v[attribute] = torch.from_numpy(row).squeeze()

    # edge weight = number of times the (v, w) pair occurs in the edge list
    weights = edges[['v', 'w']].value_counts().to_dict()
    for e in one.edges:
        e['weight'] = weights[(e.v.uid, e.w.uid)]

    # convert the network to PyG data set
    data = network_to_pyg(one)

    return data, one

In [None]:
# Build the pathpy network and its PyG conversion once; the cells below reuse both.
data, one = load_data()
# Show the converted data object as the cell output.
data

# Random Walk methods

In [None]:
# from torch_geometric.nn import Node2Vec

# # initialize the model
# model = Node2Vec(data.edge_index, embedding_dim=20, walk_length=8,
#                  context_size=4, walks_per_node=3,
#                  num_negative_samples=1, p=1, q=1, sparse=True)

# # data loader to speed up training
# loader = model.loader(batch_size=32, shuffle=True, num_workers=4)  
# # initialize the optimizer
# optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

# def train():
#     # put the model in training mode
#     model.train()  
#     total_loss = 0
#     for pos_rw, neg_rw in loader:
#         # set the gradients to 0
#         optimizer.zero_grad()  
#         # compute the loss for the batch
#         loss = model.loss(pos_rw, neg_rw)  
#         loss.backward()
#         # optimize the parameters
#         optimizer.step()  
#         total_loss += loss.item()
#     return total_loss / len(loader)

# # train for n epochs
# for epoch in range(1, 201):
#     loss = train()
#     if epoch % 10 == 0:
#         print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')
        
# # get the embeddings from the trained model
# X_node2vec = model(torch.arange(n.number_of_nodes())).detach()

In [None]:
# # get the embeddings from the trained model
# embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in model(torch.arange(n.number_of_nodes()))], 
#                           columns=list(range(128)))
# embeddings['characters'] = pd.Series([i.uid for i in n.nodes])
# embeddings.to_csv('node2vec-p1q1.csv', index=False)

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score


def calculate_metrics(cv):
    """Average the per-fold test scores of a ``cross_validate`` result.

    ``cv`` must provide the keys ``test_f1_macro``, ``test_accuracy``,
    ``test_precision_macro`` and ``test_recall_macro``. The returned dict maps
    the display names F1-score, Accuracy, Precision and Recall to the mean of
    the corresponding entry.
    """
    display_to_key = (
        ('F1-score', 'test_f1_macro'),
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
    )
    return {display: np.mean(cv[key]) for display, key in display_to_key}

def calculate_metrics_std(cv):
    """Standard deviation of the per-fold test scores of a ``cross_validate`` result.

    Reads the same four ``test_*`` entries as ``calculate_metrics(cv)`` and
    returns them under the display names F1-score, Accuracy, Precision and
    Recall, each reduced with ``np.std``.
    """
    display_to_key = (
        ('F1-score', 'test_f1_macro'),
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
    )
    return {display: np.std(cv[key]) for display, key in display_to_key}

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the embeddings stored in node2vec-p1q4.csv, 10-fold CV.
df = pd.read_csv('node2vec-p1q4.csv')

# Labels are collected in the iteration order of `one.nodes`, so the feature
# rows are reindexed by character name to that same order instead of relying
# on the CSV happening to be written in node order.
node_ids = [v.uid for v in one.nodes]
features = df.set_index('characters').loc[node_ids]
labels = [v['y'].item() for v in one.nodes]

# NOTE(review): newer scikit-learn releases spell "no penalty" as penalty=None;
# confirm against the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')

scores = cross_validate(logreg, features, labels, cv=10,
                        scoring=('f1_macro', 'accuracy', 'precision_macro', 'recall_macro'),
                        return_train_score=True)

# mean over the folds is kept for the summary table; std is shown as the cell output
lr_p1q4 = calculate_metrics(scores)
print(lr_p1q4)
calculate_metrics_std(scores)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the embeddings stored in node2vec-p4q1.csv, 10-fold CV.
df = pd.read_csv('node2vec-p4q1.csv')

# Labels are collected in the iteration order of `one.nodes`, so the feature
# rows are reindexed by character name to that same order instead of relying
# on the CSV happening to be written in node order.
node_ids = [v.uid for v in one.nodes]
features = df.set_index('characters').loc[node_ids]
labels = [v['y'].item() for v in one.nodes]

# NOTE(review): newer scikit-learn releases spell "no penalty" as penalty=None;
# confirm against the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')

scores = cross_validate(logreg, features, labels, cv=10,
                        scoring=('f1_macro', 'accuracy', 'precision_macro', 'recall_macro'),
                        return_train_score=True)

# mean over the folds is kept for the summary table; std is shown as the cell output
lr_p4q1 = calculate_metrics(scores)
print(lr_p4q1)
calculate_metrics_std(scores)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the embeddings stored in node2vec-p1q1.csv, 10-fold CV.
df = pd.read_csv('node2vec-p1q1.csv')

# Labels are collected in the iteration order of `one.nodes`, so the feature
# rows are reindexed by character name to that same order instead of relying
# on the CSV happening to be written in node order.
node_ids = [v.uid for v in one.nodes]
features = df.set_index('characters').loc[node_ids]
labels = [v['y'].item() for v in one.nodes]

# NOTE(review): newer scikit-learn releases spell "no penalty" as penalty=None;
# confirm against the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')

scores = cross_validate(logreg, features, labels, cv=10,
                        scoring=('f1_macro', 'accuracy', 'precision_macro', 'recall_macro'),
                        return_train_score=True)

# mean over the folds is kept for the summary table; std is shown as the cell output
lr_p1q1 = calculate_metrics(scores)
print(lr_p1q1)
calculate_metrics_std(scores)

In [None]:
# One column per embedding, one row per metric (mean over the CV folds).
random_walk_table = pd.DataFrame({'Node2vec-p1q4': lr_p1q4,
                                 'Node2vec-p4q1': lr_p4q1,
                                 'DeepWalk': lr_p1q1})

# Persist the table; the final summary cell reads this file back.
random_walk_table.to_csv('CV_NodeClassification_node2vec_deepwalk.csv')
# Highlight the largest value in each row, i.e. the best column per metric.
random_walk_table.style.highlight_max(color = 'lightgreen', axis = 1)

# Message Passing Methods

# GCN with deepwalk

In [None]:
def calculate_metrics(y_pred, y_true):
    """Accuracy plus macro-averaged F1, precision and recall for two label arrays.

    Both arguments are forwarded positionally and in this order to the
    scikit-learn scorers, whose own signature is ``(y_true, y_pred)``. The
    first argument is therefore treated as the reference labels. The
    evaluation cells in this notebook call this function positionally with
    the reference labels first and the predictions second, which matches that.

    Note: this definition reuses the name of the earlier
    ``calculate_metrics(cv)`` helper and replaces it once this cell has run.

    Returns a dict with the keys F1-score, Accuracy, Precision and Recall.
    """
    macro = dict(average="macro")
    f1 = f1_score(y_pred, y_true, **macro)
    accuracy = accuracy_score(y_pred, y_true)
    precision = precision_score(y_pred, y_true, **macro)
    recall = recall_score(y_pred, y_true, **macro)
    return {'F1-score': f1, 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall}

In [None]:
from sklearn.model_selection import StratifiedKFold
def k_fold(dataset, folds, y ):
    """Split the indices ``0 .. len(dataset) - 1`` into stratified folds.

    Parameters
    ----------
    dataset : sized object; only ``len(dataset)`` is used.
    folds : number of folds handed to ``StratifiedKFold``.
    y : labels used for stratification.

    Returns
    -------
    (train_indices, test_indices) : two lists with one ``torch.long`` index
        tensor per fold. The training indices of a fold are all indices that
        are not in that fold's test indices.
    """
    n_samples = len(dataset)
    splitter = StratifiedKFold(folds, shuffle=False)

    # only the held-out half of each split is kept; the training half is rebuilt below
    test_indices = [
        torch.from_numpy(held_out).to(torch.long)
        for _, held_out in splitter.split(torch.zeros(n_samples), y)
    ]

    train_indices = []
    for held_out in test_indices:
        keep = torch.ones(n_samples, dtype=torch.bool)
        keep[held_out] = 0
        train_indices.append(keep.nonzero(as_tuple=False).view(-1))

    return train_indices, test_indices

In [None]:
class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    Parameters
    ----------
    num_features : width of the input node features.
    num_classes : number of output scores per node.
    hidden_dim : width of the hidden layer.
    """

    def __init__(self, num_features, num_classes, hidden_dim=20):
        super().__init__()

        self.gcn1 = torch_geometric.nn.GCNConv(num_features, hidden_dim)
        self.gcn2 = torch_geometric.nn.GCNConv(hidden_dim, num_classes)

    def forward(self, x, edge_index, edge_weights):
        """Return one row of unnormalised class scores per node.

        ``edge_weights`` is passed to both convolutions as their third
        positional argument.
        """
        x = self.gcn1(x, edge_index, edge_weights)
        x = torch.nn.functional.selu(x)
        x = self.gcn2(x, edge_index, edge_weights)
        x = torch.nn.functional.selu(x)

        # No softmax here: torch.nn.CrossEntropyLoss expects unnormalised
        # scores and applies log-softmax itself, so normalising first would
        # squash the gradients. The arg-max over the class dimension is the
        # same with or without softmax.
        return x

In [None]:
# 10-fold stratified cross-validation of the GCN on the `data.node2vec11` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.node2vec11
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
losses = []  # training loss of every epoch, concatenated over all folds
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GCN(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(5000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index, data.edge_weights)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index, data.edge_weights).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gcn_p1q1 = {metric: np.mean(values) for metric, values in scores.items()}
gcn_p1q1

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

# GAT with deepwalk

In [None]:
class GAT(torch.nn.Module):
    """Two-layer graph attention network for node classification.

    The first layer uses two attention heads of ``int(hidden_dim / 2)``
    channels each; the second layer uses a single head with ``num_classes``
    outputs.

    Parameters
    ----------
    num_features : width of the input node features.
    num_classes : number of output scores per node.
    hidden_dim : target width of the hidden representation.
    """

    def __init__(self, num_features, num_classes, hidden_dim=20):
        super().__init__()

        heads = 2
        per_head = int(hidden_dim / 2)
        self.gcn1 = torch_geometric.nn.GATConv(num_features, per_head, heads=heads)
        # The head outputs of the first layer are concatenated, so its real
        # output width is per_head * heads. Using that (rather than hidden_dim)
        # keeps the two layers compatible when hidden_dim is odd.
        self.gcn2 = torch_geometric.nn.GATConv(per_head * heads, num_classes, heads=1)

    def forward(self, x, edge_index):
        """Return one row of unnormalised class scores per node."""
        x = self.gcn1(x, edge_index)
        x = torch.nn.functional.selu(x)
        x = self.gcn2(x, edge_index)
        x = torch.nn.functional.selu(x)

        # No softmax here: torch.nn.CrossEntropyLoss expects unnormalised
        # scores and applies log-softmax itself, so normalising first would
        # squash the gradients. The arg-max over the class dimension is the
        # same with or without softmax.
        return x

In [None]:
# 10-fold stratified cross-validation of the GAT on the `data.node2vec11` features.
scores = {'F1-score':[], 'Accuracy':[], 'Precision':[], 'Recall':[]}
k = 10
X = data.node2vec11
y = data.y.squeeze().long()
indices = k_fold(X,k,y)
losses = []  # training loss of every epoch, concatenated over all folds
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GAT(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(5000):
        # Cast to float32 like every other evaluation cell does: the layer
        # parameters are float32, and .float() is a no-op if X already is.
        output = model(X.float(), data.edge_index)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold
    model.eval()
    _ ,pred = model(X.float(), data.edge_index).max(dim=1)
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    scores['F1-score'].append(cm['F1-score'])
    scores['Accuracy'].append(cm['Accuracy'])
    scores['Precision'].append(cm['Precision'])
    scores['Recall'].append(cm['Recall'])

# mean of every metric over the folds
gat_p1q1 = {k:np.mean(v) for k,v in scores.items()}
gat_p1q1

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

# GCN with one-hot-encoding

In [None]:
# 10-fold stratified cross-validation of the GCN on the `data.ohe` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.ohe
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
losses = []  # training loss of every epoch, concatenated over all folds
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GCN(num_features=X.shape[1], num_classes=3, hidden_dim=8)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(5000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index, data.edge_weights)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index, data.edge_weights).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gcn_onehot = {metric: np.mean(values) for metric, values in scores.items()}
gcn_onehot

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

# GAT with one-hot-encoding

In [None]:
# 10-fold stratified cross-validation of the GAT on the `data.ohe` features.
scores = {'F1-score':[], 'Accuracy':[], 'Precision':[], 'Recall':[]}
k = 10
X = data.ohe
y = data.y.squeeze().long()
indices = k_fold(X,k,y)
losses = []  # training loss of every epoch, concatenated over all folds
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GAT(num_features=X.shape[1], num_classes=3, hidden_dim=64)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(5000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold (the per-fold debug printing of
    # predictions and labels was dropped; `pred` and `y` stay available for
    # inspection after the cell has run)
    model.eval()
    _ ,pred = model(X.float(), data.edge_index).max(dim=1)
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    scores['F1-score'].append(cm['F1-score'])
    scores['Accuracy'].append(cm['Accuracy'])
    scores['Precision'].append(cm['Precision'])
    scores['Recall'].append(cm['Recall'])

# mean of every metric over the folds
gat_onehot = {k:np.mean(v) for k,v in scores.items()}
gat_onehot

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

In [None]:
# One column per model/feature combination, one row per metric (mean over the CV folds).
gnn_table = pd.DataFrame({'GCN_DeepWalk':gcn_p1q1,
                         'GCN_OneHotEncoding':gcn_onehot,
                         'GAT_DeepWalk':gat_p1q1,
                         'GAT_OneHotEncoding':gat_onehot})

# Persist the table; the final summary cell reads this file back.
gnn_table.to_csv('CV_NodeClassification_gcn_gat.csv')
# Highlight the largest value in each row, i.e. the best column per metric.
gnn_table.style.highlight_max(color = 'lightgreen', axis = 1)

# Message Passing methods with word embeddings

# GCN with word2vec

In [None]:
# 10-fold stratified cross-validation of the GCN on the `data.word2vec` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.word2vec
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GCN(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    losses = []  # training loss per epoch; reset for every fold
    for epoch in range(5000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index, data.edge_weights)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index, data.edge_weights).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gcn_word = {metric: np.mean(values) for metric, values in scores.items()}
gcn_word

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

# GAT with word2vec

In [None]:
# 10-fold stratified cross-validation of the GAT on the `data.word2vec` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.word2vec
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GAT(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(2000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gat_word = {metric: np.mean(values) for metric, values in scores.items()}
gat_word

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

In [None]:
# One column per model, one row per metric (mean over the CV folds). Rebinds `gnn_table`.
gnn_table = pd.DataFrame({'GCN_Word2Vec':gcn_word,
                         'GAT_Word2Vec':gat_word})

# Persist the table; the final summary cell reads this file back.
gnn_table.to_csv('CV_NodeClassification_gcn_gat_word2vec.csv')
# Highlight the largest value in each row, i.e. the best column per metric.
gnn_table.style.highlight_max(color = 'lightgreen', axis = 1)

# GCN with LE

In [None]:
# 10-fold stratified cross-validation of the GCN on the `data.le` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.le
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GCN(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    losses = []  # training loss per epoch; reset for every fold
    for epoch in range(5000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index, data.edge_weights)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        losses.append(loss.item())

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index, data.edge_weights).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gcn_le = {metric: np.mean(values) for metric, values in scores.items()}
gcn_le

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

# GAT with LE

In [None]:
# 10-fold stratified cross-validation of the GAT on the `data.le` features.
scores = {metric: [] for metric in ('F1-score', 'Accuracy', 'Precision', 'Recall')}
k = 10
X = data.le
y = data.y.squeeze().long()
indices = k_fold(X, k, y)
for train_indices, test_indices in zip(*indices):
    # fresh model and optimizer per fold so nothing learned carries over
    model = GAT(num_features=X.shape[1], num_classes=3, hidden_dim=20)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    loss_function = torch.nn.CrossEntropyLoss()
    for epoch in range(2000):
        # forward pass over the whole graph; only training nodes enter the loss
        output = model(X.float(), data.edge_index)
        loss = loss_function(output[train_indices], y[train_indices])
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # score the held-out nodes of this fold
    model.eval()
    pred = model(X.float(), data.edge_index).max(dim=1).indices
    cm = calculate_metrics(y.numpy()[test_indices], pred.numpy()[test_indices])
    for metric in scores:
        scores[metric].append(cm[metric])

# mean of every metric over the folds
gat_le = {metric: np.mean(values) for metric, values in scores.items()}
gat_le

In [None]:
# Spread of every metric over the folds, read from the `scores` dict of the last evaluation cell that ran.
{metric: np.std(values) for metric, values in scores.items()}

In [None]:
# One column per model, one row per metric (mean over the CV folds). Rebinds `gnn_table`.
gnn_table = pd.DataFrame({'GCN_LE':gcn_le,
                         'GAT_LE':gat_le})

# Persist the table; the final summary cell reads this file back.
gnn_table.to_csv('CV_NodeClassification_gcn_gat_le.csv')
# Highlight the largest value in each row, i.e. the best column per metric.
gnn_table.style.highlight_max(color = 'lightgreen', axis = 1)

In [None]:
# Reload the four result tables written by the earlier cells; the first CSV
# column holds the metric names and becomes the index.
rw = pd.read_csv('CV_NodeClassification_node2vec_deepwalk.csv', index_col=0)
gnn = pd.read_csv('CV_NodeClassification_gcn_gat.csv', index_col=0)
word = pd.read_csv('CV_NodeClassification_gcn_gat_word2vec.csv', index_col=0)
le = pd.read_csv('CV_NodeClassification_gcn_gat_le.csv', index_col=0)
# Put all methods side by side as columns and save the combined table.
rw_gnn = pd.concat([rw,gnn,word,le],axis=1)
rw_gnn.to_csv('CV_NodeClassification_GNN_RW_Word_LE.csv')
# Transpose so each row is a method, scale the scores by 100 and round to two
# decimals, then highlight the largest value in each metric column.
rw_gnn.T.apply(lambda x: round(x*100, 2)).style.highlight_max(color = 'lightgreen', axis = 0)