In [None]:
import pathpy as pp
import numpy as np
import pandas as pd
import torch
import torch_geometric
import json
from utils import network_to_pyg
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score

In [None]:
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, recall_score, precision_score


def calculate_metrics(cv, y_pred=None):
    """Summarise classification quality as a dict of four scores.

    Two call forms are supported:

    * ``calculate_metrics(cv)`` -- ``cv`` is a mapping holding the arrays
      ``'test_f1_macro'``, ``'test_accuracy'``, ``'test_precision_macro'`` and
      ``'test_recall_macro'``; each array is averaged.
    * ``calculate_metrics(y_true, y_pred)`` -- true and predicted labels; the
      scores are computed directly, macro-averaged to match the keys above.
      This is the form every evaluation cell in this notebook uses; with the
      former one-parameter signature those calls raised ``TypeError``.

    Returns:
        dict with the keys 'F1-score', 'Accuracy', 'Precision' and 'Recall'.
    """
    if y_pred is None:
        # Cross-validation form: average the per-fold scores.
        return {'F1-score': np.mean(cv['test_f1_macro']), 'Accuracy': np.mean(cv['test_accuracy']),
                'Precision': np.mean(cv['test_precision_macro']), 'Recall': np.mean(cv['test_recall_macro'])}

    # Prediction form: the first positional argument carries the true labels.
    y_true = cv
    return {'F1-score': f1_score(y_true, y_pred, average='macro'),
            'Accuracy': accuracy_score(y_true, y_pred),
            'Precision': precision_score(y_true, y_pred, average='macro'),
            'Recall': recall_score(y_true, y_pred, average='macro')}

In [None]:
def _attach_embedding(network, csv_path, attribute, key_column='characters'):
    """Store one embedding vector per node under ``node[attribute]``.

    The CSV row whose ``key_column`` equals the node uid is selected and every
    column except the last one is used as the vector.
    NOTE(review): this assumes ``key_column`` is the last column of the file --
    confirm for each CSV. A node without a matching row silently receives an
    empty tensor.
    """
    table = pd.read_csv(csv_path)
    for node in network.nodes:
        row = table[table[key_column] == node.uid].iloc[:, :-1].values
        node[attribute] = torch.from_numpy(row).squeeze()


def load_data():
    """Build the character network and convert it to a PyG data set.

    Reads the edge list (``new_data.csv``), the book labels
    (``book_labels.json``) and five embedding CSVs from the working directory,
    attaches labels and embeddings as node attributes and the row multiplicity
    of each (v, w) pair as the edge ``weight``.

    Returns:
        (data, one): the result of ``network_to_pyg(one)`` and the undirected
        pathpy network ``one`` it was built from.
    """
    # load the edge list (one row per observed v-w pair)
    edges = pd.read_csv('new_data.csv')

    # load the labels
    with open('book_labels.json') as f:
        labelsGB = json.load(f)

    # change to scalar: 'lotr' -> 0, 'hobbit' -> 1, anything else -> 2
    labelsGB = {k: 0 if v == 'lotr' else 1 if v == 'hobbit' else 2 for k, v in labelsGB.items()}

    # start from an empty undirected network
    one = pp.Network(directed=False)

    # add the edges (nodes are only ever introduced through their edges)
    for v_uid, w_uid in zip(edges['v'], edges['w']):
        one.add_edge(v_uid, w_uid)

    # add the classes as node features
    for v in one.nodes:
        v['y'] = torch.tensor([labelsGB[v.uid]])

    # add the node2vec, word2vec and Laplacian embeddings as node features;
    # the word2vec file is keyed by 'words', all others by 'characters'
    for csv_path, attribute, key_column in (
        ('node2vec-p1q1.csv', 'node2vec-p1q1', 'characters'),
        ('node2vec-p1q4.csv', 'node2vec-p1q4', 'characters'),
        ('node2vec-p4q1.csv', 'node2vec-p4q1', 'characters'),
        ('words_and_vectors.csv', 'word2vec', 'words'),
        ('LE_embedding.csv', 'le', 'characters'),
    ):
        _attach_embedding(one, csv_path, attribute, key_column)

    # adding the weights: how often each (v, w) row occurs in the edge list
    # (reuses the frame read above instead of parsing the CSV a second time)
    weights = edges.loc[:, ['v', 'w']].value_counts().to_dict()
    for e in one.edges:
        # NOTE(review): looked up in (e.v, e.w) orientation only -- rows stored
        # as (w, v) are not counted; confirm the edge list is consistently ordered.
        e['weight'] = weights[(e.v.uid, e.w.uid)]

    # convert the network to PyG data set
    data = network_to_pyg(one)

    return data, one

In [None]:
# Build the pathpy network (`one`) and its converted counterpart (`data`);
# later cells read node order, uids and labels from `one` and features/edges from `data`.
data, one = load_data()
# Bare expression so the notebook renders the repr of the converted data set.
data

# Random Walk methods

In [None]:
from torch_geometric.nn import Node2Vec

# Node2Vec model over the graph's edge index (p=1, q=1)
model = Node2Vec(
    data.edge_index,
    embedding_dim=20,
    walk_length=8,
    context_size=4,
    walks_per_node=3,
    num_negative_samples=1,
    p=1,
    q=1,
    sparse=True,
)

# batched sampler of positive / negative random walks used for training
loader = model.loader(batch_size=32, shuffle=True, num_workers=4)
# initialize the optimizer (SparseAdam pairs with the sparse=True setting above)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.01)

# mean loss of every epoch, filled in by the training loop
losses = []
def train():
    """Run one pass over ``loader`` and return the mean loss per batch.

    Uses the notebook-level ``model``, ``loader`` and ``optimizer``.
    """
    # put the model in training mode
    model.train()

    def _step(pos_rw, neg_rw):
        # reset gradients, compute this batch's loss, backpropagate and update
        optimizer.zero_grad()
        batch_loss = model.loss(pos_rw, neg_rw)
        batch_loss.backward()
        optimizer.step()
        return batch_loss.item()

    return sum(_step(pos_rw, neg_rw) for pos_rw, neg_rw in loader) / len(loader)

# train for 200 epochs, reporting the loss every tenth epoch
for epoch in range(1, 201):
    loss = train()
    losses.append(loss)
    if not epoch % 10:
        print(f'Epoch: {epoch:02d}, Loss: {loss:.4f}')

# embeddings of all node ids from the trained model, detached from autograd
X_node2vec = model(torch.arange(data.num_nodes)).detach()

In [None]:
# get the embeddings from the trained model
# One row per node id 0..num_nodes-1; the 20 columns correspond to the
# embedding_dim=20 the Node2Vec model was created with.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in model(torch.arange(data.num_nodes))], 
                          columns=list(range(20)))
# NOTE(review): assumes the i-th node yielded by one.nodes is node id i of the
# converted data set -- confirm against network_to_pyg.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
# NOTE(review): load_data() reads 'node2vec-p1q1.csv', so on a fresh checkout this
# file has to exist before load_data() can run; the p1q4 / p4q1 files read elsewhere
# are not written by any cell shown here.
embeddings.to_csv('node2vec-p1q1.csv', index=False)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# node2vec (p=1, q=4) embeddings: every column but the last is a feature
df = pd.read_csv('node2vec-p1q4.csv')

# 80/20 split of features against the node labels taken in one.nodes order
# NOTE(review): assumes CSV rows are in the same order as one.nodes -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    [v['y'].item() for v in one.nodes],
    test_size=0.2,
    random_state=42,
)

# unregularised logistic regression
# NOTE(review): newer scikit-learn releases spell this penalty=None -- check the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')
lr_pred = logreg.fit(X_train, y_train).predict(X_test)

lr_p1q4 = calculate_metrics(y_test, lr_pred)
print(confusion_matrix(y_test, lr_pred))
lr_p1q4

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# node2vec (p=4, q=1) embeddings: every column but the last is a feature
df = pd.read_csv('node2vec-p4q1.csv')

# 80/20 split of features against the node labels taken in one.nodes order
# NOTE(review): assumes CSV rows are in the same order as one.nodes -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    [v['y'].item() for v in one.nodes],
    test_size=0.2,
    random_state=42,
)

# unregularised logistic regression
# NOTE(review): newer scikit-learn releases spell this penalty=None -- check the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')
lr_pred = logreg.fit(X_train, y_train).predict(X_test)

lr_p4q1 = calculate_metrics(y_test, lr_pred)
print(confusion_matrix(y_test, lr_pred))
lr_p4q1

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# node2vec (p=1, q=1) embeddings: every column but the last is a feature
df = pd.read_csv('node2vec-p1q1.csv')

# 80/20 split of features against the node labels taken in one.nodes order
# NOTE(review): assumes CSV rows are in the same order as one.nodes -- confirm.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :-1],
    [v['y'].item() for v in one.nodes],
    test_size=0.2,
    random_state=42,
)

# unregularised logistic regression
# NOTE(review): newer scikit-learn releases spell this penalty=None -- check the installed version.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000, tol=1e-8, penalty='none')
lr_pred = logreg.fit(X_train, y_train).predict(X_test)

lr_p1q1 = calculate_metrics(y_test, lr_pred)
print(confusion_matrix(y_test, lr_pred))
lr_p1q1

# Message Passing methods

# GCN with deepwalk

In [None]:
from sklearn.model_selection import train_test_split

# Load the feature table first: its row count defines how many nodes are split.
# (The row count used to be taken from whatever `df` an earlier cell left behind.)
df = pd.read_csv('node2vec-p1q4.csv')
mask = list(range(df.shape[0]))

# 80/20 split of the node *indices*; despite their names, y_train / y_test hold
# node indices here, not labels. The split depends only on the number of samples
# and random_state, so splitting `mask` alone selects the same indices.
y_train, y_test = train_test_split(mask, test_size=0.2, random_state=42)

# Boolean node masks: True marks training nodes, the complement is the test set.
train_mask = torch.zeros(df.shape[0], dtype=torch.bool)
train_mask[y_train] = True
test_mask = ~train_mask

# NOTE(review): X is built from the p1q4 file, while the training cells of this
# section read their features from `data` -- confirm which input is intended.
X = torch.from_numpy(df.iloc[:, :-1].values)
# labels in one.nodes order
y = torch.tensor([v['y'].item() for v in one.nodes])

In [None]:
class GCN(torch.nn.Module):
    """Two GCNConv layers followed by a linear classification head.

    Layer widths are fixed: num_features -> 10 -> 20 -> num_classes.
    NOTE(review): ``hidden_dim`` is accepted but never used; other cells rely on
    the 20-wide output of ``gcn2`` when exporting embeddings.
    """

    def __init__(self, num_features, num_classes, hidden_dim=20):
        super().__init__()

        conv = torch_geometric.nn.GCNConv
        # Layers are created in the order gcn1, gcn2, lin.
        self.gcn1 = conv(num_features, 10)
        self.gcn2 = conv(10, 20)
        self.lin = torch.nn.Linear(20, num_classes)

    def forward(self, x, edge_index):
        """Return one row of ``num_classes`` scores per node."""
        selu = torch.nn.functional.selu

        hidden = selu(self.gcn1(x, edge_index))
        hidden = selu(self.gcn2(hidden, edge_index))
        # SELU is applied to the output of the linear head as well
        return selu(self.lin(hidden))

In [None]:
# GCN on the node2vec features stored on `data`
# NOTE(review): load_data() attaches 'node2vec-p1q1', not 'node2vec11'; presumably
# network_to_pyg exposes it under this name -- confirm.
model = GCN(num_features=data.node2vec11.shape[1], num_classes=3, hidden_dim=512)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training; the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(5000):
    output = model(data.node2vec11, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.node2vec11, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gcn_p1q1 = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gcn_p1q1

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.node2vec11, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GCN_deepwalk_embedding.csv', index=False)

# GAT with deepwalk

In [None]:
class GAT(torch.nn.Module):
    """Two GATConv layers followed by a linear classification head.

    First layer: 2 heads with 5 channels each; second layer: one head mapping
    5*2 inputs to 20 channels; the head maps 20 -> num_classes.
    NOTE(review): ``hidden_dim`` is accepted but never used. The attributes keep
    the names ``gcn1`` / ``gcn2`` because other cells call them directly.
    """

    def __init__(self, num_features, num_classes, hidden_dim=64):
        super().__init__()

        attention = torch_geometric.nn.GATConv
        # Layers are created in the order gcn1, gcn2, lin.
        self.gcn1 = attention(num_features, 5, heads=2)
        self.gcn2 = attention(5*2, 20, heads=1)
        self.lin = torch.nn.Linear(20, num_classes)

    def forward(self, x, edge_index):
        """Return one row of ``num_classes`` scores per node."""
        selu = torch.nn.functional.selu

        hidden = selu(self.gcn1(x, edge_index))
        hidden = selu(self.gcn2(hidden, edge_index))
        # SELU is applied to the output of the linear head as well
        return selu(self.lin(hidden))

In [None]:
# GAT on the node2vec features stored on `data`
# NOTE(review): load_data() attaches 'node2vec-p1q1', not 'node2vec11'; presumably
# network_to_pyg exposes it under this name -- confirm.
model = GAT(num_features=data.node2vec11.shape[1], num_classes=3, hidden_dim=64)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training; the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(5000):
    output = model(data.node2vec11, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.node2vec11, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gat_p1q1 = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gat_p1q1

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.node2vec11, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GAT_deepwalk_embedding.csv', index=False)

# GCN with one-hot-encoding

In [None]:
from sklearn.model_selection import train_test_split

# one index per node of the network
mask = list(range(one.number_of_nodes()))

# one-hot node features: the identity matrix
X = torch.eye(one.number_of_nodes())

# 80/20 split of the node *indices*; despite their names, y_train / y_test hold
# node indices here, not labels. The split depends only on the number of samples
# and random_state, so splitting `mask` alone selects the same indices.
y_train, y_test = train_test_split(mask, test_size=0.2, random_state=42)

# Size the mask from the node count instead of a `df` left over from another
# section, so this cell no longer depends on an unrelated CSV having been read.
train_mask = torch.zeros(len(mask), dtype=torch.bool)
train_mask[y_train] = True
test_mask = ~train_mask

# labels in one.nodes order
y = torch.tensor([v['y'].item() for v in one.nodes])

In [None]:
# GCN on the one-hot features stored on `data`
# NOTE(review): load_data() attaches no 'ohe' attribute; presumably
# network_to_pyg provides data.ohe -- confirm.
model = GCN(num_features=data.ohe.shape[1], num_classes=3, hidden_dim=512)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training; the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(5000):
    output = model(data.ohe, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.ohe, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gcn_onehot = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gcn_onehot

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.ohe, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GCN_onehot_embedding.csv', index=False)

# GAT with one-hot-encoding

In [None]:
# GAT on the one-hot features stored on `data`
# NOTE(review): load_data() attaches no 'ohe' attribute; presumably
# network_to_pyg provides data.ohe -- confirm.
model = GAT(num_features=data.ohe.shape[1], num_classes=3, hidden_dim=64)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training; the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(5000):
    output = model(data.ohe, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.ohe, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gat_onehot = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gat_onehot

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.ohe, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GAT_onehot_embedding.csv', index=False)

# Message Passing methods with word embeddings

# GCN with word2vec

In [None]:
from sklearn.model_selection import train_test_split

# one index per node of the network
mask = list(range(one.number_of_nodes()))

# 80/20 split of the node *indices*; despite their names, y_train / y_test hold
# node indices here, not labels. The split depends only on the number of samples
# and random_state, so splitting `mask` once selects the same indices without
# binding the throw-away name `_`.
y_train, y_test = train_test_split(mask, test_size=0.2, random_state=42)

# Boolean node masks: True marks training nodes; the test mask is the plain
# complement (no numpy round trip needed).
train_mask = torch.zeros(one.number_of_nodes(), dtype=torch.bool)
train_mask[y_train] = True
test_mask = ~train_mask

# labels in one.nodes order
y = torch.tensor([v['y'].item() for v in one.nodes])

In [None]:
# GCN on the word2vec features stored on `data`
model = GCN(num_features=data.word2vec.shape[1], num_classes=3, hidden_dim=512)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training; the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(5000):
    output = model(data.word2vec, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.word2vec, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gcn_word = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gcn_word

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.word2vec, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GCN_word_embedding.csv', index=False)

# GAT with word2vec

In [None]:
# GAT on the word2vec features stored on `data`
model = GAT(num_features=data.word2vec.shape[1], num_classes=3, hidden_dim=64)

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# full-batch training (2000 epochs here); the loss is taken on the training nodes only
zeros_loss_list = []
for epoch in range(2000):
    output = model(data.word2vec, data.edge_index)
    loss = loss_function(output[train_mask], y[train_mask])
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    zeros_loss_list.append(loss.item())

In [None]:
# Score the trained model on the held-out nodes.
model.eval()
with torch.no_grad():  # inference only: no autograd graph needed
    # argmax instead of `_, pred = ....max(dim=1)`, which rebinds IPython's `_`
    pred = model(data.word2vec, data.edge_index).argmax(dim=1)

# Apply the torch mask to the tensors before converting to numpy.
y_true_test = y[test_mask].numpy()
y_pred_test = pred[test_mask].numpy()

gat_word = calculate_metrics(y_true_test, y_pred_test)
print(confusion_matrix(y_true_test, y_pred_test))
gat_word

In [None]:
import matplotlib.pyplot as plt

# training loss per epoch, as collected in zeros_loss_list
fig, ax = plt.subplots()
ax.plot(zeros_loss_list)
plt.show()

In [None]:
model.eval()

# Run only the two graph layers (without the linear head) to obtain the
# hidden node representation produced by gcn2.
x = torch.nn.functional.selu(
    model.gcn2(
        torch.nn.functional.selu(model.gcn1(data.word2vec, data.edge_index)),
        data.edge_index,
    )
)

In [None]:
from sklearn.manifold import TSNE

# 2-D t-SNE projection of the hidden representation; random_state pins the
# otherwise stochastic layout so a re-run reproduces the same picture.
X_embedded = TSNE(n_components=2, learning_rate='auto', init='random',
                  random_state=42).fit_transform(x.detach().numpy())

# class label -> colour: 0 red, 1 blue, anything else green
palette = {0: 'red', 1: 'blue'}
colors = {one.nodes.index[v.uid]: palette.get(v['y'].item(), 'green') for v in one.nodes}

# Order the colours by node index so colour i belongs to point i, and hand
# matplotlib a plain list rather than a dict view.
# NOTE(review): assumes row i of `x` is the node with index i -- confirm against network_to_pyg.
point_colors = [colors[i] for i in sorted(colors)]

plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=point_colors, alpha=0.5);

In [None]:
# One row per node from the hidden representation `x`; the 20 columns match
# the 20 output channels of the model's second graph layer.
embeddings = pd.DataFrame([tensor.detach().numpy() for tensor in x], 
                          columns=list(range(20)))
# NOTE(review): assumes rows of `x` follow the iteration order of one.nodes -- confirm.
embeddings['characters'] = pd.Series([i.uid for i in one.nodes])
embeddings.to_csv('GAT_word_embedding.csv', index=False)