In [None]:
!pip install dgl
!pip install node2vec

import time

import numpy as np                         # load all neccesaary libraries 
import matplotlib.pyplot as plt
import pandas as pd
                                                
import scipy as sp
import scipy.sparse.linalg as linalg
import scipy.cluster.hierarchy as hr
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split

import os
import sklearn.metrics as metrics
import sklearn.utils as utils
import sklearn.linear_model as linear_model
import sklearn.svm as svm
import sklearn.cluster as cluster
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler


import networkx as nx
from node2vec import Node2Vec
import torch
import seaborn as sns
from dgl.nn.pytorch import GraphConv
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Work from the Colab sample-data folder; every file name below is relative to it.
os.chdir("/content/sample_data")

# Build the graph from the adjacency-list file, parsing node ids as integers.
G = nx.read_adjlist('adjedges.txt', nodetype=int)

# Number of nodes in the graph.
G.number_of_nodes()

In [None]:
### install necessary libraries from pytorch geometric
!pip install torch-geometric
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.4.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.4.0+cu102.html
!pip install torch-cluster -f https://pytorch-geometric.com/whl/torch-1.4.0+cu102.html
!pip install torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.4.0+cu102.html
import torch 
from  torch_geometric.data import Data

#### Loading training data
Now we load training data where a list of nodes along with their class label is given

In [None]:
# loading datasets
# Labelled nodes: one "<node_id> <class>" pair per line, no header row.
data_train = pd.read_csv('labels.txt', sep=" ", header=None)
data_train.columns = ["node_id", "class"]

# Raw document lines (node id followed by the paper text). Despite its name,
# `paper_info_df` is a plain list of strings, not a DataFrame.
# The context manager closes the file handle as soon as the lines are read.
with open("docs.txt") as docs_file:
    paper_info_df = docs_file.readlines()


#### Read each line from text file and make a pandas dataframe of node_id and corresponding text

In [None]:
# Split every document line into its leading node id and the remaining text.
node_ids = []
text = []
for line in paper_info_df:
    tokens = line.split()
    if not tokens:
        # A blank line has no node id; skipping it avoids an IndexError.
        continue
    node_ids.append(int(tokens[0]))
    # Re-joining on single spaces also normalises runs of whitespace.
    text.append(" ".join(tokens[1:]))

gr_bag_words_df = {'node_id': node_ids, 'text': text}

# One row per paper: node_id and its raw text.
text_df = pd.DataFrame.from_dict(gr_bag_words_df)

In [None]:
# Preview the first rows of the node_id / text table.
text_df.head()

Unnamed: 0,node_id,text
0,12828558,"Assessing Local Institutional Capacity, Data A..."
1,66779408,THE PROSPECTS FOR INTERNET TELEPHONY IN EUROPE...
2,38902949,"Economic Shocks, Safety Nets, and Fiscal Const..."
3,33450563,"Reform, Growth, and Poverty in Vietnam"
4,57470294,Households and Economic Growth in Latin Americ...


## SPACY library for text processing 

In [None]:
!pip install -U pip setuptools wheel
!pip install -U spacy
!pip install -m spacy download en_core_web_sm


In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load the spaCy pipeline that the preprocessing function uses to tokenise text.
nlp = spacy.load("en_core_web_sm")




In [None]:
## define a function to preprocess text and tokenize 
def preprocess_text(each_row):
    """Turn one document string into a list of noun lemmas.

    The text is lower-cased and run through the spaCy pipeline `nlp`. A token
    is kept only if it is not punctuation, not a digit, not in STOP_WORDS,
    is tagged as a NOUN and is longer than one character.

    Parameters
    ----------
    each_row : str
        Raw text of a single paper.

    Returns
    -------
    list of str
        Lemmas of the tokens that passed the filter, in document order.
    """
    doc = nlp(each_row.lower())
    return [
        token.lemma_
        for token in doc
        if not (
            token.is_punct
            or token.is_digit
            or token.text in STOP_WORDS
            or token.pos_ != "NOUN"
            or len(token.text) <= 1
        )
    ]

In [None]:
# Tokenise every paper; each 'text_pre' entry is the list returned by preprocess_text.
text_df['text_pre'] = text_df['text'].map(preprocess_text)
text_df.head()


Unnamed: 0,node_id,text,text_pre
0,12828558,"Assessing Local Institutional Capacity, Data A...","[capacity, datum, availability, outcome]"
1,66779408,THE PROSPECTS FOR INTERNET TELEPHONY IN EUROPE...,"[prospect, internet, telephony, modeling, poli..."
2,38902949,"Economic Shocks, Safety Nets, and Fiscal Const...","[shock, safety, net, constraint, protection]"
3,33450563,"Reform, Growth, and Poverty in Vietnam","[reform, growth, poverty]"
4,57470294,Households and Economic Growth in Latin Americ...,"[household, growth]"


### After further investigation, it was found that some papers (nodes) have no usable text left after preprocessing, i.e. their information is missing

In [None]:
# Keep only the papers whose token list is non-empty, joined back into one string each.
corpus = [" ".join(tokens) for tokens in text_df['text_pre'] if tokens]

n_with_text = len(corpus)
n_total = len(text_df)
print(n_with_text)
print(n_total)
print("The number of nodes missing information:", (n_total - n_with_text))

17449
18720
The number of nodes missing information: 1271


In [None]:
# Space-joined version of each token list; papers with no tokens become "".
text_df['new_text'] = text_df['text_pre'].map(" ".join)

# Drop the papers that ended up with no text at all.
has_text = text_df['new_text'].ne("")
new_textdf = text_df.loc[has_text]
len(new_textdf)

17449

### Inner join on `node_id` to keep only the nodes that have both text features and a class label in the training dataframe

In [None]:
# pandas merges are inner joins by default, so only node ids present in both frames survive.
final_train_df = pd.merge(new_textdf, data_train, on='node_id')
final_train_df.head()

Unnamed: 0,node_id,text,text_pre,new_text,class
0,12828558,"Assessing Local Institutional Capacity, Data A...","[capacity, datum, availability, outcome]",capacity datum availability outcome,0
1,66779408,THE PROSPECTS FOR INTERNET TELEPHONY IN EUROPE...,"[prospect, internet, telephony, modeling, poli...",prospect internet telephony modeling policy an...,0
2,38902949,"Economic Shocks, Safety Nets, and Fiscal Const...","[shock, safety, net, constraint, protection]",shock safety net constraint protection,0
3,33450563,"Reform, Growth, and Poverty in Vietnam","[reform, growth, poverty]",reform growth poverty,0
4,57470294,Households and Economic Growth in Latin Americ...,"[household, growth]",household growth,0


#### Join the training data with the adjacency list on the source node, so that each row pairs a labelled node with one of its edges

In [None]:
# Every edge of the graph as a (source, destination) tuple.
grp_edges = list(G.edges)

# Separate the two endpoints into parallel lists.
src_nodes = [edge[0] for edge in grp_edges]
dest_nodes = [edge[1] for edge in grp_edges]

# Two-column edge table: one row per edge.
column_names = ["source_node", "dest_node"]
df_adj_list = pd.DataFrame(
    {"source_node": src_nodes, "dest_node": dest_nodes},
    columns=column_names,
)

df_adj_list.head()


Unnamed: 0,source_node,dest_node
0,38902949,38998399
1,38998399,23801630
2,38998399,63525655
3,38998399,13157756
4,38998399,14987799


In [None]:
# Attach each labelled node to the edges in which it is the source node;
# a node with several outgoing edges therefore appears on several rows.
f_train_df = pd.merge(
    final_train_df,
    df_adj_list,
    left_on='node_id',
    right_on='source_node',
)
f_train_df.head()

Unnamed: 0,node_id,text,text_pre,new_text,class,source_node,dest_node
0,38902949,"Economic Shocks, Safety Nets, and Fiscal Const...","[shock, safety, net, constraint, protection]",shock safety net constraint protection,0,38902949,38998399
1,33450563,"Reform, Growth, and Poverty in Vietnam","[reform, growth, poverty]",reform growth poverty,0,33450563,26547200
2,57470294,Households and Economic Growth in Latin Americ...,"[household, growth]",household growth,0,57470294,20968604
3,54791317,World Economic Forum EDITORS,[editor],editor,0,54791317,9564967
4,54791317,World Economic Forum EDITORS,[editor],editor,0,54791317,14589786


### After the dataframe is filtered down to the nodes (papers) that have valid information, we extract node features with TF-IDF, which weights the key words of each node (paper) by how important they are relative to the rest of the corpus

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer   # get node features from tfidf vectorizer

# Build exactly ONE feature row / label per labelled node. The edge-merged
# frame repeats a node once per outgoing edge, so using it would make the
# row count of `x` the number of edges rather than the number of nodes.
nodes_df = final_train_df.drop_duplicates(subset='node_id').reset_index(drop=True)

tf_idf = TfidfVectorizer(max_features=10)
node_features = tf_idf.fit_transform(nodes_df['new_text'])   # sparse (num_nodes, <=10) tf-idf matrix
x = torch.tensor(node_features.toarray(), dtype=torch.float32)   # dense copy is required to build a tensor
y_labels = nodes_df['class']
y = torch.tensor(y_labels.to_numpy(), dtype=torch.long)

# Graph convolutions treat the entries of edge_index as ROW POSITIONS in `x`,
# so raw paper ids (values in the millions) index out of range and raise
# IndexError. Map every raw id to its row position and keep only the edges
# whose two endpoints are both labelled nodes.
node_position = pd.Series(nodes_df.index.to_numpy(), index=nodes_df['node_id'].to_numpy())
both_known = (
    df_adj_list['source_node'].isin(node_position.index)
    & df_adj_list['dest_node'].isin(node_position.index)
)
src = df_adj_list.loc[both_known, 'source_node'].map(node_position).to_numpy()
dst = df_adj_list.loc[both_known, 'dest_node'].map(node_position).to_numpy()

# The networkx graph is undirected and lists each edge once, while edge_index
# is directed: add the reversed pairs so messages flow in both directions.
edge_pairs = np.vstack([src, dst])
edge_indx = torch.tensor(np.hstack([edge_pairs, edge_pairs[::-1]]), dtype=torch.long)


In [None]:
#len(edge_index_train)  # list of two lists 
# Show the feature tensor, then the shape of each tensor that goes into the graph object.
print(x)
for tensor in (x, y, edge_indx):
    print(tensor.shape)
  

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
torch.Size([28891, 10])
torch.Size([28891])
torch.Size([2, 28891])


In [None]:
# Node count = number of rows of the feature matrix. Reading it from `x`
# keeps this cell runnable top-to-bottom: the `data` object is only built
# in the following cell.
x.shape[0]

28891

In [None]:
data = Data(x=x, edge_index=edge_indx, y=y)

#--------------------------------------------
## Train / test masks used to select nodes for the loss and for evaluation.
# Boolean masks are used because indexing with uint8 tensors is deprecated.
# The split is a seeded random 80/20 shuffle rather than a contiguous slice,
# so it does not depend on the row order of the input files, and the test mask
# is the exact complement of the train mask so every node is in one split.
num_train = int(0.8 * data.num_nodes)
split_rng = torch.Generator().manual_seed(42)
shuffled_nodes = torch.randperm(data.num_nodes, generator=split_rng)

data.train_mask = torch.zeros(data.num_nodes, dtype=torch.bool)
data.train_mask[shuffled_nodes[:num_train]] = True   # train on 80% of the nodes
data.test_mask = ~data.train_mask                    # test on the remaining 20%


In [None]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class gcn(torch.nn.Module):
    """Two-layer graph convolutional network followed by a linear classifier.

    Parameters
    ----------
    in_channels : int, default 10
        Number of input features per node. This is the width of `data.x`
        (the tf-idf vectoriser is built with max_features=10), NOT the number
        of nodes in the graph.
    hidden_channels : int, default 16
        Width of both graph-convolution layers.
    num_classes : int, default 5
        Number of output classes.
    """

    def __init__(self, in_channels=10, hidden_channels=16, num_classes=5):
        super(gcn, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.l1 = torch.nn.Linear(hidden_channels, num_classes)

    def forward(self, data):
        """Return one row of unnormalised class scores (logits) per node.

        `data` must expose `x` (node features) and `edge_index`. Raw logits
        are returned on purpose: CrossEntropyLoss applies log-softmax itself,
        so applying softmax here as well would flatten the gradients. argmax
        over the logits gives the same predicted class as argmax over
        probabilities.
        """
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        # The classifier layer is registered as `l1`.
        return self.l1(x)

In [None]:
# Instantiate the network, its loss and its optimiser.
model = gcn()
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.01,
    weight_decay=5e-4,
)
# Define a function for training -----------------------------------------
def train():
    """Run one optimisation step over the whole graph and return the loss.

    The model is evaluated on every node, but only the nodes selected by
    `data.train_mask` contribute to the loss. Returns the loss tensor of
    this step.
    """
    model.train()
    optimizer.zero_grad()

    scores = model(data)
    train_nodes = data.train_mask
    step_loss = loss_func(scores[train_nodes], data.y[train_nodes])

    step_loss.backward()
    optimizer.step()
    return step_loss

# eval and testing ----------------------------------------------------

def test():
    """Evaluate the model on the test nodes.

    Returns
    -------
    float
        Fraction of nodes selected by `data.test_mask` whose predicted class
        (highest score) equals the true label.
    """
    model.eval()
    # No gradients are needed for evaluation.
    with torch.no_grad():
        # forward() takes the whole data object, not (x, edge_index).
        out = model(data)
    # choose highest score; the corresponding class is the prediction
    pred = out.argmax(dim=1)
    # .bool() makes the mask usable for indexing whether it was stored as uint8 or bool
    mask = data.test_mask.bool()
    test_correct = pred[mask] == data.y[mask]
    # accuracy = correct predictions / number of test nodes
    test_acr = int(test_correct.sum()) / int(mask.sum())
    return test_acr


all_loss = []
no_of_epochs = 50
for each in range(no_of_epochs):
    loss = train()
    # Store a plain Python float: a loss tensor still attached to autograd
    # cannot be converted to numpy for plotting and keeps its graph alive.
    all_loss.append(float(loss))
    print(f'EPOCH:{each:02d},LOSS:{loss:.4f}' )

IndexError: ignored