<a href="https://colab.research.google.com/github/Hrishikesh-Harsh/Text_Classification_IR/blob/main/Text_Classifier_IR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Use TF-IDF as prescribed
### Treat Abstract, Key and Title differently
### Order: $Key > Abstract ≈ Title$
### Hence, $W_k > W_a \approx W_t$
### For TF, we can use $TF = 1 + log(n_t*W_t + n_k*W_k + n_a*W_a)$
### One More Hyper-parameter to vary is Window size for (Word,Word) pairs
### Use $3$ different Window sizes for $Key, Abstract, Title$: $Win_k, Win_a, Win_t$

In [1]:
#run
!pip3 install scipy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
#run
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#run
import csv
import math
import nltk
import re
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm.notebook import tqdm
from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder

import scipy.sparse as sp
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [283]:
#run
NUM_LAYERS = 2 
HIDDEN_DIM = 440
DROP_OUT = 0.55
LR = 0.02
WEIGHT_DECAY = 0
EARLY_STOPPING = 10
NUM_EPOCHS = 200

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

train_ratio = 0.7
test_ratio = 0.3
N = 206

train_N = int(train_ratio*N)
test_N = N-train_N

wt_k = 9    # Weight to be given to Keywords in tf score
wt_t = 3    # Weight to be given to Title in tf score
wt_a = 1    # Weight to be given to Abstract in tf score
window_size = 20 # Window size for PMI calculation

In [173]:
#run
# Loading original .csv file
file = open('/content/drive/MyDrive/IR_Project/dataset/PubMed.csv',encoding='Latin1')
type(file)
csvreader = csv.reader(file)

In [6]:
#run
# Set of stopwords to be removed
stop_words = set(stopwords.words('english'))

In [7]:
#run
# Just the first line of the .csv file (Column Names)
header = []
header = next(csvreader)

In [8]:
#run
# vocab_words would be a set of all distinct words found in the dataset/.csv file minus the stopwords
vocab_words = set()
vocab_words_list = []

In [9]:
#run
rows = []
index = 0
title = []
keyword = []
abstract = []
for r in csvreader: 
  r_sub = []
  r[2] = r[2].replace('.',' ')
  r[2] = r[2].replace(',',' ')
  r[2] = r[2].replace(';',' ')
  r[2] = r[2].replace('|',' ')
  r[2] = r[2].replace('<',' ')
  r[2] = r[2].replace('>',' ')
  r[2] = r[2].replace(':',' ')
  r[2] = r[2].replace('=',' ')
  r[2] = r[2].replace('(',' ')
  r[2] = r[2].replace(')',' ')
  r[2] = r[2].replace('[',' ')
  r[2] = r[2].replace(']',' ')
  r[2] = r[2].replace('?',' ')


  r[3] = r[3].replace('.',' ')
  r[3] = r[3].replace(',',' ')
  r[3] = r[3].replace(';',' ')
  r[3] = r[3].replace('|',' ')
  r[3] = r[3].replace('<',' ')
  r[3] = r[3].replace('>',' ')
  r[3] = r[3].replace(':',' ')
  r[3] = r[3].replace('=',' ')
  r[3] = r[3].replace('(',' ')
  r[3] = r[3].replace(')',' ')
  r[3] = r[3].replace('[',' ')
  r[3] = r[3].replace(']',' ')
  r[3] = r[3].replace('?',' ')

  r[4] = r[4].replace('.',' ')
  r[4] = r[4].replace(',',' ')
  r[4] = r[4].replace(';',' ')
  r[4] = r[4].replace('|',' ')
  r[4] = r[4].replace('<',' ')
  r[4] = r[4].replace('>',' ')
  r[4] = r[4].replace(':',' ')
  r[4] = r[4].replace('=',' ')
  r[4] = r[4].replace('(',' ')
  r[4] = r[4].replace(')',' ')
  r[4] = r[4].replace('[',' ')
  r[4] = r[4].replace(']',' ')
  r[4] = r[4].replace('?',' ')

  words_title = word_tokenize(r[2])     # Tokenize the Title of that doc (row)
  words_keyword = word_tokenize(r[3])   # Tokenize the Keywords of that doc (row)
  words_abstract = word_tokenize(r[4])  # Tokenize the Abstract of that doc (row)

  title.append([])
  keyword.append([])
  abstract.append([])

  w_t = ""
  w_k = ""
  w_a = ""

  ''' 
      - Adding all non-stop-words to the vocabulary
      - Also maintaining doc-wise collection of Keywords, Title and Abstract words 
  '''

  for w in words_title:
    if w not in stop_words and w!="'s":
        w_t = w_t + w + " "
        vocab_words.add(w)
        # vocab_words_list.append(w)
        title[index].append(w)
  
  for w in words_keyword:
    if w not in stop_words and w!="'s":
        w_k = w_k + w + " "
        vocab_words.add(w)
        # vocab_words_list.append(w)
        keyword[index].append(w)

  for w in words_abstract:
    if w not in stop_words and w!="'s":
        w_a = w_a + w + " "
        vocab_words.add(w)
        # vocab_words_list.append(w)
        abstract[index].append(w)
  
  index=index+1

  '''
    - Creating 'rows' to write back to Clean File
  '''
  for i in range(0,len(r)):
    if(i==2):
      r_sub.append(w_t)
    elif(i==3):
      r_sub.append(w_k)
    elif(i==4):
      r_sub.append(w_a) 
    else:
      r_sub.append(r[i])
    
  rows.append(r_sub)


In [284]:
#run
train_docs = []
test_docs = []

train_docs = rows[:train_N]
test_docs = rows[train_N:train_N+test_N]


In [285]:
all_labels = []

In [286]:
print(train_N)
print(train_N+test_N)

144
206


In [287]:
original_labels_train=[]
for i,r in enumerate(train_docs):
  original_labels_train.append(int(r[5]))
  all_labels.append(int(r[5]))

In [288]:
original_labels_test=[]
for i,r in enumerate(test_docs):
  original_labels_test.append(int(r[5]))
  all_labels.append(int(r[5]))

In [289]:
print(len(test_docs))

62


In [290]:
print(len(all_labels))

206


In [291]:
Labels_1 = original_labels_train.count(1)+original_labels_test.count(1)
Labels_0 = original_labels_train.count(0)+original_labels_test.count(0)

In [292]:
print("1:",Labels_1,"| 0:",Labels_0)

1: 116 | 0: 90


In [293]:
unique_labels=np.unique(original_labels_train)

num_class = len(unique_labels)
lEnc = LabelEncoder()
lEnc.fit(unique_labels)

print(unique_labels)
print(lEnc.transform(unique_labels))

train_labels = lEnc.transform(original_labels_train)
test_labels = lEnc.transform(original_labels_test)

labels = train_labels.tolist()+test_labels.tolist()
labels = torch.LongTensor(labels).to(device)

[0 1]
[0 1]


In [18]:
#run
vocab_words_list=[]
for wd in vocab_words:
  vocab_words_list.append(wd)

In [19]:
#run
# Globals
docs_size = len(rows)
vocab_size = len(vocab_words)

In [None]:
'''
  To write back to Clean File
'''
file = open('/content/drive/MyDrive/IR_Project/dataset/PubMed_Clean.csv', 'w', newline='')
writer = csv.writer(file)
writer.writerow(header)

for r in rows:
  writer.writerow(r)


In [20]:
#run
'''
  Create a dict to store {word: (ID, idf)} mapping
'''
dict_vocab = {}

for i,w in enumerate(vocab_words):
  dict_vocab[w]=(i,0)

In [None]:
for i,w in enumerate(vocab_words):
  if(i>5):
    break
  print(w)
print(vocab_size)

Estimates
Neuroticism
saves
postmortem
belief
inter-
59972


In [None]:
file = open('/content/drive/MyDrive/IR_Project/dataset/idf.csv', 'w', newline='') 
writer = csv.writer(file) 
writer.writerow(["word","idf"])
for wd in dict_vocab: 
  count = 0
  for i,doc in enumerate(rows):
    flag = 0 
    for w_t in title[i]:
      if(wd==w_t):
        count=count+1
        flag = 1
        break 

    if(flag==1):
      continue 

    for w_k in keyword[i]:
      if(wd==w_k):
        count=count+1
        flag = 1
        break

    if(flag==1):
      continue 

    for w_a in abstract[i]:
      if(wd==w_a):
        count=count+1
        flag = 1
        break

    if(flag==1):
      continue

  (id,idf) = dict_vocab[wd]
  if(count==0):
    idf = 0
  else:
    idf = math.log((docs_size/count),10) 
  dict_vocab[wd] = (id,idf)
  templine=[]
  templine.append(wd)
  templine.append(idf)
  writer.writerow(templine)
  print(id,templine)
  count = 0
writer.writerow("fine")
file.close()

In [21]:
#run
file = open('/content/drive/MyDrive/IR_Project/dataset/idf.csv',encoding='Latin1')
type(file)
csvreader = csv.reader(file)

header = []
header = next(csvreader)

In [22]:
#run
for val in dict_vocab: 
  r = next(csvreader)
  if(r[0] not in dict_vocab):
    continue
  if(r[1]=='i'):
    break
  (id,idf) = dict_vocab[r[0]]
  idf = float(r[1])
  dict_vocab[r[0]] = (id,idf)

In [None]:
for i,ele in enumerate(dict_vocab):
  if(i==5):
    break;
  
  print(ele,",",dict_vocab[ele])

Estimates , (0, 3.247138226100887)
Neuroticism , (1, 3.468986975717243)
saves , (2, 3.468986975717243)
postmortem , (3, 3.167956980053262)
belief , (4, 2.312639774857319)


In [228]:
#run
Adj_Matrix = np.zeros((docs_size+vocab_size,docs_size+vocab_size))
weights = []
row_list = []
col_list = []

In [229]:
print(Adj_Matrix.shape)

(68805, 68805)


In [230]:
#run
for i,doc in enumerate(rows):
  for w_k in keyword[i]:
    Adj_Matrix[i,dict_vocab[w_k][0]+docs_size] += wt_k
    Adj_Matrix[dict_vocab[w_k][0]+docs_size,i] += wt_k

  for w_t in title[i]:
    Adj_Matrix[i,dict_vocab[w_t][0]+docs_size] += wt_t
    Adj_Matrix[dict_vocab[w_t][0]+docs_size,i] += wt_t

  for w_a in abstract[i]:
    Adj_Matrix[i,dict_vocab[w_a][0]+docs_size] += wt_a
    Adj_Matrix[dict_vocab[w_a][0]+docs_size,i] += wt_a
  # # wt = 0
  # for w_k in keyword[i]:
  #   # wt += wt_k
  #   # weights.append(wt*dict_vocab[w_k][1])
  #   row_list.append(i)
  #   col_list.append(dict_vocab[w_k][0]+docs_size)
  #   x=pow(10,Adj_Matrix(i,dict_vocab[w_k][0]+docs_size)-1)
  #   Adj_Matrix[i][dict_vocab[w_k][0]+docs_size]=x+wt_k
  #   Adj_Matrix[i][dict_vocab[w_k][0]+docs_size]=1+math.log(Adj_Matrix[i][dict_vocab[w_k][0]+docs_size])
  # for w_t in title[i]:
  #   # wt += wt_t
  #   # weights.append(wt*dict_vocab[w_t][1])
  #   row_list.append(i)
  #   col_list.append(dict_vocab[w_t][0]+docs_size)
  # for w_a in abstract[i]:
  #   # wt += wt_a
  #   # weights.append(wt*dict_vocab[w_a][1])
  #   row_list.append(i)
  #   col_list.append(dict_vocab[w_a][0]+docs_size)


In [231]:
# run
for i in range(0,docs_size):
  for j in range(docs_size,Adj_Matrix.shape[0]):
    if(Adj_Matrix[i,j]!=0):
      wd = vocab_words_list[j-docs_size]
      idf = dict_vocab[wd][1]
      Adj_Matrix[i,j] = (1+math.log(Adj_Matrix[i,j],10))*idf
      Adj_Matrix[j,i]=Adj_Matrix[i,j]


In [None]:
# word co-occurence with context windows
'''
windows = []

for row in rows:
    content=row[2]+" "+row[3]+" "+row[4]
    words = content.split()
    length = len(words)
    if length <= window_size: 
        windows.append(words)
    else:
        # print(length, length - window_size + 1)
        for j in range(length - window_size + 1):
            window = words[j: j + window_size]
            windows.append(window)
            # print(window)

print(len(windows))

#calculating p(i) , word_window_freq has the number of windows a particular word appears in across all windows.
word_window_freq = {}
k = 0
for window in windows:
    appeared = set()
    for i in range(len(window)):
        if window[i] in appeared:
            continue
        if window[i] in word_window_freq:
            word_window_freq[window[i]] += 1
        else:
            word_window_freq[window[i]] = 1
        appeared.add(window[i])
    # print("k=",k)  
    # k+=1
print(len(word_window_freq))

word_pair_count = {}
k1=0
for window in windows:
    for i in range(1, len(window)):
        for j in range(0, i):
            word_i = window[i] #ith word in window
            word_i_id = dict_vocab[word_i][0]
            word_j = window[j] #jth word in range 0-i in the same window
            word_j_id = dict_vocab[word_j][0]
            if word_i_id == word_j_id:
                continue
            word_pair_str = str(word_i_id) + ',' + str(word_j_id) #concat id and use it to count a pair or p(i,j)
            if word_pair_str in word_pair_count: #word_pair_count stores number of pairs along with number of times they appear.
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
            # two orders
            word_pair_str = str(word_j_id) + ',' + str(word_i_id)
            if word_pair_str in word_pair_count:
                word_pair_count[word_pair_str] += 1
            else:
                word_pair_count[word_pair_str] = 1
    # k1+=1
    # print("k1=",k1)    
print(len(word_pair_count))
'''

# row = []
# col = []
# weight = []

# pmi as weights



829994
59972
13434816


In [None]:
'''
num_window = len(windows)

for key in word_pair_count:
    temp = key.split(',')
    i = int(temp[0])
    j = int(temp[1])
    # print("pair(i,j)=",i,j)
    count = word_pair_count[key] #p(i,j)
    word_freq_i = word_window_freq[vocab_words_list[i]] #p(i)
    word_freq_j = word_window_freq[vocab_words_list[j]] #p(j)
    pmi = math.log((1.0 * count / num_window) /(1.0 * word_freq_i * word_freq_j/(num_window * num_window)),10) #adj(i,j)
    if pmi <= 0:
        continue
    # Adj_Matrix[docs_size+i][docs_size+j]=pmi
    weights.append(pmi)
    row_list.append(docs_size+i)
    col_list.append(docs_size+j)
'''


In [None]:
print(len(weights))
print(len(row_list))

11465454
11465454


In [None]:
'''
file = open('/content/drive/MyDrive/IR_Project/dataset/pmi.csv', 'w', newline='') 
writer = csv.writer(file) 
writer.writerow(["weight","row","col"])
for i,weight_pmi in enumerate(weights):
  line=[]
  line.append(weights[i])
  line.append(row_list[i])
  line.append(col_list[i])
  writer.writerow(line)
file.close()
'''

In [232]:
weights=[]
row_list=[]
col_list=[]
print(len(weights))

0


In [233]:
#run
file = open('/content/drive/MyDrive/IR_Project/dataset/pmi.csv',encoding='Latin1')
type(file)
csvreader = csv.reader(file)

header = []
header = next(csvreader)


In [234]:
#run
weights=[]
row_list=[]
col_list=[]
count_skip=0
for r in csvreader:
    weights.append(float(r[0]))
    row_list.append(int(r[1]))
    col_list.append(int(r[2]))
file.close()
print(weights[:5])
print(row_list[:5])
print(col_list[:5])
print(len(weights))

[0.2854235499355359, 0.2854235499355359, 0.27660597339490123, 0.27660597339490123, 0.3417359872561016]
[33789, 34119, 33789, 62635, 62357]
[34119, 33789, 62635, 33789, 34119]
11465454


In [235]:
print(len(weights))
print(Adj_Matrix.shape)

11465454
(68805, 68805)


In [236]:
#run
from scipy.sparse import csr_matrix
Adj_Mat=csr_matrix(Adj_Matrix)

In [237]:
print(Adj_Mat.shape)

(68805, 68805)


In [238]:
# AdjMat1=Adj_Mat
print(docs_size)
print(len(vocab_words_list))

8833
59972


In [239]:
#run
Adj_Matrix = sp.csr_matrix((weights, (row_list, col_list)), shape=(docs_size+len(vocab_words_list), docs_size+len(vocab_words_list)))

# build symmetric adjacency matrix
Adj_Matrix = Adj_Matrix + Adj_Matrix.T.multiply(Adj_Matrix.T > Adj_Matrix) - Adj_Matrix.multiply(Adj_Matrix.T > Adj_Matrix)

In [240]:
print(Adj_Mat.shape)
print(Adj_Matrix.shape)

(68805, 68805)
(68805, 68805)


In [241]:
#run
Adj_Matrix=Adj_Mat+Adj_Matrix

In [242]:
#run
Adj_Matrix=csr_matrix(Adj_Matrix)

In [243]:
if torch.cuda.is_available:
  print('GPU available')
else:
  print('Please set GPU via Edit -> Notebook Settings.')


GPU available


In [244]:
print(len(test_labels))
print(test_labels.shape)

83
(83,)


In [245]:
def normalise(Adj_Matrix):
    """Symmetrically normalize adjacency matrix."""
    Adj_Matrix = sp.coo_matrix(Adj_Matrix)
    rowsum = np.array(Adj_Matrix.sum(1))
    D = np.power(rowsum, -0.5).flatten()
    D[np.isinf(D)] = 0.
    D_Factor = sp.diags(D)
    return Adj_Matrix.dot(D_Factor).transpose().dot(D_Factor).tocoo(), D
    
Adj_Matrix, D = normalise(Adj_Matrix + sp.eye(Adj_Matrix.shape[0]))

In [None]:
print(labels)
for i in range(0,labels.shape[0]):
  labels[i] = 1.0 - labels[i]
print(labels)

[0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 1 1
 1 0 1 0 0 1 1 1 0 0 0 1 1 0 1 1 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0]
[1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1
 1 1 0 0 1 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 1 1 1 1 0 0
 0 1 0 1 1 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 0 1 1 1]


In [None]:
from sklearn import metrics

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    print(contingency_matrix)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 


In [None]:
score = purity_score(all_labels,labels)
print("score =",score)

[[29 61]
 [35 81]]
score = 0.5631067961165048


In [246]:
def spToTensor(mx):
    """Convert a scipy sparse matrix to a torch sparse tensor."""
    mx = mx.tocoo().astype(np.float32)
    indices = torch.from_numpy(
        np.vstack((mx.row, mx.col)).astype(np.int64))
    values = torch.from_numpy(mx.data)
    shape = torch.Size(mx.shape)
    return torch.sparse.FloatTensor(indices, values, shape).to(device)

Adj_Matrix = spToTensor(Adj_Matrix)

In [247]:
torch.save(Adj_Matrix, '/content/drive/MyDrive/IR_Project/dataset/Adj931.pt')

In [248]:
#run
weights=[]
row_list=[]
col_list=[]
Adj_Mat=[]
windows=[]
word_window_freq = {}
word_pair_count = {}

In [None]:
xt = torch.FloatTensor(3,2)

In [None]:
print(xt)

tensor([[0.0000e+00, 0.0000e+00],
        [1.8788e+31, 1.7220e+22],
        [2.1715e-18, 2.6309e+20]])


In [294]:
class GraphConvolution(Module):
    """
    Simple GCN layer, similar to https://arxiv.org/abs/1609.02907
    """

    def __init__(self, in_features, out_features,  drop_out = 0, activation=None, bias=True):
        super(GraphConvolution, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        print(self.weight.size())
        if bias:
            self.bias = Parameter(torch.zeros(1, out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters(in_features, out_features)
        self.dropout = torch.nn.Dropout(drop_out)
        self.activation =  activation

    def reset_parameters(self,in_features, out_features):
        stdv = np.sqrt(6.0/(in_features+out_features))
        # stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        # if self.bias is not None:
        #     torch.nn.init.zeros_(self.bias)
            # self.bias.data.uniform_(-stdv, stdv)


    def forward(self, input, adj, feature_less = False):
        if feature_less:
            support = self.weight
            support = self.dropout(support)
        else:
            input = self.dropout(input)
            support = torch.mm(input, self.weight)
        output = torch.spmm(adj, support)
        if self.bias is not None:
            output = output + self.bias
        if self.activation is not None:
            output = self.activation(output)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

In [295]:
class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass, dropout, n_layers = 2):
        super(GCN, self).__init__()
        self.n_layers = n_layers
        self.gc_list = []
        if n_layers >= 2:
            self.gc1 = GraphConvolution(nfeat, nhid, dropout, activation = nn.ReLU())
            self.gc_list = nn.ModuleList([GraphConvolution(nhid, nhid, dropout, activation = nn.ReLU()) for _ in range(self.n_layers-2)])
            self.gcf = GraphConvolution(nhid, nclass, dropout)
        else:
            self.gc1 = GraphConvolution(nfeat, nclass, dropout)

    def forward(self, x, adj):
        if self.n_layers>=2:
            x = self.gc1(x, adj, feature_less = True)
            for i in range(self.n_layers-2):
                x = self.gc_list[i](x,adj)
            x = self.gcf(x,adj)
        else:
            x = self.gc1(x, adj, feature_less = True)
        return x

In [296]:
def cal_accuracy(predictions,labels):
    pred = torch.argmax(predictions,-1).cpu().tolist()
    lab = labels.cpu().tolist()
    cor = 0
    for i in range(len(pred)):
        if pred[i] == lab[i]:
            cor += 1
    return cor/len(pred)

In [297]:
criterion = nn.CrossEntropyLoss()

model = GCN(nfeat=docs_size+vocab_size, nhid=HIDDEN_DIM, nclass=2, dropout=DROP_OUT,n_layers=NUM_LAYERS).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)

torch.Size([68805, 440])
torch.Size([440, 2])


In [299]:
train_size=train_N
test_size=test_N


In [300]:
features = np.arange(docs_size+vocab_size)
features = torch.FloatTensor(features).to(device)

In [301]:
def generate_train_val(train_pro=0.9):
    real_train_size = int(train_pro*train_size)
    val_size = train_size-real_train_size

    idx_train = np.random.choice(train_size, real_train_size,replace=False)
    idx_train.sort()
    idx_val = []
    pointer = 0
    for v in range(train_size):
        if pointer<len(idx_train) and idx_train[pointer] == v:
            pointer +=1
        else:
            idx_val.append(v)
    idx_test = range(train_N, train_N+test_N)
    return idx_train, idx_val, idx_test

idx_train, idx_val, idx_test = generate_train_val()

In [302]:
import time

def train_model(show_result = True):
    val_loss = []
    l_calc = 1
    for epoch in tqdm(np.arange(NUM_EPOCHS)):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        output= model(features, Adj_Matrix)
        loss_train = criterion(output[idx_train], labels[idx_train])
        acc_train = cal_accuracy(output[idx_train], labels[idx_train])
        loss_train.backward()
        optimizer.step()

        model.eval()
        output = model(features, Adj_Matrix)

        loss_val = criterion(output[idx_val], labels[idx_val])
        val_loss.append(loss_val.item())
        acc_val = cal_accuracy(output[idx_val], labels[idx_val])
        if show_result:
            print(  'Epoch: {:04d}'.format(epoch+1),
                    'loss_train: {:.4f}'.format(loss_train.item()),
                    'acc_train: {:.4f}'.format(acc_train),
                    'loss_val: {:.4f}'.format(loss_val.item()),
                    'acc_val: {:.4f}'.format(acc_val),
                    'time: {:.4f}s'.format(time.time() - t))
        # if epoch%10==0:
        #   torch.save(model.state_dict(), "/content/drive/MyDrive/IR_Project/dataset/model_latest.pth")
        if epoch > EARLY_STOPPING and np.min(val_loss[-EARLY_STOPPING:]) > np.min(val_loss[:-EARLY_STOPPING]) :
            if show_result:
                print("Early Stopping...")
            break
        l_calc+=1
    return l_calc
l_calc = train_model()

  0%|          | 0/200 [00:00<?, ?it/s]

Epoch: 0001 loss_train: 0.6931 acc_train: 0.4806 loss_val: 0.6875 acc_val: 0.7333 time: 1.2032s
Epoch: 0002 loss_train: 0.6812 acc_train: 0.9302 loss_val: 0.6602 acc_val: 0.6667 time: 1.1578s
Epoch: 0003 loss_train: 0.6578 acc_train: 0.5271 loss_val: 0.6869 acc_val: 0.6000 time: 1.1592s
Epoch: 0004 loss_train: 0.6247 acc_train: 0.8605 loss_val: 0.6514 acc_val: 0.8000 time: 1.1618s
Epoch: 0005 loss_train: 0.5737 acc_train: 1.0000 loss_val: 0.6040 acc_val: 0.8000 time: 1.1633s
Epoch: 0006 loss_train: 0.5222 acc_train: 0.9612 loss_val: 0.6043 acc_val: 0.8000 time: 1.1670s
Epoch: 0007 loss_train: 0.4551 acc_train: 1.0000 loss_val: 0.6135 acc_val: 0.7333 time: 1.1715s
Epoch: 0008 loss_train: 0.3927 acc_train: 0.9922 loss_val: 0.5597 acc_val: 0.8000 time: 1.1750s
Epoch: 0009 loss_train: 0.3233 acc_train: 1.0000 loss_val: 0.5089 acc_val: 0.8667 time: 1.1794s
Epoch: 0010 loss_train: 0.2643 acc_train: 1.0000 loss_val: 0.5164 acc_val: 0.8000 time: 1.1792s
Epoch: 0011 loss_train: 0.2044 acc_train

In [303]:
print(len(test_labels))
print(len(train_labels))

62
144


In [304]:
from sklearn.metrics import f1_score, accuracy_score
def test():
    model.eval()
    output = model(features, Adj_Matrix)
    predictions = torch.argmax(output[idx_test],-1).cpu().tolist()
    acc = accuracy_score(test_labels,predictions)
    f11 = f1_score(test_labels,predictions, average='macro')
    f12 = f1_score(test_labels,predictions, average = 'weighted')
    return acc, f11, f12

(acc, f11, f12) = test()
print("Accuracy =",acc)
print("Macro F-1 =",f11)
print("Weighted =",f12)

Accuracy = 0.8225806451612904
Macro F-1 = 0.8168143969916735
Weighted = 0.8252016670565708


In [305]:
file = open('/content/drive/MyDrive/IR_Project/dataset/Analysis.csv', 'a', newline='') 
writer = csv.writer(file) 
# writer.writerow(["NUM_LAYERS","HIDDEN_DIM","DROP_OUT","LR","WEIGHT_DECAY","EARLY_STOPPING","NUM_EPOCHS","train_ratio","test_ratio","wt_k","wt_t","wt_a","window_size","Accuracy","F-1 Macro","F-1 Weighted","epochs_completed"])
writer.writerow([NUM_LAYERS,HIDDEN_DIM,DROP_OUT,LR,WEIGHT_DECAY,EARLY_STOPPING,NUM_EPOCHS,train_ratio,test_ratio,wt_k,wt_t,wt_a,window_size,acc,f11,f12,l_calc])
file.close()


In [None]:
doc_vectors = []

for doc in rows:
  vec = nlp(doc[1]+" "+doc[2]+" "+doc[3])
  doc_vectors.append(vec.vector)

In [None]:
idx_unlabelled = range(N,N+200)
r = list(idx_unlabelled)
file = open('/content/drive/MyDrive/IR_Project/dataset/Unlabelled.csv', 'a', newline='') 
writer = csv.writer(file)
# writer.writerow(["ID","Pred"])
def pred_unlabelled():
  model.eval()
  output = model(features, Adj_Matrix)
  preds = torch.argmax(output[idx_unlabelled],-1).cpu().tolist()
  for i in r:
    writer.writerow([i,preds[i-N]])
  return

pred_unlabelled()
file.close()

In [None]:
type(idx_unlabelled)
print(r)

[206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405]

In [None]:
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

X = doc_vectors[:N] #KMeans
km = KMeans(n_clusters=2)
km.fit(X)
km.predict(X)
labels = km.labels_

