In [None]:
import torch
from torch import nn
from torch import optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn as sk
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import copy
from numpy import dtype
from torch import nn, tensor
from torch import optim
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')


#load train and test json data
train_data = pd.read_json('train.json')
test_data = pd.read_json('test.json')
print(train_data.head())

In [None]:
#split the authors feature in train data into coauthors and p_authors where p_authors is the author that < 100
new_train_data = pd.DataFrame(columns=['id','coauthors','p_author','year','title','abstract','venue'])

#using list to store the paper information
the_columns = ['id','coauthors','p_author','title','abstract']
id_list = []
coauthors_list = []
p_author_list = []
title_list = []
abstract_list = []
year_list = []
venue_list = []

for i in range(len(train_data)):
    paper_info = train_data.iloc[i]
    authors = paper_info['authors']
    venue = paper_info['venue']
    abstract = paper_info['abstract']
    title = paper_info['title']
    year = paper_info['year']

    coauthors = []
    p_authors = []
    for author in authors:
        if author < 100:
            p_authors.append(author)
        else:
            coauthors.append(author)
    
    if p_authors == []:
        p_authors.append(-1)
    
    if len(coauthors) != 0:
        #not use append to append a list to a dataframe choose a faster way
        id_list.append(i)
        coauthors_list.append(coauthors)
        p_author_list.append(p_authors)
        title_list.append(title)
        abstract_list.append(abstract)
        year_list.append(year)
        venue_list.append(venue)

#transform the list to dic
new_train_data_dic = {'id':id_list,'coauthors':coauthors_list,'p_author':p_author_list,'title':title_list,'abstract':abstract_list,'year':year_list,'venue':venue_list}

#transform the dic to dataframe
new_train_data = pd.DataFrame(new_train_data_dic)

print(new_train_data[0:10])

In [None]:
#using one hot encoding on the prolific author in the train data using sklearn
from sklearn.preprocessing import OneHotEncoder

id_list = []
coauthors_list = []
p_author_list = []
title_list = []
abstract_list = []
year_list = []
venue_list = []

one_hot_training = ['id','coauthors','p_author','title','abstract','year','venue']
for i in range(len(new_train_data)):
    if i % 100 == 0:
        print(i)
        print('left to process: ',len(new_train_data)-i)
    id = new_train_data.iloc[i]['id']
    p_author = new_train_data.iloc[i]['p_author']
    coauthors = new_train_data.iloc[i]['coauthors']
    title = new_train_data.iloc[i]['title']
    abstract = new_train_data.iloc[i]['abstract']
    year = new_train_data.iloc[i]['year']
    venue = new_train_data.iloc[i]['venue']

    for p in p_author:
        id_list.append(id)
        coauthors_list.append(coauthors)
        p_author_list.append(p)
        title_list.append(title)
        abstract_list.append(abstract)
        year_list.append(year)
        venue_list.append(venue)

#create a new dic to store the new data
new_train_data_dic = {'id':id_list,'coauthors':coauthors_list,'p_author':p_author_list,'title':title_list,'abstract':abstract_list,'year':year_list,'venue':venue_list}
#convert the dic to dataframe
new_train_data = pd.DataFrame(new_train_data_dic)


In [268]:
#one hot encode using pd get_dummies
one_hot_training = pd.get_dummies(new_train_data, columns = ['p_author'])
#remove all rows that have empty venue
one_hot_training = one_hot_training[one_hot_training['venue'] != '']

#convert the abstract to doc2vec
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(one_hot_training['abstract'])]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)
#convert the abstract to doc2vec
abstract_list = []
for i in range(len(one_hot_training)):
    #print(model.docvecs[i])
    abstract_list.append(model.docvecs[i])
#add the abstract to the dataframe
one_hot_training['abstract'] = abstract_list

#convert the coauthors to 5 demension and fill the empty coauthors with -1
coauthors_list = []
for i in range(len(one_hot_training)):
    coauthors = one_hot_training.iloc[i]['coauthors']
    if len(coauthors) < 5:
        coauthors = coauthors + [-1] * (5 - len(coauthors))
    else:
        #only add the first five coauthors
        coauthors = coauthors[0:5]
    coauthors_list.append(coauthors)
one_hot_training['coauthors'] = coauthors_list

#convert the title to doc2vec
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(one_hot_training['title'])]
model = Doc2Vec(documents, vector_size=5, window=2, min_count=1, workers=4)

#convert the abstract to doc2vec
title_list = []
for i in range(len(one_hot_training)):
    #print(model.docvecs[i])
    title_list.append(model.docvecs[i])
one_hot_training['title'] = title_list

#convert the venue and year to 5 demension and fill the empty using copy of itself
venue_list = []
year_list = []
for i in range(len(one_hot_training)):
    venue = one_hot_training.iloc[i]['venue']
    year = one_hot_training.iloc[i]['year']
    venue_list.append([venue] * 5)
    year_list.append([year] * 5)
one_hot_training['venue'] = venue_list
one_hot_training['year'] = year_list



In [283]:
#split the data into train and test
X_train, X_test, y_train, y_test = train_test_split(one_hot_training[['coauthors','title','abstract','year','venue']], one_hot_training.drop(['id','coauthors','title','abstract','year','venue'],axis=1), test_size=0.2, random_state=42)

epochs = 100
batch_size = 100
learning_rate = 0.001

def convert_to_tensor(training_set):
    all_train = []

    for i in range(len(training_set)):
        if i % 1000 == 0:
            print(i)
            print('left to process: ',len(training_set)-i)

        coauthors = training_set.iloc[i]['coauthors']
        #convert to a float list
        coauthors = [float(x) for x in coauthors]

        title = training_set.iloc[i]['title']
        #convert to a float list
        title = [float(x) for x in title]

        abstract = training_set.iloc[i]['abstract']
        #convert to a float list
        abstract = [float(x) for x in abstract]

        year = training_set.iloc[i]['year']
        #convert to a float list
        year = [float(x) for x in year]

        venue = training_set.iloc[i]['venue']
        #convert to a float list
        venue = [float(x) for x in venue]
        
        all_train.append([coauthors,title,abstract,year,venue])

    x = torch.tensor(all_train)

    return x


#create a MLP model with 1 hidden layer and softmax as the output layer
class MLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden = nn.Linear(input_size, hidden_size)
        self.output = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        x = F.relu(self.hidden(x))
        x = self.output(x)
        x = self.softmax(x)
        return x

    def train(self, train_tensor, label_train_tensor, iterations):
        loss_func = nn.CrossEntropyLoss()
        optimizer = optim.SGD(self.parameters(), lr=0.01)

        for epoch in range(epochs):
            train_loss = 0
            for i in range(len(train_tensor)):
                optimizer.zero_grad()
                output = self(tensor_train[i][0] + tensor_train[i][1] + tensor_train[i][2] + tensor_train[i][3] + tensor_train[i][4])
                loss = loss_func(output, label_tensor[i])
                loss.backward()
                optimizer.step()
                train_loss += loss.item()
            print('Epoch = ',epoch,'Train loss =',train_loss)

model = MLP(5,20,101)
tensor_train = convert_to_tensor(X_train)

#convert the y_train to tensor
label_train_c = []
for i in range(len(y_train)):
    #convert the row to a list
    y_train_row = np.array(y_train.iloc[i].tolist())
    label_train_c.append(y_train_row)
label_tensor = torch.tensor(label_train_c)

#training the NN






0
left to process:  20095
1000
left to process:  19095
2000
left to process:  18095
3000
left to process:  17095
4000
left to process:  16095
5000
left to process:  15095
6000
left to process:  14095
7000
left to process:  13095
8000
left to process:  12095
9000
left to process:  11095
10000
left to process:  10095
11000
left to process:  9095
12000
left to process:  8095
13000
left to process:  7095
14000
left to process:  6095
15000
left to process:  5095
16000
left to process:  4095
17000
left to process:  3095
18000
left to process:  2095
19000
left to process:  1095
20000
left to process:  95


TypeError: MLP.train() takes 4 positional arguments but 5 were given

In [284]:
model.train(tensor_train,label_tensor,10000)


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [None]:
#change the test data to the same shape as train data
