In [110]:
import pandas as pd
import numpy as np
import gensim
import re
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import tqdm
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import pickle


In [73]:
df=pd.read_csv('../dataset/train.csv')

In [74]:
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [75]:
df=df.drop('ID',axis=1)

In [76]:
def remove_misc(text):
    text=re.sub(r'\n', ' ', text)
    text=re.sub(r'-', ' ', text)
    text=re.sub(r'\s+', ' ', text)
    text=re.sub(r'\([A-za-z\d]*\)', '', text)
    text=text.strip()
    return text

In [77]:
df['TITLE'] = df['TITLE'].apply(remove_misc)
df['ABSTRACT'] = df['ABSTRACT'].apply(remove_misc)

In [78]:
df['TITLE'] = df['TITLE'].apply(lambda x:x.lower())
df['ABSTRACT'] = df['ABSTRACT'].apply(lambda x:x.lower())

In [79]:
df.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,reconstructing subject specific effect maps,predictive models allow subject specific infer...,1,0,0,0,0,0
1,rotation invariance neural network,rotation invariance and translation invariance...,1,0,0,0,0,0
2,spherical polyharmonics and poisson kernels fo...,we introduce and develop the notion of spheric...,0,0,1,0,0,0
3,a finite element approximation for the stochas...,the stochastic landau lifshitz gilbert equati...,0,0,1,0,0,0
4,comparative study of discrete wavelet transfor...,fourier transform infra red spectra of sample...,1,0,0,1,0,0


In [80]:
sw_list = stopwords.words('english')

df['TITLE'] = df['TITLE'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))
df['ABSTRACT'] = df['ABSTRACT'].apply(lambda x: [item for item in x.split() if item not in sw_list]).apply(lambda x:" ".join(x))

In [81]:
df.head()

Unnamed: 0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,reconstructing subject specific effect maps,predictive models allow subject specific infer...,1,0,0,0,0,0
1,rotation invariance neural network,rotation invariance translation invariance gre...,1,0,0,0,0,0
2,spherical polyharmonics poisson kernels polyha...,introduce develop notion spherical polyharmoni...,0,0,1,0,0,0
3,finite element approximation stochastic maxwel...,stochastic landau lifshitz gilbert equation co...,0,0,1,0,0,0
4,comparative study discrete wavelet transforms ...,fourier transform infra red spectra samples 7 ...,1,0,0,1,0,0


In [None]:
title = []
for doc in df['TITLE']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        title.append(simple_preprocess(sent))
abstract=[]
for doc in df['ABSTRACT']:
    raw_sent = sent_tokenize(doc)
    for sent in raw_sent:
        abstract.append(simple_preprocess(sent))

In [None]:
model_title = gensim.models.Word2Vec(
    window=4,
    min_count=2
)

model_abstract= gensim.models.Word2Vec(
    window=7,
    min_count=4
)

In [None]:
model_title.build_vocab(title)
model_abstract.build_vocab(abstract)

In [None]:
model_title.train(title, total_examples=model_title.corpus_count, epochs=100)
model_abstract.train(abstract, total_examples=model_abstract.corpus_count, epochs=100)

In [82]:
model_abstract=gensim.models.Word2Vec.load('Word2vec Model/model_abstract.model')
model_title=gensim.models.Word2Vec.load('Word2vec Model/model_title.model')

In [None]:
model_title.save('Word2vec Model/model_title.model')

In [None]:
model_abstract.save('Word2vec Model/model_abstract.model')

In [83]:

def document_vector(doc,model,i):
    doc = [word for word in doc.split() if word in model.wv.index_to_key]
    if len(doc)==0:
        return np.zeros((100))
    return np.mean(model.wv[doc], axis=0)

In [84]:
X_abstract = []
for i,doc in tqdm.tqdm(enumerate(df['ABSTRACT'].values)):
    X_abstract.append(document_vector(doc,model_abstract,i))

20972it [02:30, 139.18it/s]


In [85]:
X_title= []
for i,doc in tqdm.tqdm(enumerate(df['TITLE'].values)):
    X_title.append(document_vector(doc,model_abstract,i))

20972it [00:09, 2308.93it/s]


In [86]:
X_train_abstract=np.array(X_abstract)

In [87]:
X_train_title=np.array(X_title)

In [88]:
X_train_abstract.shape

(20972, 100)

In [89]:
X_train_title.shape

(20972, 100)

In [90]:
X_train=np.hstack((X_train_title,X_train_abstract))

In [91]:
X_train.shape

(20972, 200)

In [92]:
y_train=df[['Computer Science','Physics','Mathematics','Statistics','Quantitative Biology','Quantitative Finance']]

In [93]:
y_train=y_train.values

In [94]:
y_train.shape

(20972, 6)

In [95]:
X_train_final,X_test_final,y_train,y_test=train_test_split(X_train,y_train,test_size=0.2)

## Logistic regression Model

In [104]:
lr_classifier = OneVsRestClassifier(LogisticRegression(penalty='elasticnet',max_iter=1000,solver='saga',l1_ratio=0.5))
lr_classifier.fit(X_train_final, y_train)
predictions = lr_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6491060786650775
Hamming loss  0.07747318235995232
Micro-average quality numbers
Precision: 0.8306, Recall: 0.7894, F1-measure: 0.8095
Macro-average quality numbers
Precision: 0.7431, Recall: 0.6558, F1-measure: 0.6927


In [99]:
C=[0.1,0.01,0.001,1,10,100]
penalty=['l1', 'l2', 'elasticnet', 'none']
for c in C:
    print("For C ",c)
    lr_classifier = OneVsRestClassifier(LogisticRegression(C=c,max_iter=500))
    lr_classifier.fit(X_train_final, y_train)
    predictions = lr_classifier.predict (X_test_final)


    print("Accuracy :",metrics.accuracy_score(y_test, predictions))


    precision = metrics.precision_score(y_test, predictions, average='micro')
    recall = metrics.recall_score(y_test, predictions, average='micro')
    f1 = metrics.f1_score(y_test, predictions, average='micro')
 
    print("Micro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

    precision = metrics.precision_score(y_test, predictions, average='macro')
    recall = metrics.recall_score(y_test, predictions, average='macro')
    f1 = metrics.f1_score(y_test, predictions, average='macro')
 
    print("Macro-average quality numbers")
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
        
    print('\n\n')

For C  0.1
Accuracy : 0.6500595947556616
Micro-average quality numbers
Precision: 0.8328, Recall: 0.7879, F1-measure: 0.8098
Macro-average quality numbers
Precision: 0.7537, Recall: 0.6439, F1-measure: 0.6871



For C  0.01
Accuracy : 0.6483909415971395
Micro-average quality numbers
Precision: 0.8370, Recall: 0.7816, F1-measure: 0.8084
Macro-average quality numbers
Precision: 0.7684, Recall: 0.6055, F1-measure: 0.6567



For C  0.001
Accuracy : 0.6410011918951132
Micro-average quality numbers
Precision: 0.8388, Recall: 0.7607, F1-measure: 0.7978
Macro-average quality numbers
Precision: 0.8286, Recall: 0.5238, F1-measure: 0.5528



For C  1
Accuracy : 0.6491060786650775
Micro-average quality numbers
Precision: 0.8311, Recall: 0.7893, F1-measure: 0.8096
Macro-average quality numbers
Precision: 0.7434, Recall: 0.6571, F1-measure: 0.6941



For C  10
Accuracy : 0.6491060786650775
Micro-average quality numbers
Precision: 0.8302, Recall: 0.7898, F1-measure: 0.8095
Macro-average quality numbe

In [105]:
lr_classifier = OneVsRestClassifier(LogisticRegression(C=1,max_iter=500))
lr_classifier.fit(X_train_final, y_train)
predictions = lr_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))
        
print('\n\n')

Accuracy : 0.6491060786650775
Micro-average quality numbers
Precision: 0.8311, Recall: 0.7893, F1-measure: 0.8096
Macro-average quality numbers
Precision: 0.7434, Recall: 0.6571, F1-measure: 0.6941





In [107]:
classifier = OneVsRestClassifier(LinearSVC(max_iter=3000))
classifier.fit(X_train_final, y_train)
predictions = classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))



Accuracy : 0.6498212157330155
Hamming loss  0.07711561382598331
Micro-average quality numbers
Precision: 0.8334, Recall: 0.7875, F1-measure: 0.8098
Macro-average quality numbers
Precision: 0.7495, Recall: 0.6404, F1-measure: 0.6825




In [108]:
rf_classifier = OneVsRestClassifier(RandomForestClassifier())
classifier.fit(X_train_final, y_train)
predictions = classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))



Accuracy : 0.6493444576877235
Hamming loss  0.07719507350019865
Micro-average quality numbers
Precision: 0.8335, Recall: 0.7870, F1-measure: 0.8096
Macro-average quality numbers
Precision: 0.7496, Recall: 0.6400, F1-measure: 0.6823




In [111]:
with open('../models/word2vec_lr.pkl', 'wb') as fid:
    pickle.dump(lr_classifier, fid)