In [27]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB,GaussianNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
import pickle
from sklearn.ensemble import RandomForestClassifier

In [2]:
X_train=pd.read_csv("../dataset/X_train.csv")
X_test=pd.read_csv("../dataset/X_test.csv")
y_train=pd.read_csv("../dataset/y_train.csv")
y_test=pd.read_csv("../dataset/y_test.csv")

In [3]:
X_test.shape

(4195, 2)

In [4]:
X_train_title=X_train['TITLE']
X_train_abstract=X_train['ABSTRACT']

In [5]:
cv_title=CountVectorizer()

In [6]:
title_feature=cv_title.fit_transform(X_train_title)

In [7]:
print(len(cv_title.vocabulary_))
print(cv_title.vocabulary_)

12505
{'number': 7727, 'rich': 9468, 'word': 12230, 'partial': 8117, 'invers': 5522, 'problem': 8735, 'sturm': 10682, 'liouvil': 6194, 'oper': 7863, 'graph': 4593, 'loop': 6277, 'mind': 6920, 'gap': 4357, 'well': 12136, 'log': 6250, 'data': 2747, 'analysi': 684, 'hyper': 5101, 'suprim': 10887, 'cam': 1712, 'softwar': 10308, 'pipelin': 8417, 'electr': 3446, 'transient': 11394, 'law': 6019, 'neuron': 7496, 'microdomain': 6833, 'base': 1163, 'electro': 3449, 'diffus': 3059, 'evolutionari': 3737, 'central': 1882, 'maxim': 6635, 'cliqu': 2101, 'mobil': 7008, 'social': 10293, 'network': 7481, 'cyclic': 2699, 'weight': 12127, 'ellp': 3492, 'space': 10353, 'coupl': 2525, 'ident': 5195, 'local': 6237, 'fermion': 3978, 'chain': 1928, 'quasi': 8986, 'random': 9084, 'disord': 3140, 'densiti': 2928, 'repres': 9344, 'diagon': 3012, 'form': 4166, 'larg': 5986, 'degre': 2874, 'geodet': 4439, 'hull': 5066, 'hard': 4747, 'chordal': 2023, 'fast': 3891, 'autom': 1021, 'strong': 10667, 'gravit': 4618, 'len

In [8]:
print(title_feature.shape)

(16777, 12505)


In [9]:
title_feature_test=cv_title.transform(X_test['TITLE'])

In [10]:
cv_abstract=CountVectorizer(ngram_range=(1,3),min_df=4)
abstract_feature=cv_abstract.fit_transform(X_train_abstract)

In [11]:
print(len(cv_abstract.vocabulary_))
print(cv_abstract.vocabulary_)

72501
{'finit': 23900, 'word': 71902, 'length': 34361, 'contain': 12279, 'n1': 40652, 'distinct': 17690, 'factor': 22798, 'bound': 7317, 'reach': 52792, 'call': 7894, 'rich': 55434, 'number': 42788, 'alphabet': 2765, 'cardin': 8116, 'denot': 15249, 'rn': 55563, 'binari': 7000, 'deduc': 14727, 'constant': 12039, 'prove': 51166, 'infti': 31130, 'ie': 29444, 'subexponenti': 63139, 'growth': 27895, 'finit word': 23996, 'word length': 71917, 'binari alphabet': 7001, 'sturm': 63085, 'liouvil': 35125, 'oper': 44333, 'singular': 59412, 'potenti': 48178, 'lasso': 33551, 'graph': 27401, 'consid': 11763, 'suppos': 63673, 'known': 32943, 'priori': 49298, 'boundari': 7488, 'edg': 18777, 'recov': 53237, 'loop': 35558, 'part': 46005, 'spectrum': 60923, 'addit': 1444, 'data': 13853, 'uniqu': 68474, 'theorem': 65563, 'provid': 51300, 'construct': 12164, 'algorithm': 2101, 'solut': 59955, 'partial': 46039, 'invers': 32148, 'problem': 49407, 'sturm liouvil': 63086, 'graph consid': 27430, 'known priori': 

In [12]:
print(abstract_feature.shape)

(16777, 72501)


In [13]:
abstract_feature_test=cv_abstract.transform(X_test['ABSTRACT'])

In [14]:
print(abstract_feature_test.shape)

(4195, 72501)


In [15]:
print(title_feature_test.shape)

(4195, 12505)


In [16]:
X_train_final=hstack((title_feature,abstract_feature))
X_test_final=hstack((title_feature_test,abstract_feature_test))

In [17]:
y_train=y_train.values

In [18]:
y_train.shape

(16777, 6)

In [19]:
y_test=y_test.values

In [20]:
y_test.shape

(4195, 6)

## naive Bayes

In [21]:
classifier = OneVsRestClassifier(MultinomialNB())
classifier.fit(X_train_final, y_train)
predictions = classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6188319427890345
Hamming loss  0.08247914183551848
Micro-average quality numbers
Precision: 0.7655, Recall: 0.8721, F1-measure: 0.8153
Macro-average quality numbers
Precision: 0.7866, Recall: 0.6472, F1-measure: 0.6322


In [22]:
nb_classifier = OneVsRestClassifier(BernoulliNB())
nb_classifier.fit(X_train_final, y_train)
predictions = nb_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6071513706793802
Hamming loss  0.0867699642431466
Micro-average quality numbers
Precision: 0.7709, Recall: 0.8314, F1-measure: 0.8000
Macro-average quality numbers
Precision: 0.5177, Recall: 0.5719, F1-measure: 0.5410


  _warn_prf(average, modifier, msg_start, len(result))


## Logistic regression

In [23]:
lr_classifier = OneVsRestClassifier(LogisticRegression(max_iter=500,class_weight='balanced'))
lr_classifier.fit(X_train_final, y_train)
predictions = lr_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6405244338498212
Hamming loss  0.07957886372665872
Micro-average quality numbers
Precision: 0.8061, Recall: 0.8148, F1-measure: 0.8104
Macro-average quality numbers
Precision: 0.7286, Recall: 0.7159, F1-measure: 0.7214


## SVM

In [24]:
svm_classifier = OneVsRestClassifier(LinearSVC())
svm_classifier.fit(X_train_final, y_train)
predictions = svm_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6240762812872467
Hamming loss  0.08398887564560985
Micro-average quality numbers
Precision: 0.8102, Recall: 0.7805, F1-measure: 0.7951
Macro-average quality numbers
Precision: 0.7510, Recall: 0.6598, F1-measure: 0.6960


In [29]:
svm_classifier = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced'))
svm_classifier.fit(X_train_final, y_train)
predictions = svm_classifier.predict (X_test_final)


print("Accuracy :",metrics.accuracy_score(y_test, predictions))
print("Hamming loss ",metrics.hamming_loss(y_test,predictions))


precision = metrics.precision_score(y_test, predictions, average='micro')
recall = metrics.recall_score(y_test, predictions, average='micro')
f1 = metrics.f1_score(y_test, predictions, average='micro')
 
print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

precision = metrics.precision_score(y_test, predictions, average='macro')
recall = metrics.recall_score(y_test, predictions, average='macro')
f1 = metrics.f1_score(y_test, predictions, average='macro')
 
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6131108462455304
Hamming loss  0.08585617798967024
Micro-average quality numbers
Precision: 0.8328, Recall: 0.7366, F1-measure: 0.7817
Macro-average quality numbers
Precision: 0.5643, Recall: 0.4963, F1-measure: 0.5251


  _warn_prf(average, modifier, msg_start, len(result))


## Best model out of all three is Logistic Regression

In [26]:
with open('../models/count_vectorizer_lr.pkl', 'wb') as fid:
    pickle.dump(lr_classifier, fid)

## saved the model