In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn import metrics
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import pickle

In [2]:
# Read the pre-split feature and label tables from the dataset folder, in the
# order: train features, test features, train labels, test labels.
X_train, X_test, y_train, y_test = [
    pd.read_csv(csv_path)
    for csv_path in (
        "../dataset/X_train.csv",
        "../dataset/X_test.csv",
        "../dataset/y_train.csv",
        "../dataset/y_test.csv",
    )
]

In [3]:
# Pull the two free-text columns out of the training frame; each one is
# vectorized separately further down.
X_train_title, X_train_abstract = X_train['TITLE'], X_train['ABSTRACT']

In [4]:
# Fit a default TF-IDF vectorizer on the training titles only and build the
# training title matrix in the same pass.
tf_idf_title = TfidfVectorizer()
title_feature = tf_idf_title.fit_transform(X_train_title)

In [5]:
# Sanity check: dimensions of the training title TF-IDF matrix.
title_feature_shape = title_feature.shape
print(title_feature_shape)

(16777, 12505)

In [6]:
# Encode the test titles with the vectorizer fitted on the training titles
# (transform only, so the test text does not influence the vocabulary).
X_test_title = X_test['TITLE']
title_feature_test = tf_idf_title.transform(X_test_title)

In [10]:
# Abstracts get their own vectorizer: word 1- to 3-grams, with min_df=3 so
# that n-grams occurring in fewer than three training abstracts are dropped.
tf_idf_abstract = TfidfVectorizer(
    min_df=3,
    ngram_range=(1, 3),
)
abstract_feature = tf_idf_abstract.fit_transform(X_train_abstract)

In [11]:
# Sanity check: dimensions of the training abstract TF-IDF matrix.
abstract_feature_shape = abstract_feature.shape
print(abstract_feature_shape)

(16777, 115133)


In [12]:
# Encode the test abstracts with the vectorizer fitted on the training
# abstracts (transform only, no refitting).
X_test_abstract = X_test['ABSTRACT']
abstract_feature_test = tf_idf_abstract.transform(X_test_abstract)

In [13]:
# Sanity check: the test matrices should have the same column counts as the
# corresponding training matrices (abstract first, then title).
for feature_matrix in (abstract_feature_test, title_feature_test):
    print(feature_matrix.shape)

(4195, 115133)
(4195, 12505)


In [14]:
# Concatenate the title and abstract features column-wise. The block order
# (title, then abstract) is the same for train and test so columns line up.
X_train_final = hstack([title_feature, abstract_feature])
X_test_final = hstack([title_feature_test, abstract_feature_test])

In [16]:
# Sanity check: train and test design matrices must share the same width.
for design_matrix in (X_train_final, X_test_final):
    print(design_matrix.shape)

(16777, 127638)
(4195, 127638)


In [17]:
# Convert the training labels to a plain NumPy array for scikit-learn.
# np.asarray is used instead of `.values` so this cell is idempotent:
# re-running it on the already-converted array is a no-op rather than an
# AttributeError (ndarray has no `.values`).
y_train = np.asarray(y_train)

In [18]:
# Convert the test labels to a plain NumPy array for scikit-learn.
# np.asarray is used instead of `.values` so this cell is idempotent:
# re-running it on the already-converted array is a no-op rather than an
# AttributeError (ndarray has no `.values`).
y_test = np.asarray(y_test)

In [19]:
# Baseline 1: one Multinomial Naive Bayes model per label (one-vs-rest),
# trained on the stacked title + abstract TF-IDF features.
classifier = OneVsRestClassifier(MultinomialNB())
classifier.fit(X_train_final, y_train)
predictions = classifier.predict(X_test_final)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Report precision / recall / F1 under both averaging schemes.
# zero_division=0 makes the handling of undefined scores explicit: the
# recorded run of this cell emitted scikit-learn's undefined-metric warning,
# in which case the library already scores the affected label as 0. Passing
# the value explicitly keeps the numbers the same and silences the warning.
for average, heading in (
    ("micro", "Micro-average quality numbers"),
    ("macro", "Macro-average quality numbers"),
):
    precision = metrics.precision_score(
        y_test, predictions, average=average, zero_division=0
    )
    recall = metrics.recall_score(
        y_test, predictions, average=average, zero_division=0
    )
    f1 = metrics.f1_score(
        y_test, predictions, average=average, zero_division=0
    )

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.5969010727056019
Hamming loss  0.08943186332936034
Micro-average quality numbers
Precision: 0.8885, Recall: 0.6536, F1-measure: 0.7532
Macro-average quality numbers
Precision: 0.6059, Recall: 0.4318, F1-measure: 0.4959


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Baseline 2: one Bernoulli Naive Bayes model per label (one-vs-rest),
# trained on the stacked title + abstract TF-IDF features.
nb_classifier = OneVsRestClassifier(BernoulliNB())
nb_classifier.fit(X_train_final, y_train)
predictions = nb_classifier.predict(X_test_final)

print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Report precision / recall / F1 under both averaging schemes.
# zero_division=0 makes the handling of undefined scores explicit: the
# recorded run of this cell emitted scikit-learn's undefined-metric warning,
# in which case the library already scores the affected label as 0. Passing
# the value explicitly keeps the numbers the same and silences the warning.
for average, heading in (
    ("micro", "Micro-average quality numbers"),
    ("macro", "Macro-average quality numbers"),
):
    precision = metrics.precision_score(
        y_test, predictions, average=average, zero_division=0
    )
    recall = metrics.recall_score(
        y_test, predictions, average=average, zero_division=0
    )
    f1 = metrics.f1_score(
        y_test, predictions, average=average, zero_division=0
    )

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6123957091775923
Hamming loss  0.08537941994437823
Micro-average quality numbers
Precision: 0.7838, Recall: 0.8161, F1-measure: 0.7996
Macro-average quality numbers
Precision: 0.5261, Recall: 0.5603, F1-measure: 0.5405


  _warn_prf(average, modifier, msg_start, len(result))


In [21]:
# One-vs-rest logistic regression: one binary model per label, with balanced
# class weights and up to 500 solver iterations.
base_lr = LogisticRegression(class_weight='balanced', max_iter=500)
lr_classifier = OneVsRestClassifier(base_lr)
lr_classifier.fit(X_train_final, y_train)
predictions = lr_classifier.predict(X_test_final)

# Overall multilabel scores.
print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Precision / recall / F1, first micro-averaged and then macro-averaged.
for average, heading in (
    ("micro", "Micro-average quality numbers"),
    ("macro", "Macro-average quality numbers"),
):
    precision = metrics.precision_score(y_test, predictions, average=average)
    recall = metrics.recall_score(y_test, predictions, average=average)
    f1 = metrics.f1_score(y_test, predictions, average=average)

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.634326579261025
Hamming loss  0.07926102502979737
Micro-average quality numbers
Precision: 0.7787, Recall: 0.8666, F1-measure: 0.8203
Macro-average quality numbers
Precision: 0.7025, Recall: 0.8027, F1-measure: 0.7480


In [23]:
# One-vs-rest linear SVM: one binary model per label, with balanced class
# weights. random_state is pinned because LinearSVC's solver uses a random
# number generator; without a fixed seed, refitting can give slightly
# different coefficients, and this is the model that gets saved to disk.
svm_classifier = OneVsRestClassifier(
    LinearSVC(class_weight='balanced', random_state=42)
)
svm_classifier.fit(X_train_final, y_train)
predictions = svm_classifier.predict(X_test_final)

# Overall multilabel scores.
print("Accuracy :", metrics.accuracy_score(y_test, predictions))
print("Hamming loss ", metrics.hamming_loss(y_test, predictions))

# Precision / recall / F1, first micro-averaged and then macro-averaged.
for average, heading in (
    ("micro", "Micro-average quality numbers"),
    ("macro", "Macro-average quality numbers"),
):
    precision = metrics.precision_score(y_test, predictions, average=average)
    recall = metrics.recall_score(y_test, predictions, average=average)
    f1 = metrics.f1_score(y_test, predictions, average=average)

    print(heading)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Accuracy : 0.6474374255065555
Hamming loss  0.07767183154549066
Micro-average quality numbers
Precision: 0.8091, Recall: 0.8219, F1-measure: 0.8154
Macro-average quality numbers
Precision: 0.7674, Recall: 0.7224, F1-measure: 0.7403


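## Best model: Linear SVM (highest accuracy and lowest Hamming loss; logistic regression is marginally ahead on micro- and macro-averaged F1)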

In [24]:
# Persist the fitted one-vs-rest Linear SVM so it can be reloaded later.
# NOTE(review): only the classifier is written in this cell; the fitted
# TF-IDF vectorizers are needed to featurize new text at inference time --
# confirm they are saved elsewhere.
with open('../models/tf-idf_svm.pkl', 'wb') as model_file:
    pickle.dump(svm_classifier, model_file)