In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset

# Load training and test sets

In [2]:
# Read the pickled feature splits, train first and then test.
# Both objects are passed straight into the CountVectorizer-based pipeline
# further down, so they are presumably collections of raw text documents --
# TODO confirm against the step that wrote these files.
X_train, X_test = (
    pd.read_pickle(split_path)
    for split_path in (
        "../../pickled_files/X_train.pkl",
        "../../pickled_files/X_test.pkl",
    )
)

In [3]:
# Load `mlb` (presumably the fitted multi-label binarizer -- TODO confirm) and
# the label targets for the train and test splits.
# Each file is opened in a `with` block so its handle is closed as soon as the
# object has been unpickled, instead of lingering until garbage collection.
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they were produced by a trusted step of this project.
with open("../../pickled_files/mlb.pkl", 'rb') as pkl_file:
    mlb = pickle.load(pkl_file)
with open("../../pickled_files/y_train.pkl", 'rb') as pkl_file:
    y_train = pickle.load(pkl_file)
with open("../../pickled_files/y_test.pkl", 'rb') as pkl_file:
    y_test = pickle.load(pkl_file)

# Train the model

In [4]:
# SGD training is stochastic (the training data is shuffled between epochs by
# default), so pin the seed: without it the fitted model and every metric
# computed from `predicted_svm` change from run to run.
RANDOM_STATE = 42

# Token counts -> TF-IDF weights -> linear SVM (hinge loss, L2 penalty) trained
# with SGD. LabelPowerset wraps the classifier so the multi-label target is
# learned by a single multi-class estimator.
# NOTE(review): max_iter=6 is a very small epoch budget and may stop before
# convergence -- confirm it is intentional before relying on these scores.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', LabelPowerset(
        SGDClassifier(loss='hinge', penalty='l2',
                      alpha=1e-4, max_iter=6,
                      random_state=RANDOM_STATE))),
])

# Fit the model; binding the return value to `_` stops the notebook from
# echoing the fitted pipeline's repr as cell output.
_ = text_clf_svm.fit(X_train, y_train)

# Make prediction
predicted_svm = text_clf_svm.predict(X_test)



# Evaluate the model

In [5]:
# Per-label precision / recall / F1 / support on the test split, followed by
# the averaged summary rows; printed so the text table keeps its alignment.
svm_report = classification_report(y_test, predicted_svm)
print(svm_report)

              precision    recall  f1-score   support

           0       0.63      0.57      0.60       121
           1       0.67      0.81      0.73       289
           2       0.84      0.71      0.77       301
           3       0.89      0.41      0.56        80
           4       0.67      0.77      0.71       145

   micro avg       0.72      0.70      0.71       936
   macro avg       0.74      0.65      0.68       936
weighted avg       0.74      0.70      0.71       936
 samples avg       0.74      0.73      0.72       936



In [8]:
# For multi-label targets accuracy_score is the subset (exact-match) accuracy:
# a sample only counts as correct when its whole label set is predicted right.
subset_accuracy = accuracy_score(y_test, predicted_svm)
print(subset_accuracy)

0.5847568988173456


In [6]:
# Element-wise agreement between predictions and ground truth: each
# (sample, label) entry is compared on its own and the matches are averaged,
# so this is more lenient than the exact-match accuracy_score.
# NOTE(review): relies on `predicted_svm == y_test` broadcasting element-wise
# for whatever container types these two objects are -- TODO confirm.
label_matches = predicted_svm == y_test
np.mean(label_matches)

0.8607095926412615