In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import LabelPowerset

# Load Training and Test sets

In [2]:
# Read the pickled train/test inputs; these are later passed to
# text_clf.fit(...) and text_clf.predict(...) respectively.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
X_train, X_test = (
    pd.read_pickle(pkl_path)
    for pkl_path in (
        "../../pickled_files/X_train.pkl",
        "../../pickled_files/X_test.pkl",
    )
)

In [3]:
def _load_pickle(path):
    """Return the object unpickled from ``path``.

    The file is opened in binary mode inside a ``with`` block so the handle
    is closed as soon as loading finishes (or fails), instead of being left
    open for the lifetime of the kernel.

    NOTE(review): ``pickle.load`` can execute arbitrary code; only use this
    on files from a trusted source.
    """
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)


# mlb is loaded alongside the targets but is not referenced by the cells shown
# here; presumably it is the fitted label binarizer -- TODO confirm.
mlb = _load_pickle("../../pickled_files/mlb.pkl")
# Target matrices for the train and test splits.
y_train = _load_pickle("../../pickled_files/y_train.pkl")
y_test = _load_pickle("../../pickled_files/y_test.pkl")

# Train the model

In [4]:
# Fixed seed for the random forest so that Restart & Run All reproduces the
# same model and therefore the same evaluation numbers. Without it, every run
# trains a different forest and the reported metrics drift.
RANDOM_STATE = 42

# Text classification pipeline:
#   vect  - token counts over unigrams, English stop words removed
#   tfidf - term-frequency transform with IDF weighting disabled
#   clf   - random forest (13 trees) wrapped in scikit-multilearn's
#           LabelPowerset problem transformation for multi-label targets
text_clf = Pipeline([
    ("vect", CountVectorizer(stop_words="english", ngram_range=(1, 1))),
    ("tfidf", TfidfTransformer(use_idf=False)),
    ("clf", LabelPowerset(
        RandomForestClassifier(n_estimators=13, random_state=RANDOM_STATE)
    )),
])

# Fit the model (fit() trains the pipeline in place; no rebinding needed)
text_clf.fit(X_train, y_train)

# Make prediction
predicted = text_clf.predict(X_test)

# Evaluate the model

In [5]:
# Per-label precision / recall / F1 plus the averaged rows, as plain text.
report_text = classification_report(y_test, predicted)
print(report_text)

              precision    recall  f1-score   support

           0       0.88      0.30      0.44       121
           1       0.64      0.85      0.73       289
           2       0.85      0.67      0.75       301
           3       0.82      0.40      0.54        80
           4       0.81      0.67      0.73       145

   micro avg       0.75      0.66      0.70       936
   macro avg       0.80      0.58      0.64       936
weighted avg       0.78      0.66      0.68       936
 samples avg       0.73      0.68      0.69       936



In [6]:
# Overall accuracy as computed by scikit-learn.
# NOTE(review): for multi-label targets accuracy_score is subset accuracy
# (a sample counts only if every label matches) -- confirm y_test is multi-label.
overall_accuracy = accuracy_score(y_test, predicted)
print(overall_accuracy)

0.5978975032851511


In [7]:
# Mean of the comparison between predictions and targets; presumably the
# fraction of individual label entries predicted correctly -- TODO confirm.
label_matches = predicted == y_test
np.mean(label_matches)

0.8604467805519054