# 1. References

Title: Multi-Label Text Classification Using Scikit-multilearn: a Case Study with StackOverflow Questions

Link: https://medium.com/towards-artificial-intelligence/multi-label-text-classification-using-scikit-multilearn-case-study-with-stackoverflow-questions-768cb487ad12

In [1]:
import pandas as pd

In [2]:
DATA_DIR = "../../data/raw/"
# INPUT_FILE_NAME = 'cleaned.parquet'
INPUT_FILE_NAME = 'cleaned_squashed.parquet'


In [3]:
df = pd.read_parquet(DATA_DIR + INPUT_FILE_NAME)
# df = df[:200]  # same as df.head(10)
df.head()

Unnamed: 0,speaker,headline,description,duration,tags,transcript,WC,clean_transcript,clean_transcript_string,squash_tags
0,Al Gore,Averting the climate crisis,With the same humor and humanity he exuded in ...,0:16:17,"cars,alternative energy,culture,politics,scien...","0:14\r\r\rThank you so much, Chris.\rAnd it's ...",2281.0,"[thank, chris, truly, great, honor, opportunit...",thank chris truly great honor opportunity come...,"alternative energy,culture,politics,science,cl..."
1,Amy Smith,Simple designs to save a life,Fumes from indoor cooking fires kill more than...,0:15:06,"MacArthur grant,simplicity,industrial design,a...","0:11\r\r\rIn terms of invention,\rI'd like to ...",2687.0,"[term, invention, like, tell, tale, favorite, ...",term invention like tell tale favorite project...,"industrial design,alternative energy,invention..."
2,Ashraf Ghani,How to rebuild a broken state,Ashraf Ghani's passionate and powerful 10-minu...,0:18:45,"corruption,poverty,economics,investment,milita...","0:12\r\r\rA public, Dewey long ago observed,\r...",2506.0,"[public, dewey, long, ago, observe, constitute...",public dewey long ago observe constitute discu...,"poverty,economics,investment,culture,politics,..."
3,Burt Rutan,The real future of space exploration,"In this passionate talk, legendary spacecraft ...",0:19:37,"aircraft,flight,industrial design,NASA,rocket ...","0:11\r\r\rI want to start off by saying, Houst...",3092.0,"[want, start, say, houston, problem, enter, se...",want start say houston problem enter second ge...,"industrial design,invention,engineering,entrep..."
4,Chris Bangle,Great cars are great art,American designer Chris Bangle explains his ph...,0:20:04,"cars,industrial design,transportation,inventio...","0:12\r\r\rWhat I want to talk about is, as bac...",3781.0,"[want, talk, background, idea, car, art, actua...",want talk background idea car art actually mea...,"industrial design,transportation,invention,des..."


In [4]:
df_x = df[['headline', 'clean_transcript_string']]
df_y = df[['squash_tags']]

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

y = []
for index, row in df_y.iterrows():
    y.append(set(row['squash_tags'].split(',')))
    
mlb = MultiLabelBinarizer()
encoded_y = mlb.fit_transform(y)

In [6]:
print(encoded_y[0])
print(len(encoded_y[0]))
#print(mlb.inverse_transform(encoded_y))

[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
179


In [7]:
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from sklearn.base import BaseEstimator
from sklearn import utils as skl_utils
from tqdm import tqdm

import multiprocessing
import numpy as np

from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

class Doc2VecTransformer(BaseEstimator):

    def __init__(self, vector_size=100, learning_rate=0.02, epochs=20, field=None):
        self.learning_rate = learning_rate
        self.epochs = epochs
        self._model = None
        self.vector_size = vector_size
        self.workers = multiprocessing.cpu_count() - 1
        self.field = field

    def fit(self, df_x, df_y=None):
        tagged_x = [TaggedDocument(clean_text(row[str(self.field)]).split(), [index]) for index, row in df_x.iterrows()]
        model = Doc2Vec(documents=tagged_x, vector_size=self.vector_size, workers=self.workers)

        for epoch in range(self.epochs):
            model.train(skl_utils.shuffle([x for x in tqdm(tagged_x)]), total_examples=len(tagged_x), epochs=1)
            model.alpha -= self.learning_rate
            model.min_alpha = model.alpha

        self._model = model
        return self

    def transform(self, df_x):
        return np.asmatrix(np.array([self._model.infer_vector(clean_text(row[str(self.field)]).split())
                                     for index, row in df_x.iterrows()]))

In [8]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(df_x, encoded_y)

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score


In [None]:
vectorizer = TfidfVectorizer()
train_x = train_x['clean_transcript_string']
test_x = test_x['clean_transcript_string']

#train_x = vectorizer.fit_transform(train_x['clean_transcript_string'])
#test_x = vectorizer.transform(test_x['clean_transcript_string'])

In [None]:
vectorizer = CountVectorizer()

train_x = vectorizer.fit_transform(train_x['clean_transcript_string'])
test_x = vectorizer.transform(test_x['clean_transcript_string'])

In [10]:
fu = FeatureUnion(transformer_list=[('title_doc2vec',Doc2VecTransformer(field='headline')),
                                    ('body_doc2vec',Doc2VecTransformer(field='clean_transcript_string'))])
binary_rel_model = BinaryRelevance(RandomForestClassifier(n_jobs=-1, n_estimators=10))

multi_label_rf_br_model = Pipeline(steps=[
                           ('feature_union', fu),
                           ('binary_relevance', binary_rel_model)
                        ])

In [11]:
import sklearn.metrics as metrics

def hamming_loss(multi_label_model_pipeline,train_x, train_y, test_x, test_y):
    predictions_test_y = multi_label_model_pipeline.predict(test_x)
    return metrics.hamming_loss(y_true=test_y, y_pred=predictions_test_y)

In [None]:
multi_label_rf_br_model.fit(train_x, train_y)
print('Hamming loss for test data :', hamming_loss(multi_label_rf_br_model,train_x,train_y,test_x,test_y))

100%|██████████| 1789/1789 [00:00<00:00, 2273132.34it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2106572.11it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2928809.47it/s]
100%|██████████| 1789/1789 [00:00<00:00, 3546129.42it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2239883.54it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2045137.60it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2846589.47it/s]
100%|██████████| 1789/1789 [00:00<00:00, 3496556.32it/s]
100%|██████████| 1789/1789 [00:00<00:00, 1622402.13it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2999044.71it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2368563.72it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2121461.65it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2189556.42it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2150031.48it/s]
100%|██████████| 1789/1789 [00:00<00:00, 3355818.36it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2496210.86it/s]
100%|██████████| 1789/1789 [00:00<00:00, 2920829.06it/s]
100%|██████████| 1789/1789 [00:

In [None]:
predictions_test_y = multi_label_rf_br_model.predict(test_x)
print(predictions_test_y.toarray())

In [None]:
# print(predictions_test_y.toarray()[-3])
# print(encoded_y[154])
# print(mlb.inverse_transform(predictions_test_y.toarray()))
print(test_y[27])
'''
103  want year opportunity close conference incredi...  
31   think podium bite scar chris ask tell structur...  
41   music music end end hi sirena year old connect...  
93   thank get story arrive plane long journey west...  
152  go try view world problem opportunity face ask...  
2    public dewey long ago observe constitute discu...  
154                                              music  
124  consider storyteller tell story usual way sens...  
94 
'''

In [None]:
index = -1
print(test_y[index])
print(mlb.inverse_transform(test_y)[index])
mlb.inverse_transform(predictions_test_y)[index]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import f1_score

In [None]:
X = df['clean_transcript_string']

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, encoded_y)

In [None]:
# Set grid search params
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

cv_grid_params = [] # Not implemened yet

tfidf_grid_params = [] # Not implemented yet

lr_grid_params = [{'clf__penalty': ['l1', 'l2'],
                   'clf__C': param_range_fl,
                   'clf__solver': ['liblinear']
                  }] 

rf_grid_params = [{'clf__criterion': ['gini', 'entropy'],
                   'clf__min_samples_leaf': param_range,
                   'clf__max_depth': param_range,
                   'clf__min_samples_split': param_range[1:],
                   'clf__n_estimators': [10]
                  }]

svm_grid_params = [{'clf__kernel': ['linear', 'rbf'], 
                    'clf__C': param_range,
                    'clf__gamma': ['auto'],
                    'probability': [True]
                   }]

scoring = 'accuracy'
njobs = -1

# Instantiate vectorizer and desired models

# OVR
    ## logistic regression
cv_lr_ovr = Pipeline([('vectorizer', CountVectorizer()),
                      ('clf', OneVsRestClassifier(LogisticRegression()))
                     ]
                    )
gs_cv_lr_ovr = GridSearchCV(estimator=cv_lr_ovr,
                            param_grid=lr_grid_params,
                            scoring=scoring,
                            cv=10) 


tfidf_lr_ovr = Pipeline([('vectorizer', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(LogisticRegression()))
                       ]
                      )
gs_tfidf_lr_ovr = GridSearchCV(estimator=tfidf_lr_ovr,
                               param_grid=lr_grid_params,
                               scoring=scoring,
                               cv=10) 


    ## random forest
cv_rf_ovr = Pipeline([('vectorizer', CountVectorizer()),
                      ('clf', OneVsRestClassifier(RandomForestClassifier()))
                     ]
                    )
gs_cv_rf_ovr = GridSearchCV(estimator=cv_rf_ovr,
                            param_grid=rf_grid_params,
                            scoring=scoring,
                            cv=10, 
                            n_jobs=njobs)


tfidf_rf_ovr = Pipeline([('vectorizer', TfidfVectorizer()),
                        ('clf', OneVsRestClassifier(RandomForestClassifier()))
                       ]
                      )
gs_tfidf_rf_ovr = GridSearchCV(estimator=tfidf_rf_ovr,
                               param_grid=rf_grid_params,
                               scoring=scoring,
                               cv=10, 
                               n_jobs=njobs)


    ## support vector classifier
cv_svm_ovr = Pipeline([('vectorizer', CountVectorizer()),
                       ('clf', OneVsRestClassifier(SVC()))
                      ]
                     )
gs_cv_svm_ovr = GridSearchCV(estimator=cv_svm_ovr,
                             param_grid=svm_grid_params,
                             scoring=scoring,
                             cv=10,
                             n_jobs=njobs)


tfidf_svm_ovr = Pipeline([('vectorizer', TfidfVectorizer()),
                          ('clf', OneVsRestClassifier(SVC()))
                         ]
                        )
gs_tfidf_svm_ovr = GridSearchCV(estimator=tfidf_svm_ovr,
                                param_grid=svm_grid_params,
                                scoring=scoring,
                                cv=10,
                                n_jobs=njobs)


In [None]:
pipelines = [cv_lr_ovr, tfidf_lr_ovr, 
             cv_rf_ovr, tfidf_rf_ovr, 
             cv_svm_ovr, tfidf_svm_ovr]

grids = [gs_cv_lr_ovr, gs_tfidf_lr_ovr,
         gs_cv_rf_ovr, gs_tfidf_rf_ovr,
         gs_cv_svm_ovr, gs_tfidf_svm_ovr]


In [None]:
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [None]:
for idx, clf in enumerate(pipelines):
    y_pred_prob = clf.predict_proba(X_test)
    t = 0.1 # threshold value
    y_pred_new = (y_pred_prob >= t).astype(int)
    scoring = f1_score(y_test, y_pred_new, average="micro")
    print('%s pipeline test accuracy: %.3f' % (idx, scoring))

In [None]:
mlb.inverse_transform(y_pred_new)[3]

In [None]:
best_acc = 0.0
best_clf = 0
best_pipe = ''
for idx, clf in enumerate(pipelines):
    if clf.score(X_test, y_test) > best_acc:
        best_acc = clf.score(X_test, y_test)
        best_pipe = clf
print(f'Classifier with best accuracy: {best_pipe.named_steps} \n with accuracy of {best_acc}')
# joblib.dump(best_pipe, 'best_classifier.pkl', compress=1)

In [None]:
print('Performing model optimizations...')
best_acc = 0.0
best_clf = 0
best_gs = ''
for idx, gs in enumerate(grids):
    print('\nEstimator: %s' % grid_dict[idx])
    # Fit grid search
    gs.fit(X_train, y_train)
    # Best params
    print('Best params: %s' % gs.best_params_)
    # Best training data accuracy
    print('Best training accuracy: %.3f' % gs.best_score_)
    # Predict on test data with best params
    y_pred = gs.predict(X_test)
    # Test data accuracy of model with best params
    print('Test set accuracy score for best params: %.3f ' % accuracy_score(y_test, y_pred))
    # Track best (highest test accuracy) model
    if accuracy_score(y_test, y_pred) > best_acc:
        best_acc = accuracy_score(y_test, y_pred)
        best_gs = gs
        best_clf = idx
print('\nClassifier with best test set accuracy: %s' % grid_dict[best_clf])

# Save best grid search pipeline to file
# dump_file = 'best_classifer_params.pkl'
# joblib.dump(best_gs, dump_file, compress=1)
# print('\nSaved %s grid search pipeline to file: %s' % (grid_dict[best_clf], dump_file))