# Rebalancing evaluation

In this notebook have been considered three different techniques to approach the imbalanced dataset:
- Oversampling: it consists in duplicating samples in the minority classes by picking them with replacement randomly
- Undersampling : it undersample the majority classes by randomly picking samples without replacement
- SMOTE: it generates synthetic examples from the existing samples of the minority classes

The executions take into account the six algorithms (LinearSVC, MultinomialNB, LogisticRegression, RandomForestClassifier, KNeighborsClassifier, BernulliNB) considered with the best tuned parameters performed in the notebook 'Classifier evaluation' and a step of rebalancing was put into the pipeline. 
The comparison is done comparing not only the results of the rebalancing methods but also with those occured without rebalancing; therefore these results will be reported at the beginning of each model paragraph.

At the end of this notebook is shown a summary table with all the executions' outcomes.

In [42]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

import nltk
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer


#Load pre-processed dataset
data = pd.read_csv("movies/data.csv", encoding="ISO-8859-1")
data = data.dropna(subset = ["overview"])
data = data.reset_index(drop=True)

encoder = LabelEncoder()
data['genre_ids'] = encoder.fit_transform(data.genre_ids)

X = data['overview']
y = data['genre_ids']

class LemmaTokenizer:
     def __init__(self):
            self.stm = SnowballStemmer("english")
     def __call__(self, doc):
        return [self.stm.stem(t) for t in word_tokenize(doc)]

oversampling=[]
sm=[]
undersampling=[]
no_balance=[]

In [43]:
import numpy as np
from sklearn.model_selection import StratifiedKFold
from imblearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2, f_classif
from sklearn.model_selection import GridSearchCV

n_folds = 10
skf = StratifiedKFold(n_folds, shuffle=True)

scoring = ['accuracy', 'precision_micro', 'recall_micro', 'f1_micro']

# Vectorization parameters
# N-gram sizes for tokenizing text.
NGRAM_RANGE = [(1,2)]
# Minimum document frequency below which a token will be discarded.
MIN_DOCUMENT_FREQUENCY =  [2]

## LinearSVC
Best estimator selected is that with 20000 features and the chi2 score function with a best score of 0.5951419013089634.

In [44]:
from sklearn.svm import LinearSVC
   
grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()],
    'selectkbest__k': [20000],
    'selectkbest__score_func': [chi2]  
    }
no_balance.append(0.5951419013089634)

In [45]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        SelectKBest(),
                        LinearSVC())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
svc_ros_model = grid_search.fit(X=X, y=y)

svc_ros_model.best_score_

0.5707534492951828

In [46]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        SelectKBest(),
                        LinearSVC())

grid_search = GridSearchCV(smote_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
svc_smote_model = grid_search.fit(X=X, y=y)


svc_smote_model.best_score_ 

0.5898055252829597

In [47]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        SelectKBest(),
                        LinearSVC())

grid_search = GridSearchCV(rus_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
svc_rus_model = grid_search.fit(X=X, y=y)

svc_rus_model.best_score_ 

0.5213673063088089

In [48]:
oversampling.append(svc_ros_model.best_score_)
sm.append(svc_smote_model.best_score_)
undersampling.append(svc_rus_model.best_score_)

## MultinomialNB
Best estimator selected is that with 5000 features and the f_classif (default) score function with a best score of 0.4806064431955515.

In [49]:
from sklearn.naive_bayes import MultinomialNB

grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()],
    'selectkbest__k': [5000],
    'selectkbest__score_func': [f_classif]  
    }

no_balance.append(0.4806064431955515)

In [50]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        SelectKBest(),
                        MultinomialNB())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
mnb_ros_model = grid_search.fit(X=X, y=y)


mnb_ros_model.best_score_

0.5596187611741493

In [51]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        SelectKBest(),
                        MultinomialNB())

grid_search = GridSearchCV(smote_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
mnb_smote_model = grid_search.fit(X=X, y=y)

mnb_smote_model.best_score_

0.4963942311116809

In [52]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        VarianceThreshold(),
                        SelectKBest(),
                        MultinomialNB())

grid_search = GridSearchCV(rus_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
mnb_rus_model = grid_search.fit(X=X, y=y)

mnb_rus_model.best_score_

0.5277507164514343

In [53]:
oversampling.append(mnb_ros_model.best_score_)
sm.append(mnb_smote_model.best_score_)
undersampling.append(mnb_rus_model.best_score_)

## LogisticRegression 
Best estimator selected is that with all the features, the f_classif (default) score function and the default solver that is 'lbfgs' with a best score of 0.5924376562600167.

In [54]:
from sklearn.linear_model import LogisticRegression

grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()], 
    }

no_balance.append(0.5924376562600167) 

In [55]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        LogisticRegression(max_iter=1000, multi_class='ovr'))

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
lr_ros_model = grid_search.fit(X=X, y=y)

lr_ros_model.best_score_

0.5950441861182935

In [56]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        LogisticRegression(max_iter=1000, multi_class='ovr'))

grid_search = GridSearchCV(smote_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
lr_smote_model = grid_search.fit(X=X, y=y)

lr_smote_model.best_score_

0.5924616309687468

In [57]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        LogisticRegression(max_iter=1000, multi_class='ovr'))

grid_search = GridSearchCV(rus_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
lr_rus_model = grid_search.fit(X=X, y=y)

lr_rus_model.best_score_ 

0.542880324712173

In [58]:
oversampling.append(lr_ros_model.best_score_)
sm.append(lr_smote_model.best_score_)
undersampling.append(lr_ros_model.best_score_)

## RandomForest
Best estimator selected is that with 500 features, chi2 score function and a max tree depth of 8 with a best score of 0.3825407433524941.

In [59]:
from sklearn.ensemble import RandomForestClassifier

grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()],
    'selectkbest__k': [500],
    'selectkbest__score_func': [chi2],
    'randomforestclassifier__max_depth': [8],
    'randomforestclassifier__n_estimators': [100]
    }

no_balance.append(0.3825407433524941)

In [60]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        SelectKBest(),
                        RandomForestClassifier())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
rf_ros_model = grid_search.fit(X=X, y=y)

rf_ros_model.best_score_

0.40605178275910375

In [61]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        SelectKBest(),
                        RandomForestClassifier())

grid_search = GridSearchCV(smote_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
rf_smote_model = grid_search.fit(X=X, y=y)

rf_smote_model.best_score_

0.37357461185783514

In [62]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        SelectKBest(),
                        RandomForestClassifier())

grid_search = GridSearchCV(rus_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
rf_rus_model = grid_search.fit(X=X, y=y)

rf_rus_model.best_score_

0.4140928430834835

In [63]:
oversampling.append(rf_ros_model.best_score_)
sm.append(rf_smote_model.best_score_)
undersampling.append(rf_rus_model.best_score_)

## KNeighborsClassifier
Best estimator selected is that with all the features and the default f_classif score function with a best score of 0.44734848269001104.

In [64]:
from sklearn.neighbors import KNeighborsClassifier

grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()]
}

no_balance.append(0.44734848269001104)

In [65]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        KNeighborsClassifier())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
knc_ros_model = grid_search.fit(X=X, y=y)

knc_ros_model.best_score_

0.34762729923333163

In [66]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        KNeighborsClassifier())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
knc_smote_model = grid_search.fit(X=X, y=y)

knc_smote_model.best_score_

0.34726082274979636

In [67]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        KNeighborsClassifier())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
knc_rus_model = grid_search.fit(X=X, y=y)

knc_rus_model.best_score_

0.34728632295992384

In [68]:
oversampling.append(knc_ros_model.best_score_)
sm.append(knc_smote_model.best_score_)
undersampling.append(knc_rus_model.best_score_)

## BernoulliNB Classifier
Best estimator selected is that with 10000 features and the f_classif(default) score function with a best score of 0.5823505424136874.

In [69]:
from sklearn.naive_bayes import BernoulliNB

grid = {
    'tfidfvectorizer__min_df': MIN_DOCUMENT_FREQUENCY,
    'tfidfvectorizer__ngram_range': NGRAM_RANGE,
    'tfidfvectorizer__tokenizer': [LemmaTokenizer()],
    'selectkbest__k': [10000]
    }

no_balance.append(0.5823505424136874)

In [70]:
ros_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomOverSampler(random_state=1),
                        SelectKBest(),
                        BernoulliNB())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
bnc_ros_model = grid_search.fit(X=X, y=y)

bnc_ros_model.best_score_

0.5794269397610958

In [71]:
smote_pipe = make_pipeline(TfidfVectorizer(), 
                        SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=1),
                        SelectKBest(),
                        BernoulliNB())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
bnc_smote_model = grid_search.fit(X=X, y=y)

bnc_smote_model.best_score_

0.577697418875379

In [72]:
rus_pipe = make_pipeline(TfidfVectorizer(), 
                        RandomUnderSampler(random_state=1),
                        SelectKBest(),
                        BernoulliNB())

grid_search = GridSearchCV(ros_pipe, param_grid=grid, scoring=scoring, refit="f1_micro", n_jobs=-1, cv=skf)
bnc_rus_model = grid_search.fit(X=X, y=y)

bnc_rus_model.best_score_

0.5798900036327115

In [73]:
oversampling.append(bnc_ros_model.best_score_)
sm.append(bnc_smote_model.best_score_)
undersampling.append(bnc_rus_model.best_score_)

In [74]:
methods=['Oversampling','Smote', 'Undersampling', 'No balance']
models=['LinearSVC',  'MultinomialNB', 'LogisticRegression', 'RandomForest', 'KNeighbors', 'BernoulliNB']
d = {'Oversampling': oversampling, 'Smote': sm, 'Undersampling': undersampling, 'No balance': no_balance}
df = pd.DataFrame(d, columns=methods, index=models)

df

Unnamed: 0,Oversampling,Smote,Undersampling,No balance
LinearSVC,0.570753,0.589806,0.521367,0.595142
MultinomialNB,0.559619,0.496394,0.527751,0.480606
LogisticRegression,0.595044,0.592462,0.595044,0.592438
RandomForest,0.406052,0.373575,0.414093,0.382541
KNeighbors,0.347627,0.347261,0.347286,0.447348
BernoulliNB,0.579427,0.577697,0.57989,0.582351
