## Loading libraries and data

In [1]:
from time import time
import joblib
import gc

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix, \
                            f1_score, accuracy_score, recall_score, \
                            precision_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [2]:
def CreateFullReport(model_name, model_obj, y_true, y_predict, time_train, time_predict, n_features):

    report = classification_report(y_true, y_predict)
    cm_matrix = pd.DataFrame(confusion_matrix(y_true, y_predict, labels=[0,1,2]))
        
    f1_result = f1_score(y_true, y_predict, labels=[0,1,2], average="macro")
    accuracy_result = accuracy_score(y_true, y_predict)
    recall_result = recall_score(y_true, y_predict, labels=[0,1,2], average="macro")
    precision_result = precision_score(y_true, y_predict, labels=[0,1,2], average="macro")
  
    # Saving the values of the experiment
    current_report = ({"chi2": n_features},
                      {model_name: model_obj},
                      {"scores":
                       {
                           "f1_score": f1_result,
                           "accuracy": accuracy_result,
                           "recall": recall_result,
                           "precision": precision_result,
                           "cm": cm_matrix,
                           "report": report
                       }
                      },
                      {"time":
                       {
                           "training": time_train,
                           "prediction":time_predict
                       }
                      })
  
    return current_report

In [3]:
Xy_data = pd.read_feather("data/labeled_data_clean.feather")
Xy_data["comment"] = Xy_data["comment"].transform(list)

Xy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704019 entries, 0 to 704018
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  704019 non-null  object
 1   class    704019 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 6.0+ MB


In [4]:
Xy_data["class"].value_counts()

1    313661
0    199718
2    190640
Name: class, dtype: int64

In [5]:
print("Shufling data 5 time")
for i in range(5):
    Xy_data = Xy_data.sample(Xy_data.shape[0], replace=False)

Shufling data 5 time


## Experimental grid definition

### Models parameters

In [6]:
# SVM doesn't work for 500 and 250
select_k_features = [2000, 1000, 500, 250]

model_parameters = {
    
    "random_forest": {},
    
    "svm_linear": {
        "kernel":"linear",
        "class_weight":"balanced"
    },
    
    "multinomial": {}
   
}

bagging_models = {
    
    "random_forest": None,
    "svm_linear": SVC,
    "multinomial": MultinomialNB
    
}

bagging_parameters = {

    "random_forest":{
        "n_jobs": 6,
        "verbose":1
    },
    
    "svm_linear": {
        "max_samples": 1.0 / 10,
        "n_estimators": 10,
        "verbose": 2,
        "n_jobs": 6
    },
    
    "multinomial":{
        "n_jobs": 6,
        "verbose": 1
    }

}

bagging_definition = {
    
    "random_forest": RandomForestClassifier,
    "svm_linear": BaggingClassifier,
    "multinomial": BaggingClassifier

}

boosting_parameters = {
    
    "xgb":{

        "tree_method": "hist",
        "objective": "multi:softprob",
        "n_estimators": 1000,
        "verbosity": 1,
        "n_jobs": 5

    }

}

boosting_options = {
    
    "xgb":xgb.XGBClassifier

}

### Train/test split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(Xy_data["comment"],
                                                    Xy_data["class"],
                                                    train_size = 0.7)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (492813,)
y_train shape: (492813,)
X_test shape: (211206,)
y_test shape: (211206,)


### Feature selection and running grid

In [None]:
print("Vectorizing data")
tfidf_model = TfidfVectorizer(min_df = 10)
X_vect = tfidf_model.fit_transform(Xy_data["comment"].apply(" ".join))

X_train = tfidf_model.transform(X_train.apply(" ".join))
X_test  = tfidf_model.transform(X_test.apply(" ".join))
    
# List to track and save the models running
# Initialize with the data used to feed the models
list_tracking = []

for f_s_m in select_k_features:
    
    print(f"Selecting features: {f_s_m}")
    chi2_model = SelectKBest(chi2, k = f_s_m)
    X_vect = chi2_model.fit_transform(X_vect, Xy_data["class"])

    X_train = chi2_model.transform(X_train)
    X_test  = chi2_model.transform(X_test)
    
    for key_bagg, bagg_class in bagging_definition.items():
        
        print(f"Defining model: {key_bagg}")
        
        # Defining if the model is already an ensemble bagging model or not
        # None means it's already an ensemble
        if bagging_models[key_bagg] is None:
            bagg_model_clf = bagg_class(**bagging_parameters[key_bagg])
        else:
            clf_model = bagging_models[key_bagg](**model_parameters[key_bagg])
            bagg_model_clf = bagg_class(clf_model,
                                        **bagging_parameters[key_bagg])
        
        
        # SVM doesn't work for 500 and 250
        if (key_bagg == "svm_linear") and (f_s_m < 501):
            print(f"Skipping model {key_bagg}: {f_s_m}")
            continue
    
        print("Training model")
        time_start_train_model = time()
        bagg_model_clf.fit(X_train, y_train)
        time_train = time() - time_start_train_model
        
        print("Evaluating model")
        time_start_predic_model = time()
        y_hat = bagg_model_clf.predict(X_test)
        time_predic = time() - time_start_predic_model
        
        # Saving the values of this experiment
        current_report = CreateFullReport(key_bagg, bagg_model_clf, 
                                          y_test, y_hat,
                                          time_train, time_predic,
                                          f_s_m)
        
        list_tracking.append(current_report)
        print("Done")
        print("")
        gc.collect()

with open("bagging_trained_models.joblib", "wb") as oFile:
    joblib.dump(list_tracking, oFile)

In [None]:
## grid of xgboost
list_tracking = []
for f_s_m in select_k_features:
    
    print(f"Selecting features: {f_s_m}")
    chi2_model = SelectKBest(chi2, k = f_s_m)
    X_vect = chi2_model.fit_transform(X_vect, Xy_data["class"])

    X_train = chi2_model.transform(X_train)
    X_test  = chi2_model.transform(X_test)

    for key_boost, boost_class in boosting_options.items():

        print(f"Defining model: {key_boost}")
        boost_model_clf = boost_class(**boosting_parameters[key_boost])
      
        print("Training model")
        time_start_train_model = time()
        boost_model_clf.fit(X_train, y_train)
        time_train = time() - time_start_train_model

        print("Evaluating model")
        time_start_predic_model = time()
        y_hat = boost_model_clf.predict(X_test)
        time_predic = time() - time_start_predic_model
      
        current_report = CreateFullReport(key_boost, boost_model_clf,
                                          y_test, y_hat,
                                          time_train, time_predic,
                                          f_s_m)
        
        list_tracking.append(current_report)
        print("Done")
        print("")

        gc.collect()

with open("boosting_trained_models.joblib", "wb") as oFile:
    joblib.dump(list_tracking, oFile)

## Model evaluations

In [9]:
# Loading all models

with open("bagging_trained_models.joblib", "rb") as oBag:
    bagging_models = joblib.load(oBag)

with open("boosting_trained_models.joblib", "rb") as oBoo:
    boosting_models = joblib.load(oBoo)

### Best model (bagging)

In [146]:
scores_models = []
times_models = []

for bagg_mod in bagging_models:
    n_features = bagg_mod[0]["chi2"]
    model_name = list(bagg_mod[1].keys())[0]
    f1 = bagg_mod[2]["scores"]["f1_score"]
    acc  = bagg_mod[2]["scores"]["accuracy"]
    prec = bagg_mod[2]["scores"]["recall"]
    rec  = bagg_mod[2]["scores"]["precision"]


    scores_models.append((n_features, model_name, f1, acc, prec, rec))

    train = bagg_mod[-1]["time"]["training"]
    predi = bagg_mod[-1]["time"]["prediction"]

    times_models.append((train, predi))



metrics_models = pd.DataFrame(scores_models, columns=["n_features","model",
                                                      "f1_score","accuracy",
                                                      "precision", "recall"])

metrics_models.set_index(["n_features","model"], inplace=True)
metrics_models.style.background_gradient(cmap="Blues", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall
n_features,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000,random_forest,0.957991,0.959031,0.956739,0.959338
2000,svm_linear,0.951794,0.953245,0.953902,0.949901
2000,multinomial,0.856173,0.860757,0.839518,0.887167
1000,random_forest,0.956747,0.95779,0.955474,0.958108
1000,svm_linear,0.951309,0.952875,0.953555,0.949318
1000,multinomial,0.86142,0.86525,0.84299,0.896855
500,random_forest,0.955539,0.956673,0.954228,0.956933
500,multinomial,0.862553,0.865913,0.842076,0.903686
250,random_forest,0.953347,0.954447,0.951548,0.955277
250,multinomial,0.859064,0.862187,0.836528,0.906714


In [147]:
times_best_models_by_features = pd.DataFrame(times_models,
                                             columns = ["t_training","t_prediction"]).iloc[[0,3,6,8]]

best_models_by_features = metrics_models.loc[metrics_models.index[[0,3,6,8]]]

times_best_models_by_features.reset_index(drop=True, inplace=True)
best_models_by_features.reset_index(inplace=True)

metrics_models_best = best_models_by_features.merge(times_best_models_by_features,
                                                    left_index=True, right_index=True)

metrics_models_best.set_index(["model", "n_features"], inplace = True)
metrics_models_best.style.background_gradient(cmap="Blues")

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall,t_training,t_prediction
model,n_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
random_forest,2000,0.957991,0.959031,0.956739,0.959338,815.436067,3.529123
random_forest,1000,0.956747,0.95779,0.955474,0.958108,668.573231,3.078766
random_forest,500,0.955539,0.956673,0.954228,0.956933,527.394452,2.629157
random_forest,250,0.953347,0.954447,0.951548,0.955277,378.487558,2.495455


In [148]:
best_bagg_model = bagging_models[-2]
print("\nRandom Forest using 250 features")
print(best_bagg_model[2]["scores"]["report"])


Random Forest using 250 features
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     59621
           1       0.95      0.97      0.96     94437
           2       0.96      0.95      0.95     57148

    accuracy                           0.95    211206
   macro avg       0.96      0.95      0.95    211206
weighted avg       0.95      0.95      0.95    211206



In [167]:
confusion_matrix_best_bagg = best_bagg_model[2]["scores"]["cm"]

multi_index_cols = pd.MultiIndex.from_product([["Confusion matrix: Random Forest using 250 features"],
                                               ["Negative","Other","Positive"]])
confusion_matrix_best_bagg.columns = multi_index_cols
confusion_matrix_best_bagg.index = ["Negative","Other","Positive"]

confusion_matrix_best_bagg.style.background_gradient(cmap="RdYlGn")

Unnamed: 0_level_0,Confusion matrix: Random Forest using 250 features,Confusion matrix: Random Forest using 250 features,Confusion matrix: Random Forest using 250 features
Unnamed: 0_level_1,Negative,Other,Positive
Negative,56095,2532,994
Other,1553,91479,1405
Positive,930,2207,54011


### Best model (boosting)

In [161]:
scores_models = []
times_models = []

for boos_mod in boosting_models:
    n_features = boos_mod[0]["chi2"]
    model_name = list(boos_mod[1].keys())[0]
    f1 = boos_mod[2]["scores"]["f1_score"]
    acc  = boos_mod[2]["scores"]["accuracy"]
    prec = boos_mod[2]["scores"]["recall"]
    rec  = boos_mod[2]["scores"]["precision"]

    scores_models.append((n_features, model_name, f1, acc, prec, rec))

    train = boos_mod[-1]["time"]["training"]
    predi = boos_mod[-1]["time"]["prediction"]

    times_models.append((train, predi))



metrics_models = pd.DataFrame(scores_models, columns=["n_features","model",
                                                  "f1_score","accuracy",
                                                  "precision", "recall"])

times_models = pd.DataFrame(times_models, columns = ["t_training","t_prediction"])

metrics_models = metrics_models.merge(times_models, left_index=True,
                                      right_index=True)

metrics_models.set_index(["n_features","model"], inplace=True)
metrics_models.style.background_gradient(cmap="Blues", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall,t_training,t_prediction
n_features,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000,xgb,0.387249,0.414709,0.47791,0.755109,42.397923,26.77569
1000,xgb,0.636258,0.674479,0.61458,0.826195,26.752123,21.839481
500,xgb,0.774708,0.782411,0.747501,0.852389,19.360382,25.523778
250,xgb,0.86233,0.863456,0.848022,0.8875,13.274708,22.179826


In [166]:
best_boss_model = boosting_models[-1]

print("\nXGB using 250 features")
print(best_boss_model[2]["scores"]["report"])

XGB using 250 features
              precision    recall  f1-score   support

           0       0.92      0.76      0.83     59845
           1       0.80      0.94      0.87     93886
           2       0.94      0.84      0.89     57475

    accuracy                           0.86    211206
   macro avg       0.89      0.85      0.86    211206
weighted avg       0.87      0.86      0.86    211206



In [None]:
best_boss_model = boosting_models[-1]

print("\nXGB using 250 features")
print(best_boss_model[2]["scores"]["report"])

In [174]:
confusion_matrix_best_boss = best_boss_model[2]["scores"]["cm"]

multi_index_cols = pd.MultiIndex.from_product([["Confusion matrix: XGB using 250 features"],
                                               ["Negative","Other","Positive"]])

confusion_matrix_best_boss.index = ["Negative","Other","Positive"]
confusion_matrix_best_boss.columns = multi_index_cols

confusion_matrix_best_boss.style.background_gradient(cmap="RdYlGn")


Unnamed: 0_level_0,Confusion matrix: XGB using 250 features,Confusion matrix: XGB using 250 features,Confusion matrix: XGB using 250 features
Unnamed: 0_level_1,Negative,Other,Positive
Negative,45319,13552,974
Other,3072,88577,2237
Positive,754,8250,48471


## Creating Pipeline and saving

In [132]:
input_data = Xy_data["comment"].apply(" ".join)

# Vectorization section
tfidf_model_fitted = TfidfVectorizer(min_df = 10).fit(input_data)
X_vect = tfidf_model_fitted.transform(input_data)

# Feature selection section
chi2_model_fitted = SelectKBest(chi2, k = 250).fit(X_vect, Xy_data["class"])

# Best Model selection section
best_model = best_bagg_model[1]["random_forest"]

# Saving all the steps in the pipeline to predict

with open("models/01_tfidf_vectorizer_fitted.joblib", "wb") as vFile:
    joblib.dump(best_model, vFile)

with open("models/02_chi2_250_feature_selector_fitted.joblib", "wb") as fsFile:
    joblib.dump(chi2_model_fitted, fsFile)

with open("models/03_random_forest_model_fitted.joblib") as bmFile:
    joblib.dump(best_model, bmFile)