## Loading libraries and data

In [3]:
from time import time
import joblib
import gc

import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, confusion_matrix, \
                            f1_score, accuracy_score, recall_score, \
                            precision_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

import xgboost as xgb

  from pandas import MultiIndex, Int64Index


In [4]:
# Important considerations about the versions used
print("python==3.10.10")
print("sklearn==1.1.3")
print("xgboost==1.5.0")
print("joblib==1.2.0")

python==3.10.10
sklearn==1.1.3
xgboost==1.5.0
joblib==1.2.0


In [None]:
def CreateFullReport(model_name:str, model_obj:object,
                    y_true:list, y_predict:list,
                    time_train:float, time_predict:float, 
                    n_features:int) -> "tuple(dict)":

    '''
    This functions creates a full report about a machine learning model. It includes:
    1. Model name
    2. Model object
    3. Confusion Matrix
    4. Feature selection and number of features selected
    5. Time took to train and predict
    6. F1-score, precision, recall and accuracy

    By default the method used (and reported) to feature selection is chi2

    Parameters
    ----------
    model_name: str
        Name of the model. e.g SVM, Random Forest etc
    
    model_obj: class 
        Object containig the model. e.g sklearn object fitted such as SVC() or MultinomialNB() classifier
    
    y_true: list, array-like
        List or array-like containing the true labels

    y_predict: list, array-like
        List or array-like containing the predicted labels by the model
    
    time_train: float
        Time taken by the model for training

    time_predict: float:
        Time taken by the model when predicting

    n_features: int 
        Number of features selected

    '''

    report = classification_report(y_true, y_predict)
    cm_matrix = pd.DataFrame(confusion_matrix(y_true, y_predict, labels=[0,1,2]))
        
    f1_result = f1_score(y_true, y_predict, labels=[0,1,2], average="macro")
    accuracy_result = accuracy_score(y_true, y_predict)
    recall_result = recall_score(y_true, y_predict, labels=[0,1,2], average="macro")
    precision_result = precision_score(y_true, y_predict, labels=[0,1,2], average="macro")
  
    # Saving the values of the experiment
    current_report = ({"chi2": n_features},
                    {model_name: model_obj},
                    {"scores":
                     {
                        "f1_score": f1_result,
                        "accuracy": accuracy_result,
                        "recall": recall_result,
                        "precision": precision_result,
                        "cm": cm_matrix,
                        "report": report
                     }
                    },
                    {"time":
                     {
                        "training": time_train,
                        "prediction":time_predict
                     }
                    })

    return current_report

In [15]:
Xy_data = pd.read_feather("data/labeled_data_clean.feather")
Xy_data["comment"] = Xy_data["comment"].transform(list)

Xy_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 704019 entries, 0 to 704018
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   comment  704019 non-null  object
 1   class    704019 non-null  int8  
dtypes: int8(1), object(1)
memory usage: 6.0+ MB


In [None]:
Xy_data["class"].value_counts()

1    313661
0    199718
2    190640
Name: class, dtype: int64

In [None]:
print("Shufling data 5 time")
for i in range(5):
    Xy_data = Xy_data.sample(Xy_data.shape[0], replace=False)

Shufling data 5 time


## Experimental grid definition

### Models parameters

In this case, I'll be using _Term Frequency Inverse Document Frequency_ (tf-idf) to obtain a numeric vector representation of the data. I will ignore words with a frequency less than 10, such as misspelled words.

For feature selection, I have chosen the _chi2_ method and will experiment with different numbers of features to select (2000, 1000, 5000, 250). This approach allows for the use of fewer computational resources by utilizing a subset of features instead of all of them, resulting in faster model training. Additionally, based on the previous notebook, wordclouds have shown that certain words (features) are significantly present in specific classes. Therefore, I expect that only a subset of words is critical for determining the class, rather than assuming that every word in the sentence is important and should be considered.

To further address the computational power issue, I will use "bagging" to train multiple classifiers with less data, instead of using a single classifier with all the data. Three base models have been chosen for the sentiment analysis task: _Support Vector Machine_ (SVM), _Multinomial Naive Bayes_ (Multinomial NB), and _tree decisions_ (Random Forest).

I will also test a boosting method, Extreme Gradient Boost (XGBoost), which utilizes all of the data.

Finally, the models will be trained in parallel using 6 jobs at the same time, fitting 10 models (except for Random Forest, where n_estimators = 100, and XGBoost, where n_estimators = 1000). Other parameters were left as default.

The experimental grid definition, consisting of several dictionaries containing the configuration of the grid to test, is shown below. In the section **Feature selection and running grid**, the models are trained.

In [None]:
# SVM doesn't work for 500 and 250
select_k_features = [2000, 1000, 500, 250]

model_parameters = {
    
    "random_forest": {},
    
    "svm_linear": {
        "kernel":"linear",
        "class_weight":"balanced"
    },
    
    "multinomial": {}
   
}

bagging_models = {
    
    "random_forest": None,
    "svm_linear": SVC,
    "multinomial": MultinomialNB
    
}

bagging_parameters = {

    "random_forest":{
        "n_jobs": 6,
        "verbose":1
    },
    
    "svm_linear": {
        "max_samples": 2.5 / 10,
        "n_estimators": 10,
        "verbose": 2,
        "n_jobs": 6
    },
    
    "multinomial":{
        "n_jobs": 6,
        "verbose": 1,
        "n_estimators": 10,
        "max_samples": 2.5 / 10,
    }

}

bagging_definition = {
    
    "random_forest": RandomForestClassifier,
    "svm_linear": BaggingClassifier,
    "multinomial": BaggingClassifier

}

boosting_parameters = {
    
    "xgb":{

        "tree_method": "hist",
        "objective": "multi:softprob",
        "n_estimators": 1000,
        "verbosity": 1,
        "n_jobs": 6

    }

}

boosting_options = {
    
    "xgb":xgb.XGBClassifier

}

### Train/test split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(Xy_data["comment"],
                                                    Xy_data["class"],
                                                    train_size = 0.7)

print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (492813,)
y_train shape: (492813,)
X_test shape: (211206,)
y_test shape: (211206,)


### Feature selection and running grid

In [None]:
print("Vectorizing data")
tfidf_model = TfidfVectorizer(min_df = 10)
X_vect = tfidf_model.fit_transform(Xy_data["comment"].apply(" ".join))

X_train = tfidf_model.transform(X_train.apply(" ".join))
X_test  = tfidf_model.transform(X_test.apply(" ".join))
    
# List to track and save the models running
list_tracking = []

for f_s_m in select_k_features:

    # Fitting with all data X_vect
    print(f"Selecting features: {f_s_m}")
    chi2_model = SelectKBest(chi2, k = f_s_m)
    chi2_model.fit(X_vect, Xy_data["class"])

    X_train_chi2 = chi2_model.transform(X_train)
    X_test_chi2  = chi2_model.transform(X_test)
    
    for key_bagg, bagg_class in bagging_definition.items():
        
        print(f"Defining model: {key_bagg}")
        
        # Defining if the model is already an ensemble bagging model or not
        # None means it's already an ensemble
        if bagging_models[key_bagg] is None:
            bagg_model_clf = bagg_class(**bagging_parameters[key_bagg])
        else:
            clf_model = bagging_models[key_bagg](**model_parameters[key_bagg])
            bagg_model_clf = bagg_class(clf_model,
                                        **bagging_parameters[key_bagg])
        
        
        # SVM doesn't work for 500 and 250
        if (key_bagg == "svm_linear") and (f_s_m < 501):
            print(f"Skipping model {key_bagg}: {f_s_m}")
            continue
    
        print("Training model")
        time_start_train_model = time()
        bagg_model_clf.fit(X_train_chi2, y_train)
        time_train = time() - time_start_train_model
        
        print("Evaluating model")
        time_start_predic_model = time()
        y_hat = bagg_model_clf.predict(X_test_chi2)
        time_predic = time() - time_start_predic_model
        
        # Saving the values of this experiment
        current_report = CreateFullReport(key_bagg, bagg_model_clf, 
                                        y_test, y_hat,
                                        time_train, time_predic,
                                        f_s_m)
        
        list_tracking.append(current_report)
        print("Done")
        print("")
        gc.collect()

with open("bagging_trained_models.joblib", "wb") as oFile:
    joblib.dump(list_tracking, oFile)

gc.collect()

In [None]:
## grid of xgboost
list_tracking = []
for f_s_m in select_k_features:
    
    print(f"Selecting features: {f_s_m}")
    chi2_model = SelectKBest(chi2, k = f_s_m)
    chi2_model.fit(X_vect, Xy_data["class"])

    X_train_chi2 = chi2_model.transform(X_train)
    X_test_chi2  = chi2_model.transform(X_test)

    for key_boost, boost_class in boosting_options.items():

        print(f"Defining model: {key_boost}")
        boost_model_clf = boost_class(**boosting_parameters[key_boost])
      
        print("Training model")
        time_start_train_model = time()
        boost_model_clf.fit(X_train_chi2, y_train)
        time_train = time() - time_start_train_model

        print("Evaluating model")
        time_start_predic_model = time()
        y_hat = boost_model_clf.predict(X_test_chi2)
        time_predic = time() - time_start_predic_model
      
        current_report = CreateFullReport(key_boost, boost_model_clf,
                                          y_test, y_hat,
                                          time_train, time_predic,
                                          f_s_m)
        
        list_tracking.append(current_report)
        print("Done")
        print("")

        gc.collect()

with open("boosting_trained_models.joblib", "wb") as oFile:
    joblib.dump(list_tracking, oFile)

gc.collect()

## Model evaluations

In [5]:
# Loading all models

with open("bagging_trained_models.joblib", "rb") as oBag:
    bagging_models = joblib.load(oBag)

with open("boosting_trained_models.joblib", "rb") as oBoo:
    boosting_models = joblib.load(oBoo)

### Best model (bagging)

Metrics such as _f1-score_, _accuracy_, _recall_, and _precision_ are presented in the following table.

Among all the models, Multinomial Naive Bayes performed the worst. However, the models trained with different numbers of features exhibited similar performance. For instance, the Random Forest models trained with 2000 features and 250 features demonstrated comparable results.

Unfortunately, SVM failed to run for 250 and 500 features, possibly due to convergence issues in finding a suitable solution for the task.

In [6]:
scores_models = []
times_models = []

for bagg_mod in bagging_models:
    n_features = bagg_mod[0]["chi2"]
    model_name = list(bagg_mod[1].keys())[0]
    f1 = bagg_mod[2]["scores"]["f1_score"]
    acc  = bagg_mod[2]["scores"]["accuracy"]
    prec = bagg_mod[2]["scores"]["recall"]
    rec  = bagg_mod[2]["scores"]["precision"]


    scores_models.append((n_features, model_name, f1, acc, prec, rec))

    train = bagg_mod[-1]["time"]["training"]
    predi = bagg_mod[-1]["time"]["prediction"]

    times_models.append((train, predi))



metrics_models = pd.DataFrame(scores_models, columns=["n_features","model",
                                                      "f1_score","accuracy",
                                                      "precision", "recall"])

metrics_models.set_index(["n_features","model"], inplace=True)
metrics_models.style.background_gradient(cmap="Blues", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall
n_features,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000,random_forest,0.95844,0.959499,0.957141,0.959818
2000,svm_linear,0.952834,0.954248,0.955023,0.950941
2000,multinomial,0.857568,0.861903,0.840464,0.889755
1000,random_forest,0.956909,0.958022,0.955647,0.958245
1000,svm_linear,0.952165,0.953676,0.954415,0.950265
1000,multinomial,0.860266,0.864052,0.841264,0.897512
500,random_forest,0.955696,0.956786,0.954281,0.957194
500,multinomial,0.860659,0.863943,0.839517,0.904001
250,random_forest,0.953439,0.954523,0.951642,0.955364
250,multinomial,0.859034,0.862064,0.836328,0.907285


If we consider only the models with the best performance for each number of features trained, Random Forest outperformed the other models in this section. Additionally, the training and testing times were measured.

As expected, the simpler models (with fewer features) trained faster (size of $\text{X\_train} = 492813$). However, it is challenging to make the same assumption about the prediction time (see table below) since the variations observed within a few milliseconds could be influenced by the performance of the equipment used during the prediction.

As mentioned earlier, the metrics such as F1-score, recall, accuracy, and precision are similar. Therefore, the best model in this case will be based on the simplest model, which is the Random Forest classifier trained with 250 features.

In [7]:
times_best_models_by_features = pd.DataFrame(times_models,
                                             columns = ["t_training","t_prediction"]).iloc[[0,3,6,8]]

best_models_by_features = metrics_models.loc[metrics_models.index[[0,3,6,8]]]

times_best_models_by_features.reset_index(drop=True, inplace=True)
best_models_by_features.reset_index(inplace=True)

metrics_models_best = best_models_by_features.merge(times_best_models_by_features,
                                                    left_index=True, right_index=True)

metrics_models_best.set_index(["model", "n_features"], inplace = True)
metrics_models_best.style.background_gradient(cmap="Blues")

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall,t_training,t_prediction
model,n_features,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
random_forest,2000,0.95844,0.959499,0.957141,0.959818,799.223845,3.461977
random_forest,1000,0.956909,0.958022,0.955647,0.958245,637.674447,3.141531
random_forest,500,0.955696,0.956786,0.954281,0.957194,510.892138,3.753146
random_forest,250,0.953439,0.954523,0.951642,0.955364,372.622733,2.834109


Below, you will find the classification report and confusion matrix of the best model chosen so far.

It is important to note that Class 1, or "other," was the majority class with 94,359 samples for testing. This class was created by merging other classes such as "neutral" or "uncertainty." The number of samples for the negative class (0) and positive class (1) is roughly the same.

Additionally, the most "complicated" task was to distinguish between the "negative" class and the "positive" class of the "other" class.

In [8]:
best_bagg_model = bagging_models[-2]
print("\nRandom Forest using 250 features")
print(best_bagg_model[2]["scores"]["report"])


Random Forest using 250 features
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     59765
           1       0.95      0.97      0.96     94359
           2       0.96      0.94      0.95     57082

    accuracy                           0.95    211206
   macro avg       0.96      0.95      0.95    211206
weighted avg       0.95      0.95      0.95    211206



In [9]:
confusion_matrix_best_bagg = best_bagg_model[2]["scores"]["cm"]

multi_index_cols = pd.MultiIndex.from_product([["Confusion matrix: Random Forest using 250 features"],
                                               ["Negative","Other","Positive"]])
confusion_matrix_best_bagg.columns = multi_index_cols
confusion_matrix_best_bagg.index = ["Negative","Other","Positive"]

confusion_matrix_best_bagg.style.background_gradient(cmap="RdYlGn")

Unnamed: 0_level_0,Confusion matrix: Random Forest using 250 features,Confusion matrix: Random Forest using 250 features,Confusion matrix: Random Forest using 250 features
Unnamed: 0_level_1,Negative,Other,Positive
Negative,56358,2508,899
Other,1586,91395,1378
Positive,1022,2212,53848


### Best model (boosting)

Now it's time to see the performance of _xgboost_.

Again, performance is similar across the selected features, and the training time is reduced as fewer features are used. However, the prediction time on our testing data set appears to be the same.

In [10]:
scores_models = []
times_models = []

for boos_mod in boosting_models:
    n_features = boos_mod[0]["chi2"]
    model_name = list(boos_mod[1].keys())[0]
    f1 = boos_mod[2]["scores"]["f1_score"]
    acc  = boos_mod[2]["scores"]["accuracy"]
    prec = boos_mod[2]["scores"]["recall"]
    rec  = boos_mod[2]["scores"]["precision"]

    scores_models.append((n_features, model_name, f1, acc, prec, rec))

    train = boos_mod[-1]["time"]["training"]
    predi = boos_mod[-1]["time"]["prediction"]

    times_models.append((train, predi))



metrics_models = pd.DataFrame(scores_models, columns=["n_features","model",
                                                  "f1_score","accuracy",
                                                  "precision", "recall"])

times_models = pd.DataFrame(times_models, columns = ["t_training","t_prediction"])

metrics_models = metrics_models.merge(times_models, left_index=True,
                                      right_index=True)

metrics_models.set_index(["n_features","model"], inplace=True)
metrics_models.style.background_gradient(cmap="Blues", axis=0)

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,accuracy,precision,recall,t_training,t_prediction
n_features,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000,xgb,0.959417,0.960323,0.958034,0.96089,740.922689,19.833372
1000,xgb,0.958961,0.959859,0.95748,0.960541,474.911145,19.671304
500,xgb,0.957982,0.958912,0.956279,0.95981,293.841939,20.070977
250,xgb,0.95766,0.958486,0.955557,0.959944,183.963725,20.495082


Below, the classification report and confusion matrix are shown for the best model in this section, which is _XGBoost_ with 250 features.

Similar to _Random Forest_, distinguishing between "positive" and "other" and "negative" and "other" were the most "challenging" tasks.

In [11]:
best_boss_model = boosting_models[-1]

print("\nXGB using 250 features")
print(best_boss_model[2]["scores"]["report"])


XGB using 250 features
              precision    recall  f1-score   support

           0       0.97      0.94      0.95     59765
           1       0.95      0.97      0.96     94359
           2       0.96      0.95      0.96     57082

    accuracy                           0.96    211206
   macro avg       0.96      0.96      0.96    211206
weighted avg       0.96      0.96      0.96    211206



In [13]:
confusion_matrix_best_boss = best_boss_model[2]["scores"]["cm"]

multi_index_cols = pd.MultiIndex.from_product([["Confusion matrix: XGB using 250 features"],
                                               ["Negative","Other","Positive"]])

confusion_matrix_best_boss.index = ["Negative","Other","Positive"]
confusion_matrix_best_boss.columns = multi_index_cols

confusion_matrix_best_boss.style.background_gradient(cmap="RdYlGn")


Unnamed: 0_level_0,Confusion matrix: XGB using 250 features,Confusion matrix: XGB using 250 features,Confusion matrix: XGB using 250 features
Unnamed: 0_level_1,Negative,Other,Positive
Negative,56477,2520,768
Other,1226,91803,1330
Positive,814,2110,54158


Both _Random Forest_ and _XGBoost_ (best models) performed well in the classification problem, with _XGBoost_ slightly outperforming _Random Forest_ (a difference in f1-score of 0.01). However, the prediction time on the same testing data was slower for _XGBoost_, so I am inclined to choose _Random Forest_ as our winner!

Moreover, the _Random Forest_ model is already doing an excellent job in classifying the polarity of the sentences, so I don't see a need for hyperparameter tuning

## Creating Pipeline and saving

Finally, the vectorizer, feature selector and our best model will be serialized to be used later.

In [19]:
input_data = Xy_data["comment"].apply(" ".join)

# Vectorization section
tfidf_model_fitted = TfidfVectorizer(min_df = 10).fit(input_data)
X_vect = tfidf_model_fitted.transform(input_data)

# Feature selection section
chi2_model_fitted = SelectKBest(chi2, k = 250).fit(X_vect, Xy_data["class"])

# Best Model selection section
best_model = best_bagg_model[1]["random_forest"]

# Saving all the steps in the pipeline to predict

with open("models/01_tfidf_vectorizer_fitted.joblib", "wb") as vFile:
    joblib.dump(tfidf_model_fitted, vFile)

with open("models/02_chi2_250_feature_selector_fitted.joblib", "wb") as fsFile:
    joblib.dump(chi2_model_fitted, fsFile)

with open("models/03_random_forest_model_fitted.joblib", "wb") as bmFile:
    joblib.dump(best_model, bmFile)