In [1]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, log_loss, matthews_corrcoef,
    balanced_accuracy_score
)
import pandas as pd
import numpy as np

In [2]:
# Set display option to show full text in all columns
pd.set_option('display.max_colwidth', None)

In [3]:
# Read dataset
df = pd.read_csv('../../dataset/vectorized_features.csv')

# Display a quick overview of the dataset
display(df.reset_index(drop=True))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,376,377,378,379,380,381,382,383,label,label_name
0,-0.055051,-0.007697,0.063530,-0.039664,0.116901,-0.123296,0.058080,0.067705,0.071730,-0.109816,...,-0.034640,0.021249,-0.029084,0.084679,0.016152,0.015425,-0.135161,-0.064534,0,sadness
1,0.009239,-0.052964,0.019263,0.034021,0.125202,0.027428,0.077058,0.035879,0.075603,-0.052699,...,-0.044897,0.132352,-0.082222,0.003469,0.095559,-0.060182,-0.027176,-0.026275,0,sadness
2,-0.074503,-0.010642,-0.003460,-0.073246,-0.018509,-0.026024,0.023559,0.062387,0.110395,0.064938,...,-0.001018,0.019752,0.078386,-0.010269,0.041514,-0.024779,-0.042020,0.024512,3,anger
3,0.108594,0.095322,0.036477,0.015178,0.089073,-0.012647,-0.089686,-0.070015,0.042590,-0.011443,...,-0.004122,0.023587,0.056529,0.024166,0.103731,-0.044091,-0.109329,0.034851,2,love
4,-0.016712,-0.078771,0.032170,-0.053829,0.115593,-0.051190,0.132093,0.037378,0.001562,-0.072058,...,-0.077645,-0.016146,0.007182,0.029738,0.059137,-0.062703,-0.019559,-0.057704,3,anger
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-0.037232,-0.092891,0.039721,0.022603,0.055530,-0.032968,0.018117,-0.083258,0.057230,-0.004862,...,-0.026874,0.035998,-0.056798,0.028747,0.009920,-0.014104,-0.117972,-0.006001,3,sadness
19996,-0.030062,0.018929,0.022504,0.004247,-0.037159,-0.113743,0.021137,0.043203,0.032478,-0.041403,...,0.000916,0.004910,-0.030023,0.073150,-0.072006,-0.043622,-0.001171,0.002970,3,joy
19997,-0.035255,0.050717,0.014070,-0.070761,0.038359,0.031761,0.191340,0.027269,-0.021324,-0.031828,...,0.021075,0.019080,-0.023213,-0.005873,0.085750,0.056295,-0.131316,-0.016883,1,surprise
19998,0.024688,-0.016231,0.016892,-0.047842,0.002488,0.028731,-0.010688,0.078802,-0.011791,-0.011458,...,-0.081756,0.049751,-0.041050,0.062848,-0.077174,-0.040509,-0.032970,0.026702,1,surprise


In [4]:
np.unique(df[["label_name"]])

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'],
      dtype=object)

In [5]:
# Separate the first 16000 records for training
train_df = df.iloc[:16000].reset_index(drop=True)

In [6]:
# Next 2000 records for testing
test_df = df.iloc[16000:18000].reset_index(drop=True)

In [7]:
# Last 2000 records for validation
validation_df = df.iloc[18000:].reset_index(drop=True)

In [8]:
def split(
    df: pd.DataFrame, 
    features_names: list[str],
    label_name: str
) -> (pd.DataFrame, pd.DataFrame):
    X = df.loc[:, features_names]  # Features
    y = df[label_name]             # Label
    return X, y

In [9]:
def train_model(X: pd.DataFrame, y: pd.DataFrame, model=str):
    random_state = 99
    if model == "LogisticRegression":
        classifier = LogisticRegression()
    elif model == "DecisionTreeClassifier":
        classifier = DecisionTreeClassifier(random_state=random_state)
    elif model == "RandomForestClassifier":
        classifier = RandomForestClassifier(n_estimators=100, random_state=random_state)
    elif model == "NeuralNetwork":
        classifier = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=random_state)
    else:
        return None
        
    classifier.fit(X, y)
    
    return classifier

In [10]:
def evaluate_model(X_eval, y_eval, model) -> (pd.DataFrame, pd.DataFrame):
    y_pred = model.predict(X_eval)

    # Calculate metrics
    accuracy = accuracy_score(y_eval, y_pred)
    precision = precision_score(y_eval, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_eval, y_pred, average='weighted')
    f1 = f1_score(y_eval, y_pred, average='weighted')
    cm = confusion_matrix(y_eval, y_pred)

    # Use label names for index and columns
    ordered_labels_names = np.array([0, 1, 2, 3, 4, 5])
    cm_df = pd.DataFrame(cm, columns=ordered_labels_names, index=ordered_labels_names)

    # Create a dictionary of metrics and descriptions
    metrics_dict = {
        "Metric": [
            "Accuracy",
            "Precision",
            "Recall (Sensitivity)",
            "F1 Score",
        ],
        "Value": [
            accuracy,
            precision,
            recall,
            f1,
        ],
        "Description": [
            "Ratio of correct predictions to total predictions",
            "True Positives / (True Positives + False Positives)",
            "True Positives / (True Positives + False Negatives)",
            "Harmonic mean of precision and recall",
        ],
    }
    
    # Convert to DataFrame
    metrics_df = pd.DataFrame(metrics_dict)
    
    # Display nicely rounded results
    metrics_df["Value"] = metrics_df["Value"].apply(lambda x: round(x, 3))

    return metrics_df, cm_df

In [11]:
features = list(map(str, range(384)))
label = "label"

X_train, y_train = split(
    df=train_df,
    features_names=features,
    label_name=label
)

X_test, y_test = split(
    df=test_df,
    features_names=features,
    label_name=label
)

for use_model in ["LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier", "NeuralNetwork"]:
    # Train model
    classifier = train_model(
        X=X_train, 
        y=y_train, 
        model=use_model
    )
    
    # Evaluate model
    metrics_df, cm_df = evaluate_model(
        X_eval=X_test, 
        y_eval=y_test, 
        model=classifier
    )

    # Display evaluation results
    print(f"Evaluation Metrics for {use_model}:")
    display(metrics_df)
    print(f"Confusion Matrix for {use_model}:")
    display(cm_df)
    print()

Evaluation Metrics for LogisticRegression:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.696,Ratio of correct predictions to total predictions
1,Precision,0.692,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.696,True Positives / (True Positives + False Negatives)
3,F1 Score,0.685,Harmonic mean of precision and recall


Confusion Matrix for LogisticRegression:


Unnamed: 0,0,1,2,3,4,5
0,427,54,9,37,22,1
1,61,587,17,17,18,4
2,17,73,76,6,5,1
3,65,35,4,153,17,1
4,38,28,2,15,124,5
5,21,24,3,3,6,24



Evaluation Metrics for DecisionTreeClassifier:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.39,Ratio of correct predictions to total predictions
1,Precision,0.391,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.39,True Positives / (True Positives + False Negatives)
3,F1 Score,0.39,Harmonic mean of precision and recall


Confusion Matrix for DecisionTreeClassifier:


Unnamed: 0,0,1,2,3,4,5
0,257,106,35,79,55,18
1,134,352,70,67,59,22
2,36,70,34,19,14,5
3,81,61,16,72,33,12
4,58,60,14,22,53,5
5,18,26,7,7,11,12



Evaluation Metrics for RandomForestClassifier:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.577,Ratio of correct predictions to total predictions
1,Precision,0.6,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.577,True Positives / (True Positives + False Negatives)
3,F1 Score,0.505,Harmonic mean of precision and recall


Confusion Matrix for RandomForestClassifier:


Unnamed: 0,0,1,2,3,4,5
0,435,111,0,2,2,0
1,85,611,2,2,4,0
2,38,134,3,2,1,0
3,142,78,0,51,4,0
4,81,73,0,5,53,0
5,28,48,0,0,5,0



Evaluation Metrics for NeuralNetwork:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.68,Ratio of correct predictions to total predictions
1,Precision,0.677,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.68,True Positives / (True Positives + False Negatives)
3,F1 Score,0.677,Harmonic mean of precision and recall


Confusion Matrix for NeuralNetwork:


Unnamed: 0,0,1,2,3,4,5
0,407,60,11,41,26,5
1,68,545,26,22,30,13
2,12,59,85,13,5,4
3,47,29,7,153,33,6
4,26,22,5,19,133,7
5,13,19,6,0,6,37





In [12]:
features = list(map(str, range(384)))
label = "label_name"

X_train, y_train = split(
    df=train_df,
    features_names=features,
    label_name=label
)

X_test, y_test = split(
    df=test_df,
    features_names=features,
    label_name=label
)

for use_model in ["LogisticRegression", "DecisionTreeClassifier", "RandomForestClassifier", "NeuralNetwork"]:
    # Train model
    classifier = train_model(
        X=X_train, 
        y=y_train, 
        model=use_model
    )
    
    # Evaluate model
    metrics_df, cm_df = evaluate_model(
        X_eval=X_test, 
        y_eval=y_test, 
        model=classifier
    )

    # Display evaluation results
    print(f"Evaluation Metrics for {use_model}:")
    display(metrics_df)
    print(f"Confusion Matrix for {use_model}:")
    display(cm_df)
    print()

Evaluation Metrics for LogisticRegression:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.24,Ratio of correct predictions to total predictions
1,Precision,0.221,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.24,True Positives / (True Positives + False Negatives)
3,F1 Score,0.229,Harmonic mean of precision and recall


Confusion Matrix for LogisticRegression:


Unnamed: 0,0,1,2,3,4,5
0,26,25,104,15,95,3
1,33,26,98,11,67,1
2,89,64,265,45,223,16
3,15,23,71,3,58,3
4,58,48,238,32,158,10
5,10,6,25,5,28,3



Evaluation Metrics for DecisionTreeClassifier:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.232,Ratio of correct predictions to total predictions
1,Precision,0.232,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.232,True Positives / (True Positives + False Negatives)
3,F1 Score,0.232,Harmonic mean of precision and recall


Confusion Matrix for DecisionTreeClassifier:


Unnamed: 0,0,1,2,3,4,5
0,31,33,92,30,74,8
1,31,23,81,20,77,4
2,98,81,235,60,196,32
3,25,20,59,13,49,7
4,68,60,189,46,160,21
5,13,8,19,7,28,2



Evaluation Metrics for RandomForestClassifier:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.299,Ratio of correct predictions to total predictions
1,Precision,0.24,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.299,True Positives / (True Positives + False Negatives)
3,F1 Score,0.246,Harmonic mean of precision and recall


Confusion Matrix for RandomForestClassifier:


Unnamed: 0,0,1,2,3,4,5
0,6,11,150,1,100,0
1,10,10,119,0,97,0
2,28,27,370,2,275,0
3,6,7,90,1,69,0
4,16,16,300,1,211,0
5,0,2,38,0,37,0



Evaluation Metrics for NeuralNetwork:


Unnamed: 0,Metric,Value,Description
0,Accuracy,0.247,Ratio of correct predictions to total predictions
1,Precision,0.243,True Positives / (True Positives + False Positives)
2,Recall (Sensitivity),0.247,True Positives / (True Positives + False Negatives)
3,F1 Score,0.245,Harmonic mean of precision and recall


Confusion Matrix for NeuralNetwork:


Unnamed: 0,0,1,2,3,4,5
0,33,27,91,23,84,10
1,33,36,86,15,54,12
2,92,67,269,47,197,30
3,26,18,57,11,59,2
4,66,62,210,45,142,19
5,14,9,20,8,22,4



