In [1]:
!pip install scikit-learn prettytable seaborn --quiet

In [2]:
# imports 
import numpy as np
import pandas as pd
import pickle
from prettytable import PrettyTable

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

# Data Loading

In [3]:
# Restore the pre-processed train/test splits from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickles produced by this project.

# Sparse features (used in the TF-IDF section)
(
    X_train_sparse,
    X_test_sparse,
    y_train_sparse,
    y_test_sparse,
    org_train,
    org_test,
) = pd.read_pickle("pickles/sparse.pkl")

# Dense features (used in the Word2Vec section)
(
    X_train_dense,
    X_test_dense,
    y_train_dense,
    y_test_dense,
    w2v_train,
    w2v_test,
) = pd.read_pickle("pickles/dense.pkl")

# Functions

In [4]:
# Function to print out formatted confusion matrix
def pretty_confusion_matrix(y_test, y_pred, header_to_print):
    """Print the confusion matrix of a set of predictions as a text table.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    header_to_print : str
        Text shown in parentheses in the table title.

    Returns
    -------
    None
        The table and a class legend are printed to stdout.
    """
    # Generate confusion matrix (rows: actual classes, columns: predicted classes)
    confmat = confusion_matrix(y_test, y_pred)

    # Size the table from the matrix itself rather than assuming six classes:
    # fewer classes used to raise IndexError, more classes were silently dropped.
    n_classes = confmat.shape[0]

    # Instantiate pretty table
    tab = PrettyTable()
    tab.field_names = [""] + [f"Predicted {j}" for j in range(n_classes)]

    # Fill in rows and print table
    for i, row in enumerate(confmat):
        tab.add_row([f"Actual {i}"] + list(row))
    print(f"Confusion Matrix ({header_to_print}): \n{tab}")

    # Print a legend of the classes.
    # NOTE(review): the legend is hard-coded for the six classes of this dataset.
    print("0: BPD, 1: Anxiety, 2: Bipolar, 3: Depression, 4: Mental Illness, 5: Schizophrenia")

In [5]:
# Function to print out formatted classification report
def pretty_classification_report(y_test, y_pred, header_to_print):
    """Print per-class and averaged precision/recall/F1 as a text table.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels.
    header_to_print : str
        Text shown in parentheses in the table title.

    Returns
    -------
    None
        The table is printed to stdout.
    """
    # Generate classification report as a nested dict
    class_report = classification_report(y_test, y_pred, output_dict=True)

    # Instantiate the PrettyTable
    tab = PrettyTable()
    tab.field_names = ['Class', 'Precision', 'Recall', 'F1-score', 'Support']

    # Classes present in the labels that were actually passed in. This used to
    # read the global y_test_sparse, so the rows did not depend on the argument.
    # Labels are compared as strings because the report dict is keyed by strings.
    classes = {str(label) for label in pd.Series(y_test).unique()}

    # Add one row per class; summary entries such as 'accuracy' are skipped here
    for class_name, metrics in class_report.items():
        if class_name in classes:
            tab.add_row([
                class_name,
                round(metrics['precision'], 2),
                round(metrics['recall'], 2),
                round(metrics['f1-score'], 2),
                metrics['support'],
            ])

    # Add the macro and weighted averages to the table (support left blank)
    for average_name in ('macro avg', 'weighted avg'):
        average = class_report[average_name]
        tab.add_row([
            average_name,
            round(average['precision'], 2),
            round(average['recall'], 2),
            round(average['f1-score'], 2),
            '',
        ])

    # Print the table
    print(f"Classification Report ({header_to_print}): \n{tab}")

# Word2Vec

**Using grid search for hyperparameter tuning**

In [6]:
# max_iter=1000: with fewer iterations the solver hit max_iter, i.e. coef_ did not converge
# solver='saga' as it was the best solver in previous grid searches; saga is well suited for large datasets
# random_state is fixed because saga shuffles the data, so unseeded runs are not reproducible
lr_w2v = LogisticRegression(multi_class = 'multinomial', max_iter=1000, solver='saga', random_state=42)

# Define the hyperparameter grid to search
parameter_space = {
    'penalty': ['l1', 'l2'], # Regularization penalty (L1 or L2)
    'C': [1, 10, 100], # Inverse of regularization strength (smaller = stronger regularization)
    'tol': [1e-5, 1e-4, 1e-3], # Tolerance for stopping criteria
}

# Perform grid search with 3-fold cross-validation on all available cores
lr_clf_w2v = GridSearchCV(lr_w2v, parameter_space, n_jobs=-1, cv=3, verbose=10)

In [7]:
%%time
lr_clf_w2v.fit(X_train_dense, y_train_dense)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
CPU times: user 5min 22s, sys: 3.16 s, total: 5min 25s
Wall time: 28min 20s


In [8]:
# Report the winning hyperparameter combination and its grid-search score
for caption, value in (
    ("Best hyperparameters: \n", lr_clf_w2v.best_params_),
    ("Best score: \n", lr_clf_w2v.best_score_),
):
    print(caption, value)

Best hyperparameters: 
 {'C': 10, 'penalty': 'l1', 'tol': 0.0001}
Best score: 
 0.651770015592991


In [9]:
# Keep a handle on the best estimator found by the grid search
best_model_lr_w2v = lr_clf_w2v.best_estimator_

# 5-fold cross-validation on the training data to check if the model is overfitting
y_scores = cross_val_score(
    estimator=best_model_lr_w2v,
    X=X_train_dense,
    y=y_train_dense,
    cv=5,
)
print(f'Cross-validated scores: {y_scores}')

Cross-validated scores: [0.65121657 0.65365949 0.65091205 0.65299674 0.65100977]


**Evaluate the performance on the test set**

In [10]:
# Label the held-out Word2Vec test features with the tuned model
y_pred_lr_w2v = best_model_lr_w2v.predict(X=X_test_dense)

In [11]:
# Confusion matrix of the Word2Vec model on the test set
pretty_confusion_matrix(
    y_test=y_test_dense,
    y_pred=y_pred_lr_w2v,
    header_to_print="Logistic Regression, Word2Vec",
)

Confusion Matrix (Logistic Regression, Word2Vec): 
+----------+-------------+-------------+-------------+-------------+-------------+-------------+
|          | Predicted 0 | Predicted 1 | Predicted 2 | Predicted 3 | Predicted 4 | Predicted 5 |
+----------+-------------+-------------+-------------+-------------+-------------+-------------+
| Actual 0 |    32692    |     1955    |     1067    |     3947    |     1094    |     270     |
| Actual 1 |     3381    |    24535    |     742     |     2173    |     1004    |     278     |
| Actual 2 |     1154    |     666     |     4024    |     693     |     326     |     255     |
| Actual 3 |     4604    |     1661    |     708     |    15986    |     1058    |     177     |
| Actual 4 |     2054    |     1011    |     492     |     1591    |     1960    |     375     |
| Actual 5 |     293     |     239     |     287     |     221     |     296     |     987     |
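+----------+-------------+-------------+-------------+-------------+-------------+-------------+
0: BPD, 1: Anxiety, 2: Bipolar, 3: Depression, 4: Mental Illness, 5: Schizophrenia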

In [12]:
# Per-class precision / recall / F1 of the Word2Vec model on the test set
pretty_classification_report(
    y_test=y_test_dense,
    y_pred=y_pred_lr_w2v,
    header_to_print="Logistic Regression, Word2Vec",
)

Classification Report (Logistic Regression, Word2Vec): 
+---------------+-----------+--------+----------+---------+
|     Class     | Precision | Recall | F1-score | Support |
+---------------+-----------+--------+----------+---------+
|      BPD      |    0.74   |  0.8   |   0.77   |  41025  |
|    anxiety    |    0.82   |  0.76  |   0.79   |  32113  |
|    bipolar    |    0.55   |  0.57  |   0.56   |   7118  |
|   depression  |    0.65   |  0.66  |   0.66   |  24194  |
| mentalillness |    0.34   |  0.26  |   0.3    |   7483  |
| schizophrenia |    0.42   |  0.42  |   0.42   |   2323  |
|   macro avg   |    0.59   |  0.58  |   0.58   |         |
|  weighted avg |    0.7    |  0.7   |   0.7    |         |
+---------------+-----------+--------+----------+---------+


**Save the model**

In [14]:
# Use a context manager so the file is flushed and closed as soon as the dump finishes
with open('logistic_regression_model_w2v.pkl', 'wb') as model_file:
    pickle.dump(best_model_lr_w2v, model_file)

# TF-IDF

**Using grid search for hyperparameter tuning**

In [13]:
# max_iter=100 for the TF-IDF search; the TF-IDF timing cell at the end of the notebook uses the same value
# NOTE(review): the Word2Vec search uses max_iter=1000 - confirm that 100 is intended here
# solver='saga' as it was the best solver in previous grid searches
# random_state is fixed because saga shuffles the data, so unseeded runs are not reproducible
lr_tfidf = LogisticRegression(multi_class = 'multinomial', max_iter=100, solver='saga', random_state=42)

# Define the hyperparameter grid to search
parameter_space = {
    'penalty': ['l1', 'l2'], # Regularization penalty (L1 or L2)
    'C': [1, 10, 100], # Inverse of regularization strength (smaller = stronger regularization)
    'tol': [1e-5, 1e-4, 1e-3], # Tolerance for stopping criteria
}

# Perform grid search with 3-fold cross-validation on all available cores
lr_clf_tfidf = GridSearchCV(lr_tfidf, parameter_space, n_jobs=-1, cv=3, verbose=10)

In [14]:
%%time
lr_clf_tfidf.fit(X_train_sparse, y_train_sparse)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
CPU times: user 3min 14s, sys: 3.25 s, total: 3min 18s
Wall time: 1h 41min 31s


In [15]:
# Report the winning hyperparameter combination and its grid-search score
for caption, value in (
    ("Best hyperparameters: \n", lr_clf_tfidf.best_params_),
    ("Best score: \n", lr_clf_tfidf.best_score_),
):
    print(caption, value)

Best hyperparameters: 
 {'C': 1, 'penalty': 'l1', 'tol': 0.001}
Best score: 
 0.727718002339203


In [16]:
# Keep a handle on the best estimator found by the grid search
best_model_lr_tfidf = lr_clf_tfidf.best_estimator_

# 5-fold cross-validation on the training data to check if the model is overfitting
y_scores = cross_val_score(
    estimator=best_model_lr_tfidf,
    X=X_train_sparse,
    y=y_train_sparse,
    cv=5,
)
print(f'Cross-validated scores: {y_scores}')

Cross-validated scores: [0.72921484 0.73002898 0.72370221 0.73187651 0.73151827]


**Evaluate the performance on the test set**

In [17]:
# Label the held-out TF-IDF test features with the tuned model
y_pred_lr_tfidf = best_model_lr_tfidf.predict(X=X_test_sparse)

In [18]:
# Confusion matrix of the TF-IDF model on the test set
pretty_confusion_matrix(
    y_test=y_test_sparse,
    y_pred=y_pred_lr_tfidf,
    header_to_print="Logistic Regression, TF-IDF",
)

Confusion Matrix (Logistic Regression, TF-IDF): 
+----------+-------------+-------------+-------------+-------------+-------------+-------------+
|          | Predicted 0 | Predicted 1 | Predicted 2 | Predicted 3 | Predicted 4 | Predicted 5 |
+----------+-------------+-------------+-------------+-------------+-------------+-------------+
| Actual 0 |    33956    |     1447    |     822     |     3284    |     1319    |     200     |
| Actual 1 |     1605    |    26846    |     272     |     2006    |     1211    |     174     |
| Actual 2 |     782     |     349     |     4844    |     593     |     328     |     222     |
| Actual 3 |     2565    |     1494    |     403     |    18264    |     1369    |     103     |
| Actual 4 |     1294    |     873     |     395     |     1545    |     3017    |     363     |
| Actual 5 |     149     |     152     |     191     |     188     |     254     |     1392    |
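+----------+-------------+-------------+-------------+-------------+-------------+-------------+
0: BPD, 1: Anxiety, 2: Bipolar, 3: Depression, 4: Mental Illness, 5: Schizophrenia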

In [19]:
# Per-class precision / recall / F1 of the TF-IDF model on the test set
pretty_classification_report(
    y_test=y_test_sparse,
    y_pred=y_pred_lr_tfidf,
    header_to_print="Logistic Regression, TF-IDF",
)

Classification Report (Logistic Regression, TF-IDF): 
+---------------+-----------+--------+----------+---------+
|     Class     | Precision | Recall | F1-score | Support |
+---------------+-----------+--------+----------+---------+
|      BPD      |    0.84   |  0.83  |   0.83   |  41028  |
|    anxiety    |    0.86   |  0.84  |   0.85   |  32114  |
|    bipolar    |    0.7    |  0.68  |   0.69   |   7118  |
|   depression  |    0.71   |  0.75  |   0.73   |  24198  |
| mentalillness |    0.4    |  0.4   |   0.4    |   7487  |
| schizophrenia |    0.57   |  0.6   |   0.58   |   2326  |
|   macro avg   |    0.68   |  0.68  |   0.68   |         |
|  weighted avg |    0.78   |  0.77  |   0.77   |         |
+---------------+-----------+--------+----------+---------+


**Save the model**

In [22]:
# Use a context manager so the file is flushed and closed as soon as the dump finishes
with open('logistic_regression_model_tfidf.pkl', 'wb') as model_file:
    pickle.dump(best_model_lr_tfidf, model_file)

# Running times
Re-run just the optimal model to get the CPU execution time 

In [20]:
# Word2Vec
lr_w2v_OPT = LogisticRegression(multi_class = 'multinomial', solver='saga', C = 10, penalty = 'l1', tol = 1e-05)
%time lr_w2v_OPT.fit(X_train_dense, y_train_dense)

CPU times: user 4min 2s, sys: 301 ms, total: 4min 2s
Wall time: 4min 2s




In [21]:
# TF-IDF
lr_tfidf_OPT = LogisticRegression(multi_class = 'multinomial', max_iter=100, solver='saga', C= 1, penalty= 'l1', tol= 0.001)
%time lr_tfidf_OPT.fit(X_train_sparse, y_train_sparse)

CPU times: user 3min 20s, sys: 244 ms, total: 3min 20s
Wall time: 3min 20s
