# Automated Hyper-Parameter Tuned Baseline Classifiers for Benchmark


### Features Extracted [Mel(128), MFCC(40), Chroma(12)]
### Gender Splitted
### Hyper-Parameter Tuning
### Classical Classifiers and Voting Classifiers

* This notebook extracts audio features and considers the following nine baseline classifiers which will serve as a benchmark for the proposed model
1. Light Gradient Boosing Machine
2. Random Forest
3. eXtreme Gradient Boosting
3. Multi-Layer Perceptron
4. K-Nearest Neighbor
5. Decision Tree
6. Logistic Regression
* The notebook also combines the best of classifiers and creates four new classifiers which is a combination of best of the above classifiers
7. V1 [Multi-Layer Perceptron, Light Gradient Boosing Machine]
8. V2 [K-Nearest Neighbor, eXtreme Gradient Boosting, Multi-Layer Perceptron]
9. V3 [eXtreme Gradient Boosting, Multi-Layer Perceptron, Random Forest, Logistic Regression]
10. V4 [Multi-Layer Perceptron, eXtreme Gradient Boosting]

* The hyper-parementer tuning is performed using Optune Framework (https://optuna.org/) to get the best parameters for the baseline classifiers
* The baseline classifiers are cross-validated and the classification report and confusion matrix is displayed

##### __IMPORTANT NOTE: This notebook is "extremely" CPU intensive and can several hours to complete.__
Approximated Runtimes on Intel DevCloud Jupyter Notebook Node
* Feature Extraction: 15 minutes
* Hyper-Parameter Tuning: 4 Hours + 
* Training Tuned Classifiers: 

In [1]:
# Activating ser conda environment
! source activate ser

# Installing the Optune Framework package for Hyper-Paramenter Tuning
! pip install --user optuna

# Installing other classifier packages
! pip install --user lightgbm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Notebook Settings

# Supress warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Loading Libraries

import pandas as pd
import numpy as np
import os
import random
import sys
import glob 
import librosa
import librosa.display
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils.multiclass import unique_labels
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import lightgbm as lgb
import xgboost as xgb
import optuna
from tqdm import tqdm

In [4]:
# Function to extract audio features

def extract_feature(file_name):
    X, sample_rate = librosa.load(file_name)
    stft=np.abs(librosa.stft(X))
    result=np.array([])
    mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    result=np.hstack((result, mfccs))
    chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    result=np.hstack((result, chroma))
    mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    result=np.hstack((result, mel))
    return result

In [5]:
# Declaring Classes and Gender

emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}

def gender(g):
    """Returns Gender Label"""
    if int(g[0:2]) % 2 == 0:
        return 'female'
    else:
        return 'male'

In [6]:
# Function to load data

def load_data(test_size=0.2):
    """Loads Data from directory containing WAV files."""
    x,y=[],[]
    for file in tqdm(glob.glob("./DATASET/*.wav")): # Path to audio dataset
        file_name=os.path.basename(file)
        emotion=emotions[file_name.split("-")[2]] + '_' + gender(file_name.split("-")[-1])
        feature=extract_feature(file)
        x.append(feature)
        y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [7]:
# Loading Data

X_train, X_test, y_train, y_test = load_data()

100%|██████████| 5252/5252 [12:53<00:00,  6.79it/s]


In [8]:
print((X_train.shape[0], X_test.shape[0]))
print((X_train.shape[1], X_test.shape[1]))
print(f'Features extracted: {X_train.shape[1]}')

(4201, 1051)
(180, 180)
Features extracted: 180


## HYPER-PARAMETER TUNING USING OPTUNA FRAMEWORK (OPTIONAL)

N.B.: If hyper-parameters are obtained, copy them and paste it into the corrosponding model parameters below and do not run this cell

### LGB

In [9]:
def objective_lgb(trial): 

    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 150),
        'max_depth': trial.suggest_int('max_depth', 2, 100),
        'n_estimators': trial.suggest_int('n_estimators', 10, 20000),
        'subsample_for_bin': trial.suggest_int('subsample_for_bin', 100000, 500000),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 500),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 100),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.0, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-0),
        'boosting_type': trial.suggest_categorical('boosting_type', ['goss','gbdt','dart']),
        'objective': 'multiclass',
        'verbose': -1,
        'random_state':22,
        }
   
    model = lgb.LGBMClassifier(**params, n_jobs = 12) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [10]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_lgb, n_trials=25)

[32m[I 2022-12-28 12:40:49,820][0m A new study created in memory with name: no-name-2b577db4-2701-456e-9182-9adf236a1f39[0m




[32m[I 2022-12-28 12:42:16,093][0m Trial 0 finished with value: 0.7307748711850971 and parameters: {'num_leaves': 15, 'max_depth': 18, 'n_estimators': 9423, 'subsample_for_bin': 177318, 'min_data_in_leaf': 467, 'reg_alpha': 21.364324863046125, 'colsample_bytree': 0.4166169833515284, 'learning_rate': 0.08988880633472562, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.7307748711850971.[0m




[32m[I 2022-12-28 12:45:58,324][0m Trial 1 finished with value: 0.659364135666157 and parameters: {'num_leaves': 8, 'max_depth': 98, 'n_estimators': 12899, 'subsample_for_bin': 269886, 'min_data_in_leaf': 439, 'reg_alpha': 12.73644948176721, 'colsample_bytree': 0.18931542811418567, 'learning_rate': 7.905719250121463e-05, 'boosting_type': 'goss'}. Best is trial 0 with value: 0.7307748711850971.[0m




[32m[I 2022-12-28 12:56:06,153][0m Trial 2 finished with value: 0.48866655342279597 and parameters: {'num_leaves': 31, 'max_depth': 47, 'n_estimators': 8085, 'subsample_for_bin': 188423, 'min_data_in_leaf': 226, 'reg_alpha': 70.66238836396343, 'colsample_bytree': 0.3381317494631866, 'learning_rate': 0.006727407881920793, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.7307748711850971.[0m




[32m[I 2022-12-28 13:00:21,979][0m Trial 3 finished with value: 0.06284157182492497 and parameters: {'num_leaves': 139, 'max_depth': 32, 'n_estimators': 17808, 'subsample_for_bin': 235415, 'min_data_in_leaf': 307, 'reg_alpha': 98.2433944932535, 'colsample_bytree': 0.40150552585613264, 'learning_rate': 0.543254952678558, 'boosting_type': 'dart'}. Best is trial 0 with value: 0.7307748711850971.[0m




[32m[I 2022-12-28 13:04:37,909][0m Trial 4 finished with value: 0.45941056565313404 and parameters: {'num_leaves': 108, 'max_depth': 89, 'n_estimators': 12949, 'subsample_for_bin': 332299, 'min_data_in_leaf': 147, 'reg_alpha': 89.00759212637116, 'colsample_bytree': 0.6679200653814781, 'learning_rate': 0.00021282819246622793, 'boosting_type': 'goss'}. Best is trial 0 with value: 0.7307748711850971.[0m




[32m[I 2022-12-28 13:10:31,604][0m Trial 5 finished with value: 0.7517213068342676 and parameters: {'num_leaves': 22, 'max_depth': 100, 'n_estimators': 16025, 'subsample_for_bin': 118611, 'min_data_in_leaf': 134, 'reg_alpha': 31.920033893710244, 'colsample_bytree': 0.9982880803922601, 'learning_rate': 0.00047686959240402714, 'boosting_type': 'goss'}. Best is trial 5 with value: 0.7517213068342676.[0m




[32m[I 2022-12-28 13:14:20,658][0m Trial 6 finished with value: 0.697210237245909 and parameters: {'num_leaves': 4, 'max_depth': 47, 'n_estimators': 3958, 'subsample_for_bin': 444732, 'min_data_in_leaf': 437, 'reg_alpha': 35.91188580520871, 'colsample_bytree': 0.3243961538846868, 'learning_rate': 0.13263740946272637, 'boosting_type': 'dart'}. Best is trial 5 with value: 0.7517213068342676.[0m




[32m[I 2022-12-28 13:15:13,499][0m Trial 7 finished with value: 0.6531767736821243 and parameters: {'num_leaves': 94, 'max_depth': 33, 'n_estimators': 4612, 'subsample_for_bin': 423782, 'min_data_in_leaf': 447, 'reg_alpha': 60.478215204706906, 'colsample_bytree': 0.8356119642020825, 'learning_rate': 0.0021486099484051223, 'boosting_type': 'goss'}. Best is trial 5 with value: 0.7517213068342676.[0m




[32m[I 2022-12-28 13:46:21,352][0m Trial 8 finished with value: 0.7567193250665307 and parameters: {'num_leaves': 76, 'max_depth': 17, 'n_estimators': 18599, 'subsample_for_bin': 165047, 'min_data_in_leaf': 451, 'reg_alpha': 16.916634984956957, 'colsample_bytree': 0.8881241087894987, 'learning_rate': 0.0017525101066842008, 'boosting_type': 'dart'}. Best is trial 8 with value: 0.7567193250665307.[0m




[32m[I 2022-12-28 14:10:29,888][0m Trial 9 finished with value: 0.20757601494819095 and parameters: {'num_leaves': 71, 'max_depth': 78, 'n_estimators': 18315, 'subsample_for_bin': 275076, 'min_data_in_leaf': 480, 'reg_alpha': 79.8540162128086, 'colsample_bytree': 0.49958544080983003, 'learning_rate': 0.00010571493509114321, 'boosting_type': 'dart'}. Best is trial 8 with value: 0.7567193250665307.[0m




[32m[I 2022-12-28 14:10:33,809][0m Trial 10 finished with value: 0.07902893380895759 and parameters: {'num_leaves': 52, 'max_depth': 4, 'n_estimators': 104, 'subsample_for_bin': 101437, 'min_data_in_leaf': 326, 'reg_alpha': 3.6561992833743737, 'colsample_bytree': 0.7301059168542271, 'learning_rate': 1.4490514053156924e-05, 'boosting_type': 'gbdt'}. Best is trial 8 with value: 0.7567193250665307.[0m




[32m[I 2022-12-28 14:14:09,510][0m Trial 11 finished with value: 0.7812400203838967 and parameters: {'num_leaves': 44, 'max_depth': 67, 'n_estimators': 15500, 'subsample_for_bin': 102097, 'min_data_in_leaf': 14, 'reg_alpha': 35.274734370320914, 'colsample_bytree': 0.9765873416449043, 'learning_rate': 0.0040789319240881585, 'boosting_type': 'goss'}. Best is trial 11 with value: 0.7812400203838967.[0m




[32m[I 2022-12-28 14:55:41,632][0m Trial 12 finished with value: 0.677452296019478 and parameters: {'num_leaves': 55, 'max_depth': 69, 'n_estimators': 19768, 'subsample_for_bin': 162644, 'min_data_in_leaf': 17, 'reg_alpha': 45.95935259569694, 'colsample_bytree': 0.9249969670146562, 'learning_rate': 0.005736119288180306, 'boosting_type': 'dart'}. Best is trial 11 with value: 0.7812400203838967.[0m




[32m[I 2022-12-28 14:57:58,880][0m Trial 13 finished with value: 0.7998063529811448 and parameters: {'num_leaves': 86, 'max_depth': 66, 'n_estimators': 14299, 'subsample_for_bin': 365574, 'min_data_in_leaf': 32, 'reg_alpha': 24.88685277750192, 'colsample_bytree': 0.6713127954205013, 'learning_rate': 0.023360346645062598, 'boosting_type': 'goss'}. Best is trial 13 with value: 0.7998063529811448.[0m




[32m[I 2022-12-28 15:00:07,597][0m Trial 14 finished with value: 0.7726700073608516 and parameters: {'num_leaves': 108, 'max_depth': 63, 'n_estimators': 13906, 'subsample_for_bin': 353868, 'min_data_in_leaf': 10, 'reg_alpha': 48.03904012707117, 'colsample_bytree': 0.6663820088091268, 'learning_rate': 0.02844146530352623, 'boosting_type': 'goss'}. Best is trial 13 with value: 0.7998063529811448.[0m




[32m[I 2022-12-28 15:02:41,623][0m Trial 15 finished with value: 0.7862377555064832 and parameters: {'num_leaves': 143, 'max_depth': 61, 'n_estimators': 15280, 'subsample_for_bin': 398277, 'min_data_in_leaf': 82, 'reg_alpha': 30.91448798543, 'colsample_bytree': 0.774572818049516, 'learning_rate': 0.018695718794815804, 'boosting_type': 'goss'}. Best is trial 13 with value: 0.7998063529811448.[0m




[32m[I 2022-12-28 15:04:22,103][0m Trial 16 finished with value: 0.8045662759753128 and parameters: {'num_leaves': 147, 'max_depth': 57, 'n_estimators': 11073, 'subsample_for_bin': 488937, 'min_data_in_leaf': 91, 'reg_alpha': 23.978509061538183, 'colsample_bytree': 0.5886725151170362, 'learning_rate': 0.03200080792487808, 'boosting_type': 'goss'}. Best is trial 16 with value: 0.8045662759753128.[0m




[32m[I 2022-12-28 15:06:20,833][0m Trial 17 finished with value: 0.13591444425570468 and parameters: {'num_leaves': 123, 'max_depth': 79, 'n_estimators': 11277, 'subsample_for_bin': 479905, 'min_data_in_leaf': 92, 'reg_alpha': 1.658602164862998, 'colsample_bytree': 0.5742498084893155, 'learning_rate': 0.6471267655375141, 'boosting_type': 'goss'}. Best is trial 16 with value: 0.8045662759753128.[0m




[32m[I 2022-12-28 15:06:56,870][0m Trial 18 finished with value: 0.5160571315327558 and parameters: {'num_leaves': 124, 'max_depth': 49, 'n_estimators': 6966, 'subsample_for_bin': 499339, 'min_data_in_leaf': 204, 'reg_alpha': 58.77656324352118, 'colsample_bytree': 0.08289038605843829, 'learning_rate': 0.07522253393181473, 'boosting_type': 'gbdt'}. Best is trial 16 with value: 0.8045662759753128.[0m




[32m[I 2022-12-28 15:08:34,924][0m Trial 19 finished with value: 0.7979018741860596 and parameters: {'num_leaves': 150, 'max_depth': 37, 'n_estimators': 10970, 'subsample_for_bin': 372244, 'min_data_in_leaf': 80, 'reg_alpha': 25.29437378449422, 'colsample_bytree': 0.5714174883183664, 'learning_rate': 0.030555080414197758, 'boosting_type': 'goss'}. Best is trial 16 with value: 0.8045662759753128.[0m




[32m[I 2022-12-28 15:09:24,352][0m Trial 20 finished with value: 0.8131345903402979 and parameters: {'num_leaves': 90, 'max_depth': 57, 'n_estimators': 6416, 'subsample_for_bin': 449062, 'min_data_in_leaf': 170, 'reg_alpha': 12.790778368648716, 'colsample_bytree': 0.5660043664435338, 'learning_rate': 0.27645578068240395, 'boosting_type': 'goss'}. Best is trial 20 with value: 0.8131345903402979.[0m




[32m[I 2022-12-28 15:10:06,813][0m Trial 21 finished with value: 0.8155149765018969 and parameters: {'num_leaves': 93, 'max_depth': 57, 'n_estimators': 5183, 'subsample_for_bin': 451907, 'min_data_in_leaf': 164, 'reg_alpha': 11.212768670906843, 'colsample_bytree': 0.5871650483185199, 'learning_rate': 0.19989208497950994, 'boosting_type': 'goss'}. Best is trial 21 with value: 0.8155149765018969.[0m




[32m[I 2022-12-28 15:10:48,396][0m Trial 22 finished with value: 0.8167077175697866 and parameters: {'num_leaves': 102, 'max_depth': 55, 'n_estimators': 5239, 'subsample_for_bin': 451297, 'min_data_in_leaf': 179, 'reg_alpha': 10.166085786278504, 'colsample_bytree': 0.5564115021925584, 'learning_rate': 0.23410649113166168, 'boosting_type': 'goss'}. Best is trial 22 with value: 0.8167077175697866.[0m




[32m[I 2022-12-28 15:11:21,823][0m Trial 23 finished with value: 0.8112335088613328 and parameters: {'num_leaves': 99, 'max_depth': 54, 'n_estimators': 4439, 'subsample_for_bin': 449828, 'min_data_in_leaf': 171, 'reg_alpha': 10.298161147968209, 'colsample_bytree': 0.4732585790389039, 'learning_rate': 0.22318569716533146, 'boosting_type': 'goss'}. Best is trial 22 with value: 0.8167077175697866.[0m




[32m[I 2022-12-28 15:11:32,441][0m Trial 24 finished with value: 0.32983353151010697 and parameters: {'num_leaves': 63, 'max_depth': 40, 'n_estimators': 1674, 'subsample_for_bin': 410277, 'min_data_in_leaf': 282, 'reg_alpha': 5.822028287321087, 'colsample_bytree': 0.24351807574820955, 'learning_rate': 0.9590454530172389, 'boosting_type': 'goss'}. Best is trial 22 with value: 0.8167077175697866.[0m


### RF

In [12]:
def objective_rf(trial):
    
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 5, 25),
        'n_estimators': trial.suggest_int('n_estimators', 15000, 25000),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 5),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 50, 250),
        }
   
    model = RandomForestClassifier(**params, random_state = 22) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_rf, n_trials=25)

[32m[I 2022-12-29 00:44:47,370][0m A new study created in memory with name: no-name-72c7325e-1c5e-4c94-b8ae-e3c303489eb7[0m


### XGB

In [None]:
def objective_xgb(trial):

    param = {
        'silent': 1,
        'objective': 'multi:softmax',
        'num_class': 16,
        'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
        'lambda': trial.suggest_loguniform('lambda', 1e-8, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-8, 1.0)
    }

    if param['booster'] == 'gbtree' or param['booster'] == 'gblinear':
        param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
        param['eta'] = trial.suggest_loguniform('eta', 1e-8, 1.0)
        param['gamma'] = trial.suggest_loguniform('gamma', 1e-8, 1.0)
        param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
    if param['booster'] == 'dart':
        param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
        param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
        param['rate_drop'] = trial.suggest_loguniform('rate_drop', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
  
    model = xgb.XGBClassifier() 
    
    model.set_params(**param)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_xgb, n_trials=25)

### MLP

In [24]:
def objective_mlp(trial):

    params = {
        'activation': trial.suggest_categorical('activation', ['logistic', 'tanh', 'relu']),
        'solver': trial.suggest_categorical('solver', ['lbfgs', 'sgd', 'adam']),
        'hidden_layer_sizes':trial.suggest_int('hidden_layer_sizes', 100, 1500),
        'alpha': trial.suggest_uniform('alpha', 0.001, 0.99),
        'batch_size':trial.suggest_int('batch_size', 150, 300), 
        'learning_rate': trial.suggest_categorical('learning_rate', ['adaptive', 'constant', 'invscaling']),
        'max_iter': 1000
        }
  
    model = MLPClassifier(**params, random_state = 22) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [25]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_mlp, n_trials=25)

[32m[I 2022-12-28 18:31:35,583][0m A new study created in memory with name: no-name-bba9d8bb-d87f-4926-b67d-f6617224f1a7[0m
[32m[I 2022-12-28 18:35:19,671][0m Trial 0 finished with value: 0.7272133514523527 and parameters: {'activation': 'tanh', 'solver': 'lbfgs', 'hidden_layer_sizes': 272, 'alpha': 0.6655292519385918, 'batch_size': 249, 'learning_rate': 'constant'}. Best is trial 0 with value: 0.7272133514523527.[0m
[32m[I 2022-12-28 18:35:49,600][0m Trial 1 finished with value: 0.807188437800804 and parameters: {'activation': 'logistic', 'solver': 'adam', 'hidden_layer_sizes': 166, 'alpha': 0.721368233089467, 'batch_size': 151, 'learning_rate': 'adaptive'}. Best is trial 1 with value: 0.807188437800804.[0m
[32m[I 2022-12-28 18:36:47,642][0m Trial 2 finished with value: 0.30560330672102376 and parameters: {'activation': 'relu', 'solver': 'sgd', 'hidden_layer_sizes': 129, 'alpha': 0.3024295887507661, 'batch_size': 166, 'learning_rate': 'adaptive'}. Best is trial 1 with value

### KNN

In [9]:
def objective_knn(trial):

    params = { 
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'n_neighbors': trial.suggest_int('n_neighbors', 16, 200),
        }

    model = KNeighborsClassifier(**params) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [10]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_knn, n_trials=25)

[32m[I 2022-12-28 22:46:42,479][0m A new study created in memory with name: no-name-1e04c718-ed05-46bf-908e-04be5aec2405[0m
[32m[I 2022-12-28 22:46:43,440][0m Trial 0 finished with value: 0.6146073268784328 and parameters: {'weights': 'uniform', 'n_neighbors': 144}. Best is trial 0 with value: 0.6146073268784328.[0m
[32m[I 2022-12-28 22:46:43,647][0m Trial 1 finished with value: 0.6634077911783024 and parameters: {'weights': 'distance', 'n_neighbors': 120}. Best is trial 1 with value: 0.6634077911783024.[0m
[32m[I 2022-12-28 22:46:43,942][0m Trial 2 finished with value: 0.6003252930185154 and parameters: {'weights': 'uniform', 'n_neighbors': 181}. Best is trial 1 with value: 0.6634077911783024.[0m
[32m[I 2022-12-28 22:46:44,215][0m Trial 3 finished with value: 0.6172263744974804 and parameters: {'weights': 'uniform', 'n_neighbors': 136}. Best is trial 1 with value: 0.6634077911783024.[0m
[32m[I 2022-12-28 22:46:44,511][0m Trial 4 finished with value: 0.6022297718136006

### DT

In [28]:
def objective_dt(trial):
    
    params = {
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'max_depth': trial.suggest_int('max_depth', 5, 100),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 5),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 25),
        'max_leaf_nodes': trial.suggest_int('max_leaf_nodes', 50, 250),
        }
    
    model = DecisionTreeClassifier(**params, random_state = 22) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [29]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_dt, n_trials=25)

[32m[I 2022-12-28 19:46:06,019][0m A new study created in memory with name: no-name-d8bf2aa8-bbb5-4285-8748-b1255aeae45e[0m
[32m[I 2022-12-28 19:46:08,946][0m Trial 0 finished with value: 0.6500787044901195 and parameters: {'criterion': 'gini', 'max_depth': 32, 'min_samples_leaf': 4, 'min_samples_split': 15, 'max_leaf_nodes': 77}. Best is trial 0 with value: 0.6500787044901195.[0m
[32m[I 2022-12-28 19:46:15,053][0m Trial 1 finished with value: 0.6679332993601721 and parameters: {'criterion': 'entropy', 'max_depth': 93, 'min_samples_leaf': 5, 'min_samples_split': 19, 'max_leaf_nodes': 105}. Best is trial 1 with value: 0.6679332993601721.[0m
[32m[I 2022-12-28 19:46:21,249][0m Trial 2 finished with value: 0.6674588075420418 and parameters: {'criterion': 'entropy', 'max_depth': 61, 'min_samples_leaf': 4, 'min_samples_split': 21, 'max_leaf_nodes': 207}. Best is trial 1 with value: 0.6679332993601721.[0m
[32m[I 2022-12-28 19:46:27,408][0m Trial 3 finished with value: 0.66174480

### LR

In [None]:
def objective_lr(trial):

    params = { 
        'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'solver': trial.suggest_categorical('solver', ['newton-cg', 'sag', 'saga', 'lbfgs']),
        'multi_class':'multinomial',
        'max_iter': 5000
        }
    
    model = LogisticRegression(**params, random_state = 22) 
    
    model.set_params(**params)

    return np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy'))

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_lr, n_trials=10)

## END OF HYPER-PARAMETER TUNING (RUN FROM HERE)

Run from below after setting the correct tuned parameters below to train the model

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Hyper-Tuned Parameters

In [10]:
lgb_params = {'num_leaves': 102, 
              'max_depth': 55, 
              'n_estimators': 5239, 
              'subsample_for_bin': 451297, 
              'min_data_in_leaf': 179, 
              'reg_alpha': 10.166085786278504, 
              'colsample_bytree': 0.24351807574820955, 
              'learning_rate': 0.9590454530172389, 
              'boosting_type': 'goss'}

In [11]:
rf_params = {'criterion': 'entropy', 
             'max_depth': 15, 
             'n_estimators': 22984, 
             'min_samples_leaf': 3, 
             'min_samples_split': 9, 
             'max_leaf_nodes': 239, 
             'random_state': 22}

In [12]:
xgb_params = {'booster': 'gbtree', 
              'lambda': 7.201651687969849e-08, 
              'alpha': 2.2495125443474775e-05, 
              'max_depth': 7, 
              'eta': 9.307925211476325e-06, 
              'gamma': 1.7948741419263195e-05, 
              'grow_policy': 'lossguide'}

In [13]:
dt_params = {'criterion': 'gini', 
             'max_depth': 77, 
             'min_samples_leaf': 2, 
             'min_samples_split': 8, 
             'max_leaf_nodes': 229}

In [14]:
mlp_params = {'activation': 'tanh', 
              'solver': 'adam', 
              'hidden_layer_sizes': 1442, 
              'alpha': 0.0027099149530944583, 
              'batch_size': 298, 
              'learning_rate': 'invscaling',
              'max_iter':1000}

In [15]:
knn_params = {'weights': 'distance', 
              'n_neighbors': 16}

In [16]:
lr_params = {'multi_class':'multinomial',
             'class_weight': None, 
             'solver': 'saga', 
             'max_iter':10000}

In [17]:
models = {'dt':DecisionTreeClassifier(**dt_params),
          'rf':RandomForestClassifier(**rf_params), 
          'lgb':lgb.LGBMClassifier(**lgb_params), 
          'xgb':xgb.XGBClassifier(**xgb_params),
          'mlp':MLPClassifier(**mlp_params), 
          'kn':KNeighborsClassifier(**knn_params),  
          'lr':LogisticRegression(**lr_params)
         }

model_abrv = {'dt':'Decision Tree Classifier', 
              'rf':'Random Forest Classifier', 
              'lgb':'LGBM Classifier', 
              'xgb':'XGB Classifier', 
              'mlp':'MLP Classifier',
              'kn':'K-Nearest Neighbors', 
              'lr':'Logistic Regression', 
              'v':'Voting Classifier: MLP, LGB', 
              'v2':'Voting Classifier 2: KNN, XGB, MLP', 
              'v3':'Voting Classifier 3: XGB, MLP, RF, LR', 
              'v4':'Voting Classifier 4: MLP, XGB'
             }

### Voting Classifiers Hyper-Tuned Parameters

In [18]:
v_params = {'estimators':[('mlp', models['mlp']), 
                          ('lgb', models['lgb'])], 
            'voting':'soft'}

In [19]:
v2_params = {'estimators':[('kn', models['kn']), 
                           ('xgb', models['xgb']), 
                           ('mlp', models['mlp'])], 
             'voting':'soft'}

In [20]:
v3_params = {'estimators':[('xgb', models['xgb']),
                           ('mlp', models['mlp']),
                           ('rf', models['rf']), 
                           ('lr', models['lr'])], 
             'voting':'soft'}

In [21]:
v4_params = {'estimators':[('mlp', models['mlp']), 
                          ('xgb', models['xgb'])], 
            'voting':'soft'}

In [22]:
models['v'] = VotingClassifier(**v_params)
models['v2'] = VotingClassifier(**v2_params)
models['v3'] = VotingClassifier(**v3_params)
models['v4'] = VotingClassifier(**v4_params)

### Training and Metrices

In [23]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14, model='clf', save=True):
    """Prints a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a seaborn heatmap. 
    Saves confusion matrix file to jpg file."""
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names, 
    )
    fig, ax = plt.subplots(1, 1, figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, ax=ax, fmt="d", cmap=plt.cm.Oranges)
    except ValueError:
        raise ValueError("Confusion matrix values must be integers.")
        
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    # fix for mpl bug that cuts off top/bottom of seaborn viz
    b, t = plt.ylim() 
    b += 0.5 
    t -= 0.5 
    plt.ylim(b, t) 
    if save == True:
        plt.savefig('tuned_' + model_abrv[model] + '_confusion_matrix.png')
    plt.show()

In [24]:
def model(clf, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, models=models, save=False, print_stat=True, inc_train=False, cv=False):
    """Trains models and outputs score metrics. Takes an identifier, list of models, and split dataset as inputs and has options for saving model, 
    printing confusion matrix and classification report and getting cross-validated 5 fold accuracy."""
    clf_model = models[clf]
    clf_model.fit(X_train, y_train)
    y_pred = clf_model.predict(X_test)
    if print_stat == True:
        clf_report = pd.DataFrame(classification_report(y_test,y_pred, output_dict=True)).T
        clf_report.to_csv('tuned_' + model_abrv[clf] + '_classification_report.csv')
        print(model_abrv[clf])
        print('\nTest Stats\n', classification_report(y_test,y_pred))
        print_confusion_matrix(confusion_matrix(y_test, y_pred), unique_labels(y_test, y_pred), model=clf)
        if inc_train == True:
            print(model_abrv[clf])
            print('\nTrain Stats\n', classification_report(y_train,clf_model.predict(X_train)))
            print_confusion_matrix(confusion_matrix(y_train, clf_model.predict(X_train)), unique_labels(y_test, y_pred), model=clf)
    if cv == True:
        print(model_abrv[clf] + ' CV Accuracy:',  
              np.mean(cross_val_score(clf_model, X_train, y_train, cv=5, scoring='accuracy')))
    if save == True:
        return clf_model

In [None]:
# Training and 5-Fold Cross Validation

for key in models.keys():
    model(key, cv=True, print_stat=False)

Decision Tree Classifier CV Accuracy: 0.6765047279316007


In [None]:
# Model Types

print(models.keys())

In [None]:
# Results and Metrices

for key in models.keys():
    model(key, inc_train=True)