In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, log_loss, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from scipy.stats import mode
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import ElasticNet
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
import itertools
from scipy.optimize import minimize

## Loading the dataset

In [29]:
df = pd.read_csv('pd_EEG_features.csv')
df.head()

Unnamed: 0,id,gender,PPE,EEG,RPDE,numPulses,numPeriodsPulses,meanPeriodPulses,stdDevPeriodPulses,locPctJitter,...,tqwt_kurtosisValue_dec_28,tqwt_kurtosisValue_dec_29,tqwt_kurtosisValue_dec_30,tqwt_kurtosisValue_dec_31,tqwt_kurtosisValue_dec_32,tqwt_kurtosisValue_dec_33,tqwt_kurtosisValue_dec_34,tqwt_kurtosisValue_dec_35,tqwt_kurtosisValue_dec_36,class
0,0,1,0.85247,0.71826,0.57227,240,239,0.008064,8.7e-05,0.00218,...,1.562,2.6445,3.8686,4.2105,5.1221,4.4625,2.6202,3.0004,18.9405,1
1,0,1,0.76686,0.69481,0.53966,234,233,0.008258,7.3e-05,0.00195,...,1.5589,3.6107,23.5155,14.1962,11.0261,9.5082,6.5245,6.3431,45.178,1
2,0,1,0.85083,0.67604,0.58982,232,231,0.00834,6e-05,0.00176,...,1.5643,2.3308,9.4959,10.7458,11.0177,4.8066,2.9199,3.1495,4.7666,1
3,1,0,0.41121,0.79672,0.59257,178,177,0.010858,0.000183,0.00419,...,3.7805,3.5664,5.2558,14.0403,4.2235,4.6857,4.846,6.265,4.0603,1
4,1,0,0.3279,0.79782,0.53028,236,235,0.008162,0.002669,0.00535,...,6.1727,5.8416,6.0805,5.7621,7.7817,11.6891,8.2103,5.0559,6.1164,1


In [30]:
X = df.drop(columns=['id','class']) 
y = df['class']  

# IMBALANCED DATA

In [127]:
count0=0
count1=0
for i in y:
    if i==0:
        count0+=1
    else:
        count1+=1

print(count0,count1)

192 564


In [31]:
#splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### XGBoost

In [129]:
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

In [130]:
xg_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - xg_accuracy
y_pred_proba = xgb.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",xg_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.881578947368421
Precision: 0.8870967741935484
Recall: 0.9649122807017544
F1 Score: 0.9243697478991597
Error Rate: 0.11842105263157898
Binary Cross-Entropy Loss: 0.2929799327922923
Confusion Matrix:
[[ 24  14]
 [  4 110]]


### AdaBoost Classifier

In [131]:
ada = AdaBoostClassifier(n_estimators=100, random_state=42)
ada.fit(X_train, y_train)
y_pred = ada.predict(X_test)




In [132]:
AB_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - AB_accuracy
y_pred_proba = ada.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",AB_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8947368421052632
Precision: 0.9016393442622951
Recall: 0.9649122807017544
F1 Score: 0.9322033898305084
Error Rate: 0.10526315789473684
Binary Cross-Entropy Loss: 0.6436058135483399
Confusion Matrix:
[[ 26  12]
 [  4 110]]


### CatBoost

In [133]:
catboost_classifier = CatBoostClassifier(n_estimators=100, random_state=42)
catboost_classifier.fit(X_train, y_train)
y_pred = catboost_classifier.predict(X_test)

Learning rate set to 0.068618
0:	learn: 0.6522192	total: 52ms	remaining: 5.14s
1:	learn: 0.6086152	total: 102ms	remaining: 5.02s
2:	learn: 0.5609546	total: 156ms	remaining: 5.05s
3:	learn: 0.5278116	total: 210ms	remaining: 5.05s
4:	learn: 0.5010378	total: 264ms	remaining: 5.01s
5:	learn: 0.4685901	total: 317ms	remaining: 4.96s
6:	learn: 0.4402476	total: 369ms	remaining: 4.91s
7:	learn: 0.4184514	total: 421ms	remaining: 4.84s
8:	learn: 0.4002694	total: 473ms	remaining: 4.79s
9:	learn: 0.3845739	total: 525ms	remaining: 4.72s
10:	learn: 0.3728740	total: 575ms	remaining: 4.66s
11:	learn: 0.3593275	total: 628ms	remaining: 4.61s
12:	learn: 0.3459569	total: 681ms	remaining: 4.56s
13:	learn: 0.3349738	total: 737ms	remaining: 4.53s
14:	learn: 0.3233329	total: 795ms	remaining: 4.51s
15:	learn: 0.3130873	total: 851ms	remaining: 4.47s
16:	learn: 0.3035014	total: 904ms	remaining: 4.42s
17:	learn: 0.2942774	total: 957ms	remaining: 4.36s
18:	learn: 0.2842905	total: 1.01s	remaining: 4.31s
19:	learn: 0

In [134]:
catboost_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - catboost_accuracy
y_pred_proba = catboost_classifier.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",catboost_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8618421052631579
Precision: 0.8661417322834646
Recall: 0.9649122807017544
F1 Score: 0.9128630705394191
Error Rate: 0.13815789473684215
Binary Cross-Entropy Loss: 0.3079936360843641
Confusion Matrix:
[[ 21  17]
 [  4 110]]


### GBM

In [135]:
gbm = GradientBoostingClassifier(
    n_estimators=100,      
    learning_rate=0.05,
    max_depth=5,          
    max_features=0.9,
    random_state=42
)
gbm.fit(X_train, y_train)

y_pred_prob = gbm.predict_proba(X_test)[:, 1]
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred_prob]


In [136]:
gbm_accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - gbm_accuracy
y_pred_proba = gbm.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",gbm_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8486842105263158
Precision: 0.8661417322834646
Recall: 0.9649122807017544
F1 Score: 0.9128630705394191
Error Rate: 0.15131578947368418
Binary Cross-Entropy Loss: 0.37246624664823264
Confusion Matrix:
[[ 21  17]
 [  4 110]]


### LightGBM

In [33]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
lgbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]


[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.015271 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295


In [34]:
lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
error_rate = 1 - lgbm_accuracy
y_pred_proba = 1 / (1 + np.exp(-y_pred))
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred_binary)

print("Accuracy:",lgbm_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8947368421052632
Precision: 0.8828125
Recall: 0.9912280701754386
F1 Score: 0.9338842975206612
Error Rate: 0.10526315789473684
Binary Cross-Entropy Loss: 0.48226933004165345
Confusion Matrix:
[[ 23  15]
 [  1 113]]


### Ridge

In [139]:
ridge_classifier = RidgeClassifier(
    alpha=1.0,  
    random_state=42
)
ridge_classifier.fit(X_train, y_train)
y_pred = ridge_classifier.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)



In [140]:
ridge_accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - ridge_accuracy
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",ridge_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8355263157894737
Precision: 0.8677685950413223
Recall: 0.9210526315789473
F1 Score: 0.8936170212765957
Error Rate: 0.16447368421052633
Confusion Matrix:
[[ 22  16]
 [  9 105]]


### ElasticNet 

In [141]:
elastic_net_classifier = ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
elastic_net_classifier.fit(X_train, y_train)
y_pred = elastic_net_classifier.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)


  model = cd_fast.enet_coordinate_descent(


In [142]:
elastic_net_accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)
error_rate = 1 - elastic_net_accuracy
conf_matrix = confusion_matrix(y_test, y_pred_binary)

print("Accuracy:",elastic_net_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.868421052631579
Precision: 0.8790322580645161
Recall: 0.956140350877193
F1 Score: 0.9159663865546218
Error Rate: 0.13157894736842102
Confusion Matrix:
[[ 23  15]
 [  5 109]]


### RandomForest

In [143]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)


In [144]:
rf_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - rf_accuracy
y_pred_proba = rf_classifier.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",rf_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8881578947368421
Precision: 0.888
Recall: 0.9736842105263158
F1 Score: 0.9288702928870293
Error Rate: 0.11184210526315785
Binary Cross-Entropy Loss: 0.3338101813352485
Confusion Matrix:
[[ 24  14]
 [  3 111]]


### ExtraTrees

In [145]:
et_classifier = ExtraTreesClassifier(n_estimators=100, random_state=42)
et_classifier.fit(X_train, y_train)
y_pred = et_classifier.predict(X_test)

In [146]:
et_accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - et_accuracy
y_pred_proba = et_classifier.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",et_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.881578947368421
Precision: 0.8870967741935484
Recall: 0.9649122807017544
F1 Score: 0.9243697478991597
Error Rate: 0.11842105263157898
Binary Cross-Entropy Loss: 0.31982136505076286
Confusion Matrix:
[[ 24  14]
 [  4 110]]


### GMM

In [147]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
gmm = GaussianMixture(n_components=2, random_state=42)
gmm.fit(X_scaled)
cluster_labels = gmm.predict(X_scaled)

def map_cluster_labels(true_labels, cluster_labels):
    new_labels = np.zeros_like(cluster_labels)
    for i in range(np.max(cluster_labels) + 1):
        mask = (cluster_labels == i)
        new_labels[mask] = mode(true_labels[mask])[0]
    return new_labels

true_labels = y  
mapped_labels = map_cluster_labels(true_labels, cluster_labels)


In [148]:
gmm_accuracy = accuracy_score(true_labels, mapped_labels)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
error_rate = 1 - gmm_accuracy
y_pred_proba = gmm.predict_proba(X_test)[:, 1]
loss = log_loss(y_test, y_pred_proba)
conf_matrix = confusion_matrix(y_test, y_pred)

print("Accuracy:",gmm_accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Error Rate:", error_rate)
print("Binary Cross-Entropy Loss:", loss)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.746031746031746
Precision: 0.8870967741935484
Recall: 0.9649122807017544
F1 Score: 0.9243697478991597
Error Rate: 0.25396825396825395
Binary Cross-Entropy Loss: 9.010913347279288
Confusion Matrix:
[[ 24  14]
 [  4 110]]




### Accuracy on UNBALANCED data

In [149]:
print(f'XGBoost Accuracy: {xg_accuracy}')
print(f'AdaBoost Accuracy: {AB_accuracy}')
print(f"CatBoost Accuracy:{catboost_accuracy}")
print(f'GBM Accuracy: {gbm_accuracy}')
print(f'LightGBM Accuracy: {lgbm_accuracy}')
print(f'Ridge Accuracy: {ridge_accuracy}')
print(f'ElasticNet Accuracy: {elastic_net_accuracy}')
print(f'RandomForest Accuracy: {rf_accuracy}')
print(f'ExtraTrees Accuracy: {et_accuracy}')
print(f'GMM Accuracy: {gmm_accuracy}')

XGBoost Accuracy: 0.881578947368421
AdaBoost Accuracy: 0.8947368421052632
CatBoost Accuracy:0.8618421052631579
GBM Accuracy: 0.8486842105263158
LightGBM Accuracy: 0.8947368421052632
Ridge Accuracy: 0.8355263157894737
ElasticNet Accuracy: 0.868421052631579
RandomForest Accuracy: 0.8881578947368421
ExtraTrees Accuracy: 0.881578947368421
GMM Accuracy: 0.746031746031746


### Enhancing LightGBM

In [35]:
# existing model
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}
lgbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]
lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:",lgbm_accuracy)


[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295
Accuracy: 0.8947368421052632


#### Hyperparameter tuning

In [9]:
num_leaves_values = [20, 50, 100, 150, 200]
learning_rate_values = [0.01, 0.05, 0.1, 0.2]
feature_fraction_values = [0.5, 0.7, 0.8, 0.9, 1.0]

best_accuracy = 0.0
best_params = None

for num_leaves, learning_rate, feature_fraction in itertools.product(num_leaves_values, learning_rate_values, feature_fraction_values):
    params = {
        'objective': 'binary',
        'metric': 'binary_error',
        'boosting_type': 'gbdt',
        'num_leaves': num_leaves,
        'learning_rate': learning_rate,
        'feature_fraction': feature_fraction
    }
    
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
    lgbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
    
    y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
    y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]
    accuracy = accuracy_score(y_test, y_pred_binary)
    
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = params

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)


[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008903 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295
[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.013574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295
[LightGBM] [Info] Numb

In [36]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}
lgbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)
y_pred = lgbm.predict(X_test, num_iteration=lgbm.best_iteration)
y_pred_binary = [1 if x > 0.5 else 0 for x in y_pred]
better_lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy:",better_lgbm_accuracy)


[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008840 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295
Accuracy: 0.9210526315789473


#### Optimising Threshold

In [37]:


params = {
    'objective': 'binary',
    'metric': 'binary_error',
    'boosting_type': 'gbdt',
    'num_leaves': 20,
    'learning_rate': 0.1,
    'feature_fraction': 0.5
}
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)
lgbm = lgb.train(params, lgb_train, num_boost_round=100, valid_sets=lgb_eval)

def objective_function(threshold):
    y_pred = lgbm.predict(X_test)
    y_pred_binary = (y_pred > threshold).astype(int)
    accuracy = accuracy_score(y_test, y_pred_binary)
    return -accuracy 

initial_guess = 0.6  
result = minimize(objective_function, initial_guess, method='Nelder-Mead')

best_threshold = result.x[0]
print("Best Threshold:", best_threshold)

y_pred = lgbm.predict(X_test)
y_pred_binary = (y_pred > best_threshold).astype(int)
best_lgbm_accuracy = accuracy_score(y_test, y_pred_binary)
print("Accuracy with Best Threshold:", best_lgbm_accuracy)


[LightGBM] [Info] Number of positive: 450, number of negative: 154
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008752 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 150492
[LightGBM] [Info] Number of data points in the train set: 604, number of used features: 753
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.745033 -> initscore=1.072295
[LightGBM] [Info] Start training from score 1.072295
Best Threshold: 0.6
Accuracy with Best Threshold: 0.9276315789473685


### Proposed Model Accuracy

In [42]:
print("LightGBM Accuracy:",lgbm_accuracy*100,sep='\n')
print("\nLightGBM Accuracy after Hyperparameter Tuning:",better_lgbm_accuracy*100,sep='\n')
print("\nLightGBM Accuracy after Optimising threshold:",best_lgbm_accuracy*100,sep='\n')

LightGBM Accuracy:
89.47368421052632

LightGBM Accuracy after Hyperparameter Tuning:
92.10526315789474

LightGBM Accuracy after Optimising threshold:
92.76315789473685
