# Unballanced dataset model implementation

    
### Boosting alghoritms: 
    XGBoost
    AdaBoost
    Gradient Boosting Machine
    Optimized light gradient boosting 
    CatBoost

## Unballanced dataset Models

In [1]:
# Importing necessary libraries

import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option('float_format', '{:f}'.format)

import matplotlib.pyplot as plt
import matplotlib.colors as colors  
import matplotlib.patches as mpatches
%matplotlib inline
import seaborn as sns

import time

import warnings
warnings.filterwarnings('ignore')


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import collections
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, recall_score, precision_score

In [2]:
df = pd.read_csv('datasets/data3_synthetic.csv')

In [3]:
df.head()

Unnamed: 0,Class,V1,V2,V3,V4,V5
0,0,0.223811,-0.997202,-0.760048,-0.368115,-0.98913
1,0,0.659099,-0.997189,0.97692,-0.316844,-0.98913
2,0,1.811941,-0.997146,-0.996419,-0.726818,-0.98913
3,0,0.354844,-0.997134,0.645444,0.74965,-0.98913
4,0,0.113514,-0.997126,-0.628333,0.983777,-0.98913


In [5]:
# Spliting Training and testing data
X = df.drop('Class', axis=1)
y = df['Class']


In [6]:
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

In [7]:
for train_index, test_index in sss.split(X, y):
    print("Train:", train_index, "Test:", test_index)
    original_Xtrain, original_Xtest = X.iloc[train_index], X.iloc[test_index]
    original_ytrain, original_ytest = y.iloc[train_index], y.iloc[test_index]

# Check the Distribution of the labels
original_Xtrain = original_Xtrain.values
original_Xtest = original_Xtest.values
original_ytrain = original_ytrain.values
original_ytest = original_ytest.values

# Check to see if the distribution of the test and train labels is similar.
train_unique_label, train_counts_label = np.unique(original_ytrain, return_counts=True)
test_unique_label, test_counts_label = np.unique(original_ytest, return_counts=True)
print('-' * 100)

print('Label Distributions: \n')
print(train_counts_label/ len(original_ytrain))
print(test_counts_label/ len(original_ytest))

Train: [ 350197  350198  350199 ... 1754152 1754153 1754154] Test: [     0      1      2 ... 424727 425176 425184]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [350197 350198 350199 ... 768566 768653 768685]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [ 701101  701102  701103 ... 1095208 1095343 1095476]
Train: [      0       1       2 ... 1754152 1754153 1754154] Test: [1052163 1052164 1052165 ... 1424801 1424859 1425018]
Train: [      0       1       2 ... 1424801 1424859 1425018] Test: [1403110 1403111 1403112 ... 1754152 1754153 1754154]
----------------------------------------------------------------------------------------------------
Label Distributions: 

[0.9916313 0.0083687]
[0.99162845 0.00837155]


In [8]:
# setting up testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

## XGBoost

by including tweaking for max_depth and n_estimators alongside learning_rate, we'll set up a nested loop to iterate over combinations of these parameters. 

In [9]:
import time
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define ranges for the parameters to test
learning_rates = [0.01, 0.1, 0.3]
max_depths = [3, 6, 9]
n_estimators = [20, 50, 100, 200, 500, 1000]

# Initialize a list to store the results
results = []

# Loop over the combinations of learning_rates, max_depths, and n_estimators
for lr in learning_rates:
    for depth in max_depths:
        for n_est in n_estimators:
            start_time = time.time()  # Start timing
            
            # Instantiate and fit the XGBoost classifier
            xgb_classifier = xgb.XGBClassifier(learning_rate=lr, max_depth=depth, n_estimators=n_est,
                                               use_label_encoder=False, eval_metric='logloss')
            xgb_classifier.fit(X_train, y_train)
            
            # Predict and evaluate on training data
            y_train_pred = xgb_classifier.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            train_recall = recall_score(y_train, y_train_pred)
            train_precision = precision_score(y_train, y_train_pred)
            
            # Predict and evaluate on testing data
            y_test_pred = xgb_classifier.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_test_pred)
            test_f1 = f1_score(y_test, y_test_pred)
            test_recall = recall_score(y_test, y_test_pred)
            test_precision = precision_score(y_test, y_test_pred)
            
            # Calculate elapsed time
            end_time = time.time()
            elapsed_time = end_time - start_time
            
            # Append training and testing results separately, including parameter values and execution time
            params = f'LR: {lr}, Depth: {depth}, Estimators: {n_est}'
            
            print(params, elapsed_time)
            
            results.append({
                'Parameters': params,
                'Data Split': 'Training',
                'Accuracy': train_accuracy,
                'F1 Score': train_f1,
                'Recall': train_recall,
                'Precision': train_precision,
                'Execution Time (s)': elapsed_time
            })
            results.append({
                'Parameters': params,
                'Data Split': 'Testing',
                'Accuracy': test_accuracy,
                'F1 Score': test_f1,
                'Recall': test_recall,
                'Precision': test_precision,
                'Execution Time (s)': elapsed_time
            })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

LR: 0.01, Depth: 3, Estimators: 20 4.003005027770996
LR: 0.01, Depth: 3, Estimators: 50 5.163248062133789
LR: 0.01, Depth: 3, Estimators: 100 5.816546440124512
LR: 0.01, Depth: 3, Estimators: 200 8.84053087234497
LR: 0.01, Depth: 3, Estimators: 500 18.153380632400513
LR: 0.01, Depth: 3, Estimators: 1000 34.31721520423889
LR: 0.01, Depth: 6, Estimators: 20 3.9422965049743652
LR: 0.01, Depth: 6, Estimators: 50 5.948397636413574
LR: 0.01, Depth: 6, Estimators: 100 7.727770566940308
LR: 0.01, Depth: 6, Estimators: 200 12.981003999710083
LR: 0.01, Depth: 6, Estimators: 500 31.947895765304565
LR: 0.01, Depth: 6, Estimators: 1000 81.12107515335083
LR: 0.01, Depth: 9, Estimators: 20 4.118523836135864
LR: 0.01, Depth: 9, Estimators: 50 7.19537091255188
LR: 0.01, Depth: 9, Estimators: 100 10.883767366409302
LR: 0.01, Depth: 9, Estimators: 200 21.810601234436035
LR: 0.01, Depth: 9, Estimators: 500 45.25473976135254
LR: 0.01, Depth: 9, Estimators: 1000 85.99015927314758
LR: 0.1, Depth: 3, Estimato

In [10]:
results_df.to_excel('data3_results/XGBoost_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,"LR: 0.01, Depth: 3, Estimators: 20",Training,0.991569,0.0,0.0,0.0,4.003005
1,"LR: 0.01, Depth: 3, Estimators: 20",Testing,0.991876,0.0,0.0,0.0,4.003005
2,"LR: 0.01, Depth: 3, Estimators: 50",Training,0.991569,0.0,0.0,0.0,5.163248
3,"LR: 0.01, Depth: 3, Estimators: 50",Testing,0.991876,0.0,0.0,0.0,5.163248
4,"LR: 0.01, Depth: 3, Estimators: 100",Training,0.991569,0.0,0.0,0.0,5.816546


## AdaBoost

To experiment with the AdaBoost algorithm and tweak its parameters, we can adjust the number of estimators (n_estimators) and the learning rate (learning_rate). These are two key parameters for AdaBoost that influence the performance and complexity of the resulting model. The n_estimators parameter controls the number of sequential models to be added to correct the errors of the previous models, while learning_rate scales the contribution of each model.

In [11]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define ranges for the parameters to test
n_estimators_list = [10, 25, 50, 100] #, 500, 1000, 5000] #Too slow to run all. At 1000 took 4 hours and crashed.
learning_rates_list = [0.01, 0.1, 0.5, 1] #, 2]

# Initialize a list to store the results
results = []

# Loop over the combinations of n_estimators and learning_rates
for n_estimators in n_estimators_list:
    for learning_rate in learning_rates_list:
        start_time = time.time()  # Start timing
        
        # Instantiate and fit the AdaBoost classifier
        ada_classifier = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
        ada_classifier.fit(X_train, y_train)
        
        # Predict and evaluate on training data
        y_train_pred = ada_classifier.predict(X_train)
        train_accuracy = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)
        train_recall = recall_score(y_train, y_train_pred)
        train_precision = precision_score(y_train, y_train_pred)
        
        # Predict and evaluate on testing data
        y_test_pred = ada_classifier.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        test_f1 = f1_score(y_test, y_test_pred)
        test_recall = recall_score(y_test, y_test_pred)
        test_precision = precision_score(y_test, y_test_pred)
        
        # Calculate elapsed time
        end_time = time.time()
        elapsed_time = end_time - start_time
        
        # Append training and testing results separately, including parameter values and execution time
        params = f'Estimators: {n_estimators}, LR: {learning_rate}'
        print(params, elapsed_time)
        results.append({
            'Parameters': params,
            'Data Split': 'Training',
            'Accuracy': train_accuracy,
            'F1 Score': train_f1,
            'Recall': train_recall,
            'Precision': train_precision,
            'Execution Time (s)': elapsed_time
        })
        results.append({
            'Parameters': params,
            'Data Split': 'Testing',
            'Accuracy': test_accuracy,
            'F1 Score': test_f1,
            'Recall': test_recall,
            'Precision': test_precision,
            'Execution Time (s)': elapsed_time
        })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)

Estimators: 10, LR: 0.01 26.956761121749878
Estimators: 10, LR: 0.1 24.028055906295776
Estimators: 10, LR: 0.5 24.49956178665161
Estimators: 10, LR: 1 24.55051279067993
Estimators: 25, LR: 0.01 54.94986534118652
Estimators: 25, LR: 0.1 54.04945230484009
Estimators: 25, LR: 0.5 56.36599326133728
Estimators: 25, LR: 1 56.06926774978638
Estimators: 50, LR: 0.01 109.1175627708435
Estimators: 50, LR: 0.1 112.29208755493164
Estimators: 50, LR: 0.5 112.23736095428467
Estimators: 50, LR: 1 112.10819983482361
Estimators: 100, LR: 0.01 216.1755862236023
Estimators: 100, LR: 0.1 216.39499497413635
Estimators: 100, LR: 0.5 219.55863070487976
Estimators: 100, LR: 1 224.14096212387085


In [12]:
results_df.to_excel('data3_results/AdaBoost_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,"Estimators: 10, LR: 0.01",Training,0.993435,0.362378,0.221283,1.0,26.956761
1,"Estimators: 10, LR: 0.01",Testing,0.993704,0.36723,0.224912,1.0,26.956761
2,"Estimators: 10, LR: 0.1",Training,0.993435,0.362378,0.221283,1.0,24.028056
3,"Estimators: 10, LR: 0.1",Testing,0.993704,0.36723,0.224912,1.0,24.028056
4,"Estimators: 10, LR: 0.5",Training,0.993435,0.362378,0.221283,1.0,24.499562


## AdaBoost with Decision Tree clacifier

AdaBoost, as implemented in scikit-learn (AdaBoostClassifier), doesn't directly accept a max_depth parameter because AdaBoost can be used with various types of base estimators, not just decision trees. This is a frequent way of using AdaBoost.

In [13]:
import time
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define ranges for the parameters to test
n_estimators_list = [10, 25, 50] #, 500, 1000, 5000]
learning_rates_list = [0.01, 0.1, 0.3]
max_depths = [3, 5, 7]

# Initialize a list to store the results
results = []

# Loop over the combinations of n_estimators, learning_rates, and max_depths
for n_estimators in n_estimators_list:
    for learning_rate in learning_rates_list:
        for max_depth in max_depths:
            start_time = time.time()  # Start timing
            
            # Create a DecisionTreeClassifier with the specified max_depth
            base_estimator = DecisionTreeClassifier(max_depth=max_depth)
            
            # Instantiate and fit the AdaBoost classifier with the DecisionTree base estimator
            ada_classifier = AdaBoostClassifier(
                base_estimator=base_estimator,
                n_estimators=n_estimators,
                learning_rate=learning_rate
            )
            ada_classifier.fit(X_train, y_train)
            
            # Predict and evaluate on training data
            y_train_pred = ada_classifier.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            train_recall = recall_score(y_train, y_train_pred)
            train_precision = precision_score(y_train, y_train_pred)
            
            # Predict and evaluate on testing data
            y_test_pred = ada_classifier.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_test_pred)
            test_f1 = f1_score(y_test, y_test_pred)
            test_recall = recall_score(y_test, y_test_pred)
            test_precision = precision_score(y_test, y_test_pred)
            
            # Calculate elapsed time
            end_time = time.time()
            elapsed_time = end_time - start_time
            
            # Append training and testing results separately, including parameter values and execution time
            params = f'Estimators: {n_estimators}, LR: {learning_rate}, Depth: {max_depth}'
            print(params, elapsed_time)
            results.append({
                'Parameters': params,
                'Data Split': 'Training',
                'Accuracy': train_accuracy,
                'F1 Score': train_f1,
                'Recall': train_recall,
                'Precision': train_precision,
                'Execution Time (s)': elapsed_time
            })
            results.append({
                'Parameters': params,
                'Data Split': 'Testing',
                'Accuracy': test_accuracy,
                'F1 Score': test_f1,
                'Recall': test_recall,
                'Precision': test_precision,
                'Execution Time (s)': elapsed_time
            })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)


Estimators: 10, LR: 0.01, Depth: 3 57.369370460510254
Estimators: 10, LR: 0.01, Depth: 5 88.89150857925415
Estimators: 10, LR: 0.01, Depth: 7 121.49059724807739
Estimators: 10, LR: 0.1, Depth: 3 55.72072649002075
Estimators: 10, LR: 0.1, Depth: 5 87.91244196891785
Estimators: 10, LR: 0.1, Depth: 7 119.1020143032074
Estimators: 10, LR: 0.3, Depth: 3 57.55844783782959
Estimators: 10, LR: 0.3, Depth: 5 86.01259350776672
Estimators: 10, LR: 0.3, Depth: 7 123.87506103515625
Estimators: 25, LR: 0.01, Depth: 3 140.37486243247986
Estimators: 25, LR: 0.01, Depth: 5 216.529625415802
Estimators: 25, LR: 0.01, Depth: 7 291.60878443717957
Estimators: 25, LR: 0.1, Depth: 3 139.96478080749512
Estimators: 25, LR: 0.1, Depth: 5 222.31930565834045
Estimators: 25, LR: 0.1, Depth: 7 300.19666814804077
Estimators: 25, LR: 0.3, Depth: 3 140.36984634399414
Estimators: 25, LR: 0.3, Depth: 5 220.21723890304565
Estimators: 25, LR: 0.3, Depth: 7 298.06032252311707
Estimators: 50, LR: 0.01, Depth: 3 268.927777767

In [14]:
results_df.to_excel('data3_results/AdaboostDT_results.xlsx', index=False)

In [None]:
### Was run in different instance on another similar computer.

## Gradient Boosting Machine

To perform a similar parameter tweaking experiment with a Gradient Boosting Machine (GBM) using scikit-learn's GradientBoostingClassifier, we can iterate over combinations of n_estimators, learning_rate, and max_depth. These parameters are analogous to those adjusted in the AdaBoost and XGBoost, serving similar purposes:

n_estimators: controls the number of sequential trees to be modeled.
learning_rate: scales the contribution of each tree.
max_depth: sets the maximum depth of each tree.

In [15]:
import time
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Define ranges for the parameters to test
n_estimators_list = [10, 50, 100]
learning_rates_list = [0.01, 0.1, 0.3]
max_depths = [3, 6, 9]

# Initialize a list to store the results
results = []

# Loop over the combinations of n_estimators, learning_rates, and max_depths
for n_estimators in n_estimators_list:
    for learning_rate in learning_rates_list:
        for max_depth in max_depths:
            start_time = time.time()  # Start timing
            
            # Instantiate and fit the GradientBoostingClassifier
            gbm_classifier = GradientBoostingClassifier(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                max_depth=max_depth
            )
            gbm_classifier.fit(X_train, y_train)
            
            # Predict and evaluate on training data
            y_train_pred = gbm_classifier.predict(X_train)
            train_accuracy = accuracy_score(y_train, y_train_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            train_recall = recall_score(y_train, y_train_pred)
            train_precision = precision_score(y_train, y_train_pred)
            
            # Predict and evaluate on testing data
            y_test_pred = gbm_classifier.predict(X_test)
            test_accuracy = accuracy_score(y_test, y_test_pred)
            test_f1 = f1_score(y_test, y_test_pred)
            test_recall = recall_score(y_test, y_test_pred)
            test_precision = precision_score(y_test, y_test_pred)
            
            # Calculate elapsed time
            end_time = time.time()
            elapsed_time = end_time - start_time
            
            # Append training and testing results separately, including parameter values and execution time
            params = f'Estimators: {n_estimators}, LR: {learning_rate}, Depth: {max_depth}'
            print(params, elapsed_time)
            results.append({
                'Parameters': params,
                'Data Split': 'Training',
                'Accuracy': train_accuracy,
                'F1 Score': train_f1,
                'Recall': train_recall,
                'Precision': train_precision,
                'Execution Time (s)': elapsed_time
            })
            results.append({
                'Parameters': params,
                'Data Split': 'Testing',
                'Accuracy': test_accuracy,
                'F1 Score': test_f1,
                'Recall': test_recall,
                'Precision': test_precision,
                'Execution Time (s)': elapsed_time
            })

# Convert the list to a DataFrame
results_df = pd.DataFrame(results)



Estimators: 10, LR: 0.01, Depth: 3 50.11316227912903
Estimators: 10, LR: 0.01, Depth: 6 96.05612015724182
Estimators: 10, LR: 0.01, Depth: 9 146.7016384601593
Estimators: 10, LR: 0.1, Depth: 3 51.65701341629028
Estimators: 10, LR: 0.1, Depth: 6 100.11393427848816
Estimators: 10, LR: 0.1, Depth: 9 150.24624872207642
Estimators: 10, LR: 0.3, Depth: 3 50.959155321121216
Estimators: 10, LR: 0.3, Depth: 6 101.6550235748291
Estimators: 10, LR: 0.3, Depth: 9 150.32666325569153
Estimators: 50, LR: 0.01, Depth: 3 261.01782274246216
Estimators: 50, LR: 0.01, Depth: 6 522.9441232681274
Estimators: 50, LR: 0.01, Depth: 9 778.2255091667175
Estimators: 50, LR: 0.1, Depth: 3 258.34352827072144
Estimators: 50, LR: 0.1, Depth: 6 497.89626002311707
Estimators: 50, LR: 0.1, Depth: 9 746.8027191162109
Estimators: 50, LR: 0.3, Depth: 3 250.33263516426086
Estimators: 50, LR: 0.3, Depth: 6 507.9370141029358
Estimators: 50, LR: 0.3, Depth: 9 909.3066816329956
Estimators: 100, LR: 0.01, Depth: 3 596.6135818958

In [16]:
results_df.to_excel('data3_results/GBM_results.xlsx', index=False)
results_df.head()

Unnamed: 0,Parameters,Data Split,Accuracy,F1 Score,Recall,Precision,Execution Time (s)
0,"Estimators: 10, LR: 0.01, Depth: 3",Training,0.991569,0.0,0.0,0.0,50.113162
1,"Estimators: 10, LR: 0.01, Depth: 3",Testing,0.991876,0.0,0.0,0.0,50.113162
2,"Estimators: 10, LR: 0.01, Depth: 6",Training,0.991569,0.0,0.0,0.0,96.05612
3,"Estimators: 10, LR: 0.01, Depth: 6",Testing,0.991876,0.0,0.0,0.0,96.05612
4,"Estimators: 10, LR: 0.01, Depth: 9",Training,0.991569,0.0,0.0,0.0,146.701638
