In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%pip install lightgbm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Notebook-wide settings.
# NOTE(review): this seed is not passed to splitting_data() by the cells below,
# so they run with that function's own default (123) - confirm which is intended.
random_state = 42
# Filled by best_model(): maps a model name to its dict of test-set metrics.
best_models = dict()

In [3]:
# Load the dataset and discard the 'drink' column before modelling.
df = pd.read_csv('../Data/Final_skin_cancer.csv').drop('drink', axis=1)

# New Section

In [4]:
from sklearn.model_selection import train_test_split

def splitting_data(df, sampling, test_size=0.2, random_state=123, stratify=False):
    """Split ``df`` into train/test sets and optionally rebalance the training set.

    The ``'diagnostic'`` column is the target; every other column is a feature.
    Resampling is applied to the training portion only, so the test portion is
    always returned exactly as produced by the split.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'diagnostic'`` column.
    sampling : str
        One of ``'none'``, ``'SMOTEENN'``, ``'SMOTE'``, ``'under'``
        (``RandomUnderSampler``) or ``'over'`` (``RandomOverSampler``).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Seed used for both the split and the sampler.
    stratify : bool, default False
        When true, the split is stratified on the target so both parts keep
        the same class proportions. Off by default to preserve the original
        behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the two training items are the
        resampled versions unless ``sampling == 'none'``.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported names. Without this check
        the function would fall through and return ``None``.
    """
    supported = ('none', 'SMOTEENN', 'SMOTE', 'under', 'over')
    # Validate before doing any work so a typo fails fast with a clear message.
    if sampling not in supported:
        raise ValueError(
            f"Unknown sampling method {sampling!r}; expected one of {supported}"
        )

    # Separate features from the target variable.
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    if sampling == 'none':
        return X_train, X_test, y_train, y_test

    # Imported lazily so imbalanced-learn is only required when resampling.
    if sampling == 'SMOTEENN':
        from imblearn.combine import SMOTEENN as sampler_cls
    elif sampling == 'SMOTE':
        from imblearn.over_sampling import SMOTE as sampler_cls
    elif sampling == 'under':
        from imblearn.under_sampling import RandomUnderSampler as sampler_cls
    else:  # 'over'
        from imblearn.over_sampling import RandomOverSampler as sampler_cls

    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, X_test, y_resampled, y_test


In [5]:

def training(X_train, y_train):
    """Fit a LightGBM classifier with default hyperparameters and return it."""
    # LGBMClassifier.fit returns the fitted estimator itself.
    return lgb.LGBMClassifier().fit(X_train, y_train)

In [6]:
def best_model(modelName, accuracy, precision, recall, f1):
    """Store one model's metrics in the module-level ``best_models`` dict.

    Any existing entry under ``modelName`` is replaced.
    """
    metrics = dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)
    best_models[modelName] = metrics

In [7]:
def predict(modleName, LGBM, X_test, y_test):
    """Evaluate a fitted classifier on the test set, then record and print its metrics.

    Accuracy and the support-weighted precision, recall and F1 are stored via
    ``best_model`` under ``modleName``; the full per-class classification
    report is printed.

    Parameters
    ----------
    modleName : str
        Key under which the metrics are recorded.
    LGBM : estimator
        Fitted model exposing ``predict``.
    X_test, y_test
        Held-out features and true labels.

    Returns
    -------
    The predicted labels for ``X_test``. The function used to return ``None``
    implicitly even though callers bind its result (e.g. to ``y_pred``).
    """
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Dict form of the report, used only to pull out the weighted averages.
    weighted = classification_report(y_test, y_pred, output_dict=True)['weighted avg']
    best_model(
        modleName,
        accuracy,
        weighted['precision'],
        weighted['recall'],
        weighted['f1-score'],
    )

    # Text form of the report for display.
    print(classification_report(y_test, y_pred))
    return y_pred

In [8]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def optimize_with_grid(X_train, y_train, param_grid=None, cv=5, scoring=None):
    """Grid-search LightGBM hyperparameters and return the best refitted estimator.

    Parameters
    ----------
    X_train, y_train
        Training features and labels handed to ``GridSearchCV.fit``.
    param_grid : dict, optional
        Hyperparameter grid. Defaults to the notebook's original grid over
        ``num_leaves``, ``learning_rate`` and ``n_estimators``.
    cv : int, default 5
        Number of cross-validation folds.
    scoring : str or callable, optional
        Forwarded to ``GridSearchCV``; ``None`` keeps the estimator's default
        scorer, which is the original behaviour.

    Returns
    -------
    The ``best_estimator_`` found by the search. The best parameters and best
    cross-validation score are printed.

    NOTE(review): the notebook calls this on training data that has already
    been resampled, so the validation folds are drawn from resampled data -
    confirm the printed CV score is interpreted with that in mind.
    """
    if param_grid is None:
        param_grid = {
            'num_leaves': [31, 50, 70],
            'learning_rate': [0.01, 0.1, 0.5],
            'n_estimators': [50, 100, 200],
        }

    search = GridSearchCV(lgb.LGBMClassifier(), param_grid, cv=cv, scoring=scoring)
    search.fit(X_train, y_train)

    print(search.best_params_)
    print(search.best_score_)

    return search.best_estimator_

<h1> LGBM on original data with optimization </h1>

In [9]:
# Hold-out split with no resampling of the training data.
X_train, X_test, y_train, y_test = splitting_data(df, 'none')

# Standardize features: statistics are learned on the training split only
# and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Class balance of the training labels.
print(
    "Number of observations in each class in the training set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the training set:
diagnostic
1    1192
0     172
Name: count, dtype: int64


In [11]:
# Baseline: fit LightGBM with default settings on the unresampled, standardized
# training split, then evaluate on the test split (metrics recorded as 'original').
LGBM1 = training(X_train, y_train)
y_pred = predict('original',LGBM1, X_test, y_test)


[LightGBM] [Info] Number of positive: 1192, number of negative: 172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 1364, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.873900 -> initscore=1.935893
[LightGBM] [Info] Start training from score 1.935893
              precision    recall  f1-score   support

           0       0.84      0.79      0.82        39
           1       0.97      0.98      0.98       302

    accuracy                           0.96       341
   macro avg       0.91      0.89      0.90       341
weighted avg       0.96      0.96      0.96       341



In [12]:
# Grid-search LightGBM hyperparameters on the same training data and evaluate
# the best estimator on the test split (metrics recorded as 'original_grid').
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_LGBM1, X_test, y_test)

[LightGBM] [Info] Number of positive: 953, number of negative: 138
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.873511 -> initscore=1.932361
[LightGBM] [Info] Start training from score 1.932361
[LightGBM] [Info] Number of positive: 953, number of negative: 138
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000130 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.873511 -> initscore=1.932361
[LightGBM] [Info] Start training from score 1.932361
[LightGBM] [Info] Number of posi

<h1> LGBM using SMOTE sampling </h1>

In [13]:
# Fresh split with SMOTE applied to the training portion, then standardize
# using statistics learned from the (resampled) training data only.
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTE')
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Class balance of the training labels after SMOTE.
print(
    "Number of observations in each class in the training set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the training set:
diagnostic
1    1192
0    1192
Name: count, dtype: int64


In [15]:
# Default-parameter LightGBM trained on the SMOTE-resampled training split;
# test-set metrics are recorded as 'SMOTE'.
LGBM2 =training(X_train, y_train)
y_pred = predict('SMOTE',LGBM2, X_test, y_test)

[LightGBM] [Info] Number of positive: 1192, number of negative: 1192
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 2384, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.74      0.82      0.78        39
           1       0.98      0.96      0.97       302

    accuracy                           0.95       341
   macro avg       0.86      0.89      0.88       341
weighted avg       0.95      0.95      0.95       341



In [16]:
# Grid-search on the SMOTE-resampled training split and evaluate the best
# estimator on the test split (metrics recorded as 'SMOTE_grid').
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_LGBM2, X_test, y_test)

[LightGBM] [Info] Number of positive: 953, number of negative: 954
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000428 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 1907, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499738 -> initscore=-0.001049
[LightGBM] [Info] Start training from score -0.001049
[LightGBM] [Info] Number of positive: 953, number of negative: 954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000324 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 1907, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499738 -> initscore=-0.001049
[LightGBM] [Info] 

<h1> LGBM using SMOTEENN sampling </h1>

In [17]:
# Fresh split with SMOTEENN applied to the training portion, then standardize
# using statistics learned from the (resampled) training data only.
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTEENN')
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Class balance of the training labels after SMOTEENN.
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
1    1018
0     933
Name: count, dtype: int64


In [19]:
# Default-parameter LightGBM trained on the SMOTEENN-resampled training split;
# test-set metrics are recorded as 'SMOTEENN'.
LGBM3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',LGBM3, X_test, y_test)

[LightGBM] [Info] Number of positive: 1018, number of negative: 933
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1951, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521784 -> initscore=0.087190
[LightGBM] [Info] Start training from score 0.087190
              precision    recall  f1-score   support

           0       0.68      0.87      0.76        39
           1       0.98      0.95      0.96       302

    accuracy                           0.94       341
   macro avg       0.83      0.91      0.86       341
weighted avg       0.95      0.94      0.94       341



In [20]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [21]:
# from google.colab import drive
# drive.mount('/content/drive')

In [22]:
# Grid-search on the SMOTEENN-resampled training split and evaluate the best
# estimator on the test split (metrics recorded as 'SMOTEENN_grid').
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_LGBM3, X_test, y_test)

[LightGBM] [Info] Number of positive: 814, number of negative: 746
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000775 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1560, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.521795 -> initscore=0.087235
[LightGBM] [Info] Start training from score 0.087235
[LightGBM] [Info] Number of positive: 815, number of negative: 746
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000288 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 1561, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.522101 -> initscore=0.088463
[LightGBM] [Info] Sta

<h1> LGBM using Random undersampling </h1>

In [23]:
# Fresh split with random undersampling of the training portion, then
# standardize using statistics learned from the (resampled) training data only.
X_train, X_test, y_train, y_test = splitting_data(df, 'under')
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
# Class balance of the training labels after random undersampling.
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
0    172
1    172
Name: count, dtype: int64


In [25]:
# Default-parameter LightGBM trained on the randomly undersampled training
# split; test-set metrics are recorded as 'undersampling'.
LGBM4 =training(X_train, y_train)
y_pred = predict('undersampling',LGBM4, X_test, y_test)

[LightGBM] [Info] Number of positive: 172, number of negative: 172
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000071 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 79
[LightGBM] [Info] Number of data points in the train set: 344, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.54      0.95      0.69        39
           1       0.99      0.89      0.94       302

    accuracy                           0.90       341
   macro avg       0.76      0.92      0.81       341
weighted avg       0.94      0.90      0.91       341



In [26]:
# Grid-search on the randomly undersampled training split and evaluate the
# best estimator on the test split (metrics recorded as 'undersampling_grid').
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_LGBM4, X_test, y_test)

[LightGBM] [Info] Number of positive: 138, number of negative: 137
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 275, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501818 -> initscore=0.007273
[LightGBM] [Info] Start training from score 0.007273
[LightGBM] [Info] Number of positive: 138, number of negative: 137
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 275, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501818 -> initscore=0.007273
[LightGBM] [Info] Start

<h1> LGBM using Random oversampling </h1>

In [27]:
# Fresh split with random oversampling of the training portion, then
# standardize using statistics learned from the (resampled) training data only.
X_train, X_test, y_train, y_test = splitting_data(df, 'over')
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
# Class balance of the training labels after random oversampling.
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
1    1192
0    1192
Name: count, dtype: int64


In [29]:
# Default-parameter LightGBM trained on the randomly oversampled training
# split; test-set metrics are recorded as 'oversampling'.
LGBM5 =training(X_train, y_train)
y_pred = predict('oversampling',LGBM5, X_test, y_test)

[LightGBM] [Info] Number of positive: 1192, number of negative: 1192
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000727 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 2384, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.74      0.87      0.80        39
           1       0.98      0.96      0.97       302

    accuracy                           0.95       341
   macro avg       0.86      0.92      0.89       341
weighted avg       0.96      0.95      0.95       341



In [30]:
# Grid-search on the randomly oversampled training split and evaluate the
# best estimator on the test split (metrics recorded as 'oversampling_grid').
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_LGBM5, X_test, y_test)

[LightGBM] [Info] Number of positive: 953, number of negative: 954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 1907, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499738 -> initscore=-0.001049
[LightGBM] [Info] Start training from score -0.001049
[LightGBM] [Info] Number of positive: 953, number of negative: 954
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000298 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1907, number of used features: 8
[LightGBM] [Info] [binary:BoostFro

In [31]:
# Leaderboard of every recorded model, highest test accuracy first.
best_model_df = (
    pd.DataFrame.from_dict(best_models, orient='index')
    .sort_values(by='accuracy', ascending=False)
)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
original,0.958944,0.958148,0.958944,0.958472
original_grid,0.958944,0.958148,0.958944,0.958472
SMOTE_grid,0.956012,0.95653,0.956012,0.956254
oversampling_grid,0.953079,0.957083,0.953079,0.95452
oversampling,0.950147,0.955154,0.950147,0.951907
SMOTE,0.947214,0.949939,0.947214,0.948326
SMOTEENN,0.938416,0.948185,0.938416,0.941651
SMOTEENN_grid,0.938416,0.948185,0.938416,0.941651
undersampling,0.900293,0.940447,0.900293,0.911536
undersampling_grid,0.891496,0.928446,0.891496,0.902908
