In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%pip install lightgbm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
random_state=42
best_models = {}

In [3]:
# Read in the data
df = pd.read_csv('../Data/Final_skin_cancer.csv')
df.drop('drink', axis=1, inplace=True)

# New Section

In [4]:
from imblearn.under_sampling import ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler

def splitting_data(df, sampling):
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    if sampling == 'none':
        return X, y
    elif sampling == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    elif sampling == 'under':
        sampler = RandomUnderSampler(random_state=random_state)
    elif sampling == 'over':
        sampler = RandomOverSampler(random_state=random_state)
    elif sampling == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=random_state)
    elif sampling == 'tomek_links':
        sampler = TomekLinks()

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [5]:

def training(X_train, y_train):
    # Create a KNN classifier with 5 neighbors
    LGBM = lgb.LGBMClassifier()
    # Fit the classifier to the data
    LGBM.fit(X_train, y_train)
    return LGBM

In [6]:
def best_model(modelName, accuracy, precision, recall, f1):
    best_models[modelName] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [7]:
def predict(modleName,LGBM, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cr=classification_report(y_test, y_pred, output_dict=True)
    precision = cr['weighted avg']['precision']
    recall = cr['weighted avg']['recall']
    f1 = cr['weighted avg']['f1-score']
    best_model(modleName,accuracy,precision,recall,f1)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [8]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

def optimize_with_grid(X_train, y_train):

    # Initialize the LGBMClassifier
    LGBM = lgb.LGBMClassifier()

    # Define the parameter grid
    param_grid = {
        'num_leaves': [31, 50, 70],
        'learning_rate': [0.01, 0.1, 0.5],
        'n_estimators': [50, 100, 200]
    }

    # Initialize GridSearchCV
    LGBM_cv = GridSearchCV(LGBM, param_grid, cv=5)

    # Fit the grid search to the data
    LGBM_cv.fit(X_train, y_train)

    # Best parameters and best score
    best_params = LGBM_cv.best_params_
    best_score = LGBM_cv.best_score_
    best_estimator = LGBM_cv.best_estimator_
    print(best_params)
    print(best_score)

    return best_estimator

<h1> LGBM on original data with optimization </h1>

In [9]:
# using function with no sampling
X, y= splitting_data(df, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0     211
Name: count, dtype: int64


In [11]:
LGBM1 = training(X_train, y_train)
y_pred = predict('original',LGBM1, X_test, y_test)


[LightGBM] [Info] Number of positive: 1190, number of negative: 174
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1364, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.872434 -> initscore=1.922653
[LightGBM] [Info] Start training from score 1.922653


              precision    recall  f1-score   support

           0       0.87      0.73      0.79        37
           1       0.97      0.99      0.98       304

    accuracy                           0.96       341
   macro avg       0.92      0.86      0.89       341
weighted avg       0.96      0.96      0.96       341



In [12]:
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_LGBM1, X_test, y_test)

[LightGBM] [Info] Number of positive: 952, number of negative: 139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000074 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.872594 -> initscore=1.924091
[LightGBM] [Info] Start training from score 1.924091
[LightGBM] [Info] Number of positive: 952, number of negative: 139
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000135 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 93
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.872594 -> initscore=1.924091
[LightGBM] [Info] Sta

<h1> LGBM using SMOTE sampling </h1>

In [13]:
X,y = splitting_data(df, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [15]:
LGBM2 =training(X_train, y_train)
y_pred = predict('SMOTE',LGBM2, X_test, y_test)

[LightGBM] [Info] Number of positive: 1180, number of negative: 1210
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 2390, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493724 -> initscore=-0.025106
[LightGBM] [Info] Start training from score -0.025106
              precision    recall  f1-score   support

           0       0.96      0.99      0.97       284
           1       0.99      0.96      0.97       314

    accuracy                           0.97       598
   macro avg       0.97      0.97      0.97       598
weighted avg       0.97      0.97      0.97       598



In [16]:
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_LGBM2, X_test, y_test)

[LightGBM] [Info] Number of positive: 944, number of negative: 968
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 1912, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.493724 -> initscore=-0.025106
[LightGBM] [Info] Start training from score -0.025106
[LightGBM] [Info] Number of positive: 944, number of negative: 968
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000261 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 1912, number of used features: 8
[LightGBM] [Info] [binary:BoostFro

<h1> LGBM using SMOTEENN sampling </h1>

In [17]:
X,y = splitting_data(df, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1332
0    1244
Name: count, dtype: int64


In [19]:
LGBM3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',LGBM3, X_test, y_test)

[LightGBM] [Info] Number of positive: 1065, number of negative: 995
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000861 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 96
[LightGBM] [Info] Number of data points in the train set: 2060, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516990 -> initscore=0.067987
[LightGBM] [Info] Start training from score 0.067987
              precision    recall  f1-score   support

           0       1.00      0.99      0.99       249
           1       0.99      1.00      0.99       267

    accuracy                           0.99       516
   macro avg       0.99      0.99      0.99       516
weighted avg       0.99      0.99      0.99       516



In [20]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [21]:
# from google.colab import drive
# drive.mount('/content/drive')

In [22]:
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_LGBM3, X_test, y_test)

[LightGBM] [Info] Number of positive: 852, number of negative: 796
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000852 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 94
[LightGBM] [Info] Number of data points in the train set: 1648, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516990 -> initscore=0.067987
[LightGBM] [Info] Start training from score 0.067987
[LightGBM] [Info] Number of positive: 852, number of negative: 796
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 95
[LightGBM] [Info] Number of data points in the train set: 1648, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.516990 -> initscore=0.067987
[LightGBM] [Info] Sta

<h1> DT on Random undersampling </h1>

In [23]:
X,y = splitting_data(df, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [25]:
LGBM4 =training(X_train, y_train)
y_pred = predict('undersampling',LGBM4, X_test, y_test)

[LightGBM] [Info] Number of positive: 161, number of negative: 176
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000276 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78
[LightGBM] [Info] Number of data points in the train set: 337, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477745 -> initscore=-0.089080
[LightGBM] [Info] Start training from score -0.089080
              precision    recall  f1-score   support

           0       0.82      0.91      0.86        35
           1       0.93      0.86      0.90        50

    accuracy                           0.88        85
   macro avg       0.88      0.89      0.88        85
weighted avg       0.89      0.88      0.88        85



In [26]:
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_LGBM4, X_test, y_test)

[LightGBM] [Info] Number of positive: 129, number of negative: 140
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000365 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 67
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.479554 -> initscore=-0.081830
[LightGBM] [Info] Start training from score -0.081830
[LightGBM] [Info] Number of positive: 128, number of negative: 141
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000287 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.475836 -> initscore=-0.096730
[LightGBM] [Info] St

<h1> DT on Random Oversampling </h1>

In [27]:
X,y = splitting_data(df, 'over')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [29]:
LGBM5 =training(X_train, y_train)
y_pred = predict('oversampling',LGBM5, X_test, y_test)

[LightGBM] [Info] Number of positive: 1188, number of negative: 1202
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 2390, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497071 -> initscore=-0.011716
[LightGBM] [Info] Start training from score -0.011716
              precision    recall  f1-score   support

           0       0.93      0.99      0.96       292
           1       0.99      0.93      0.96       306

    accuracy                           0.96       598
   macro avg       0.96      0.96      0.96       598
weighted avg       0.96      0.96      0.96       598



In [30]:
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_LGBM5, X_test, y_test)

[LightGBM] [Info] Number of positive: 951, number of negative: 961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000391 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 97
[LightGBM] [Info] Number of data points in the train set: 1912, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497385 -> initscore=-0.010460
[LightGBM] [Info] Start training from score -0.010460
[LightGBM] [Info] Number of positive: 951, number of negative: 961
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 98
[LightGBM] [Info] Number of data points in the train set: 1912, number of used features: 8
[LightGBM] [Info] [binary:BoostFro

KeyboardInterrupt: 

<h1> DT on Cluster Centroids </h1>

In [None]:
X,y = splitting_data(df, 'cluster_centroids')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [None]:
LGBM6 =training(X_train, y_train)
y_pred = predict('cluster_centroids',LGBM6, X_test, y_test)

[LightGBM] [Info] Number of positive: 161, number of negative: 176
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000909 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 337, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.477745 -> initscore=-0.089080
[LightGBM] [Info] Start training from score -0.089080
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        35
           1       1.00      1.00      1.00        50

    accuracy                           1.00        85
   macro avg       1.00      1.00      1.00        85
weighted avg       1.00      1.00      1.00        85



In [None]:
best_LGBM6 = optimize_with_grid(X_train, y_train)
prediction = predict('cluster_centroids_grid',best_LGBM6, X_test, y_test)

[LightGBM] [Info] Number of positive: 129, number of negative: 140
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000813 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 631
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 42
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.479554 -> initscore=-0.081830
[LightGBM] [Info] Start training from score -0.081830
[LightGBM] [Info] Number of positive: 128, number of negative: 141
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 642
[LightGBM] [Info] Number of data points in the train set: 269, number of used features: 42
[LightGBM] [Info] [binary:BoostF

<h1> DT on Tomek Links </h1>

In [None]:
X,y = splitting_data(df, 'tomek_links')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1494
0     211
Name: count, dtype: int64


In [None]:
LGBM7 =training(X_train, y_train)
y_pred = predict('tomek_links',LGBM7, X_test, y_test)

[LightGBM] [Info] Number of positive: 1192, number of negative: 172
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000982 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 293
[LightGBM] [Info] Number of data points in the train set: 1364, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.873900 -> initscore=1.935893
[LightGBM] [Info] Start training from score 1.935893
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        39
           1       1.00      1.00      1.00       302

    accuracy                           1.00       341
   macro avg       1.00      1.00      1.00       341
weighted avg       1.00      1.00      1.00       341



In [None]:
best_LGBM7 = optimize_with_grid(X_train, y_train)
prediction = predict('tomek_links_grid',best_LGBM7, X_test, y_test)

[LightGBM] [Info] Number of positive: 953, number of negative: 138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001150 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.873511 -> initscore=1.932361
[LightGBM] [Info] Start training from score 1.932361
[LightGBM] [Info] Number of positive: 953, number of negative: 138
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000512 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 290
[LightGBM] [Info] Number of data points in the train set: 1091, number of used features: 43
[LightGBM] [Info] [binary:BoostF

In [None]:
best_model_df = pd.DataFrame.from_dict(best_models, orient='index')
best_model_df.sort_values(by='accuracy', ascending=False, inplace=True)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
original,1.0,1.0,1.0,1.0
original_grid,1.0,1.0,1.0,1.0
SMOTE,1.0,1.0,1.0,1.0
SMOTE_grid,1.0,1.0,1.0,1.0
SMOTEENN,1.0,1.0,1.0,1.0
SMOTEENN_grid,1.0,1.0,1.0,1.0
undersampling,1.0,1.0,1.0,1.0
undersampling_grid,1.0,1.0,1.0,1.0
oversampling,1.0,1.0,1.0,1.0
oversampling_grid,1.0,1.0,1.0,1.0
