In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score


In [17]:
random_state=42
best_models = {}

In [18]:
# Read in the data
df = pd.read_csv('../Data/Final_skin_cancer.csv')

df

Unnamed: 0,smoke,drink,age,pesticide,gender,skin_cancer_history,cancer_history,has_piped_water,has_sewage_system,diagnostic
0,False,False,55,False,0,True,True,True,True,1
1,False,True,79,False,1,True,False,False,False,1
2,False,True,52,False,0,False,True,True,True,1
3,False,False,74,True,0,False,False,False,False,1
4,False,True,58,True,0,True,True,True,True,1
...,...,...,...,...,...,...,...,...,...,...
1700,False,False,23,True,0,False,True,True,True,0
1701,False,False,27,False,0,False,False,True,True,0
1702,True,True,23,False,1,False,False,True,True,0
1703,False,False,23,True,0,False,False,True,False,0


In [19]:
df.isnull().sum()

smoke                  0
drink                  0
age                    0
pesticide              0
gender                 0
skin_cancer_history    0
cancer_history         0
has_piped_water        0
has_sewage_system      0
diagnostic             0
dtype: int64

# New Section

In [20]:
from imblearn.under_sampling import ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler

def splitting_data(df, sampling):
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    if sampling == 'none':
        return X, y
    elif sampling == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    elif sampling == 'under':
        sampler = RandomUnderSampler(random_state=random_state)
    elif sampling == 'over':
        sampler = RandomOverSampler(random_state=random_state)
    elif sampling == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=random_state)
    elif sampling == 'tomek_links':
        sampler = TomekLinks()

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [21]:

def training(X_train, y_train):
    # Create a KNN classifier with 5 neighbors
    LGBM = AdaBoostClassifier()
    # Fit the classifier to the data
    LGBM.fit(X_train, y_train)
    return LGBM

In [22]:
def best_model(modelName, accuracy, precision, recall, f1):
    best_models[modelName] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [23]:
def predict(modleName,LGBM, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cr=classification_report(y_test, y_pred, output_dict=True)
    precision = cr['weighted avg']['precision']
    recall = cr['weighted avg']['recall']
    f1 = cr['weighted avg']['f1-score']
    best_model(modleName,accuracy,precision,recall,f1)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [24]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def optimize_with_grid(X_train, y_train, random_state=None):
    # Define a pipeline that first scales the data and then applies the classifier
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('adaboost', AdaBoostClassifier(DecisionTreeClassifier(random_state=random_state), random_state=random_state))
    ])

    # Define the parameter grid to search
    param_grid = {
        'adaboost__n_estimators': [50, 100, 200],  # Number of base estimators to use
        'adaboost__learning_rate': [0.01, 0.1, 1],  # Learning rate shrinks the contribution of each classifier
    }

    # Create the GridSearchCV object
    adaboost_cv = GridSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1)

    # Perform the grid search on the provided data
    adaboost_cv.fit(X_train, y_train)

    # Best parameters and best score
    best_params = adaboost_cv.best_params_
    best_score = adaboost_cv.best_score_
    best_estimator = adaboost_cv.best_estimator_
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)

    return best_estimator

# Example usage (ensure you have defined X_train, y_train, and random_state before calling this function)
# best_adaboost_model = optimize_with_grid(X_train, y_train, random_state=42)


<h1> LGBM on original data with optimization </h1>

In [25]:
# using function with no sampling
X, y= splitting_data(df, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [26]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0     211
Name: count, dtype: int64


In [27]:
LGBM1 = training(X_train, y_train)
y_pred = predict('original',LGBM1, X_test, y_test)


              precision    recall  f1-score   support

           0       0.86      0.68      0.76        37
           1       0.96      0.99      0.97       304

    accuracy                           0.95       341
   macro avg       0.91      0.83      0.87       341
weighted avg       0.95      0.95      0.95       341





In [28]:
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_LGBM1, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Score: 0.934014759750054
              precision    recall  f1-score   support

           0       0.87      0.73      0.79        37
           1       0.97      0.99      0.98       304

    accuracy                           0.96       341
   macro avg       0.92      0.86      0.89       341
weighted avg       0.96      0.96      0.96       341





<h1> LGBM using SMOTE sampling </h1>

In [29]:
X,y = splitting_data(df, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [30]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [31]:
LGBM2 =training(X_train, y_train)
y_pred = predict('SMOTE',LGBM2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.91      0.96      0.93       284
           1       0.96      0.91      0.94       314

    accuracy                           0.94       598
   macro avg       0.94      0.94      0.94       598
weighted avg       0.94      0.94      0.94       598





In [32]:
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_LGBM2, X_test, y_test)



Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Score: 0.9669456066945606
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       284
           1       0.98      0.95      0.96       314

    accuracy                           0.96       598
   macro avg       0.96      0.96      0.96       598
weighted avg       0.96      0.96      0.96       598





<h1> LGBM using SMOTEENN sampling </h1>

In [33]:
X,y = splitting_data(df, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [34]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1332
0    1256
Name: count, dtype: int64


In [35]:
LGBM3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',LGBM3, X_test, y_test)



              precision    recall  f1-score   support

           0       0.94      0.98      0.96       241
           1       0.98      0.95      0.96       277

    accuracy                           0.96       518
   macro avg       0.96      0.96      0.96       518
weighted avg       0.96      0.96      0.96       518



In [36]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [37]:
# from google.colab import drive
# drive.mount('/content/drive')

In [38]:
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_LGBM3, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best Parameters: {'adaboost__learning_rate': 0.1, 'adaboost__n_estimators': 50}
Best Score: 0.9898550724637681
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       241
           1       1.00      0.99      0.99       277

    accuracy                           0.99       518
   macro avg       0.99      0.99      0.99       518
weighted avg       0.99      0.99      0.99       518





<h1> DT on Random undersampling </h1>

In [39]:
X,y = splitting_data(df, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [40]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [41]:
LGBM4 =training(X_train, y_train)
y_pred = predict('undersampling',LGBM4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.84      0.91      0.88        35
           1       0.94      0.88      0.91        50

    accuracy                           0.89        85
   macro avg       0.89      0.90      0.89        85
weighted avg       0.90      0.89      0.89        85





In [42]:
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_LGBM4, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits






Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 200}
Best Score: 0.8755048287971905
              precision    recall  f1-score   support

           0       0.82      0.89      0.85        35
           1       0.91      0.86      0.89        50

    accuracy                           0.87        85
   macro avg       0.87      0.87      0.87        85
weighted avg       0.87      0.87      0.87        85





<h1> DT on Random Oversampling </h1>

In [43]:
X,y = splitting_data(df, 'over')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [44]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [45]:
LGBM5 =training(X_train, y_train)
y_pred = predict('oversampling',LGBM5, X_test, y_test)



              precision    recall  f1-score   support

           0       0.86      0.92      0.89       292
           1       0.92      0.86      0.89       306

    accuracy                           0.89       598
   macro avg       0.89      0.89      0.89       598
weighted avg       0.89      0.89      0.89       598



In [46]:
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_LGBM5, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters: {'adaboost__learning_rate': 0.01, 'adaboost__n_estimators': 50}
Best Score: 0.9736401673640168
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       292
           1       1.00      0.94      0.97       306

    accuracy                           0.97       598
   macro avg       0.97      0.97      0.97       598
weighted avg       0.97      0.97      0.97       598





<h1> DT on Cluster Centroids </h1>

In [47]:
X,y = splitting_data(df, 'cluster_centroids')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [48]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [49]:
LGBM6 =training(X_train, y_train)
y_pred = predict('cluster_centroids',LGBM6, X_test, y_test)

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        35
           1       1.00      0.98      0.99        50

    accuracy                           0.99        85
   macro avg       0.99      0.99      0.99        85
weighted avg       0.99      0.99      0.99        85





In [50]:
best_LGBM6 = optimize_with_grid(X_train, y_train)
prediction = predict('cluster_centroids_grid',best_LGBM6, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters: {'adaboost__learning_rate': 1, 'adaboost__n_estimators': 200}
Best Score: 0.9168568920105356
              precision    recall  f1-score   support

           0       0.85      0.97      0.91        35
           1       0.98      0.88      0.93        50

    accuracy                           0.92        85
   macro avg       0.91      0.93      0.92        85
weighted avg       0.93      0.92      0.92        85



<h1> DT on Tomek Links </h1>

In [51]:
X,y = splitting_data(df, 'tomek_links')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [52]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1492
0     211
Name: count, dtype: int64


In [53]:
LGBM7 =training(X_train, y_train)
y_pred = predict('tomek_links',LGBM7, X_test, y_test)



              precision    recall  f1-score   support

           0       0.84      0.77      0.80        48
           1       0.96      0.98      0.97       293

    accuracy                           0.95       341
   macro avg       0.90      0.87      0.89       341
weighted avg       0.95      0.95      0.95       341



In [54]:
best_LGBM7 = optimize_with_grid(X_train, y_train)
prediction = predict('tomek_links_grid',best_LGBM7, X_test, y_test)

Fitting 5 folds for each of 9 candidates, totalling 45 fits




Best Parameters: {'adaboost__learning_rate': 1, 'adaboost__n_estimators': 100}
Best Score: 0.9427251669898729
              precision    recall  f1-score   support

           0       0.84      0.75      0.79        48
           1       0.96      0.98      0.97       293

    accuracy                           0.94       341
   macro avg       0.90      0.86      0.88       341
weighted avg       0.94      0.94      0.94       341





In [55]:
best_model_df = pd.DataFrame.from_dict(best_models, orient='index')
best_model_df.sort_values(by='accuracy', ascending=False, inplace=True)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
SMOTEENN_grid,0.994208,0.99428,0.994208,0.994211
cluster_centroids,0.988235,0.988562,0.988235,0.988259
oversampling_grid,0.968227,0.970169,0.968227,0.968219
SMOTE_grid,0.963211,0.964031,0.963211,0.963233
SMOTEENN,0.959459,0.960113,0.959459,0.959496
original_grid,0.958944,0.957242,0.958944,0.957334
original,0.953079,0.950746,0.953079,0.95054
tomek_links,0.947214,0.945782,0.947214,0.946245
tomek_links_grid,0.944282,0.942485,0.944282,0.942986
SMOTE,0.936455,0.937875,0.936455,0.936495
