In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [33]:
# Seed handed to the decision-tree estimators built in this notebook.
random_state = 42
# Registry of evaluation metrics, keyed by run label (filled in by best_model()).
best_models = dict()

In [34]:
# Load the dataset from CSV and discard the 'drink' column before modelling.
df = pd.read_csv('../Data/Final_skin_cancer.csv')
df = df.drop(columns=['drink'])

# New Section

In [35]:
from sklearn.model_selection import train_test_split

def splitting_data(df, sampling, test_size=0.3, random_state=42, stratify=False):
    """Split ``df`` into train/test sets and optionally rebalance the training set.

    The ``'diagnostic'`` column is the target; every other column is a feature.
    Resampling is fitted on the training portion only, so the returned test set
    is exactly what ``train_test_split`` produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'diagnostic'`` column.
    sampling : str
        Resampling strategy for the training set. One of ``'none'``,
        ``'SMOTEENN'``, ``'SMOTE'``, ``'under'`` (``RandomUnderSampler``) or
        ``'over'`` (``RandomOverSampler``).
    test_size : float, default 0.3
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed used for both the split and the sampler.
    stratify : bool, default False
        When true, the split is stratified on the target so both partitions
        keep the original class proportions. The default keeps the previous
        (unstratified) behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``; the training pair is the
        resampled one unless ``sampling == 'none'``.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported names.
    """
    # Validate first: previously an unknown name fell through every branch and
    # returned None, which only surfaced as a confusing unpacking error.
    supported = ('none', 'SMOTEENN', 'SMOTE', 'under', 'over')
    if sampling not in supported:
        raise ValueError(
            f"Unknown sampling method {sampling!r}; expected one of {supported}"
        )

    # Separate the features from the target variable.
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    if sampling == 'none':
        return X_train, X_test, y_train, y_test

    # Import only the sampler that is actually requested.
    if sampling == 'SMOTEENN':
        from imblearn.combine import SMOTEENN as sampler_cls
    elif sampling == 'SMOTE':
        from imblearn.over_sampling import SMOTE as sampler_cls
    elif sampling == 'under':
        from imblearn.under_sampling import RandomUnderSampler as sampler_cls
    else:  # 'over'
        from imblearn.over_sampling import RandomOverSampler as sampler_cls

    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, X_test, y_resampled, y_test

In [36]:
from sklearn.tree import DecisionTreeClassifier
def training(X_train, y_train):
    """Fit a decision tree with default hyperparameters and return it.

    The tree is seeded with the notebook-level ``random_state``.
    """
    return DecisionTreeClassifier(random_state=random_state).fit(X_train, y_train)

In [37]:
def best_model(modelName, accuracy, precision, recall, f1):
    """Store one run's metrics in the global ``best_models`` registry.

    An existing entry with the same ``modelName`` is overwritten.
    """
    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    metric_values = (accuracy, precision, recall, f1)
    best_models[modelName] = dict(zip(metric_names, metric_values))

In [38]:
def predict(modleName,LGBM, X_test ,y_test):
    """Evaluate a fitted classifier on the test split; record and print its metrics.

    Parameters
    ----------
    modleName : str
        Key under which the metrics are stored via ``best_model``. The
        misspelled parameter name is kept so existing callers keep working.
    LGBM : fitted estimator
        Any object exposing ``predict``. Despite the parameter name, the
        callers in this notebook pass a decision tree or a pipeline wrapping one.
    X_test : array-like
        Held-out features.
    y_test : array-like
        True labels for ``X_test``.

    Returns
    -------
    array-like
        The predicted labels for ``X_test``. Previously the function returned
        ``None`` even though callers bind its result to ``y_pred``.
    """
    # Predict labels for the held-out test data.
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Dict form of the report gives programmatic access to the weighted averages.
    report = classification_report(y_test, y_pred, output_dict=True)
    weighted = report['weighted avg']
    best_model(
        modleName,
        accuracy,
        weighted['precision'],
        weighted['recall'],
        weighted['f1-score'],
    )

    # Text form of the same report for display in the notebook.
    print(classification_report(y_test, y_pred))
    return y_pred

In [39]:




from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def optimize_with_grid(X_train, y_train):
    """Grid-search decision-tree hyperparameters with 5-fold cross-validation.

    The tree is wrapped in a single-step pipeline (step name ``'dt'``); no
    scaling step is included. The best parameter combination and its
    cross-validation score are printed, and the best estimator found by the
    search is returned.

    NOTE(review): the cross-validation folds are drawn from whatever training
    data is passed in. If that data was resampled beforehand, the printed
    score may be optimistic compared with the held-out test set -- confirm.
    """
    search_space = {
        'dt__max_depth': [None, 10, 20, 30, 40, 50],
        'dt__min_samples_split': [2, 5, 10],
        'dt__min_samples_leaf': [1, 2, 4],
        'dt__criterion': ['gini', 'entropy'],
    }
    tree_pipeline = Pipeline([
        ('dt', DecisionTreeClassifier(random_state=random_state)),
    ])

    # Exhaustive search over the grid, using all available cores.
    search = GridSearchCV(tree_pipeline, search_space, cv=5, verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)

    print(search.best_params_)
    print(search.best_score_)
    return search.best_estimator_


<h1> Decision Tree on original data with optimization </h1>

In [40]:
# Train/test split with no resampling of the training set ('none').
X_train, X_test, y_train, y_test = splitting_data(df, 'none')
# Feature scaling is currently disabled; the steps are kept here for reference:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [41]:
# Class balance of the current training target (no resampling applied in the split above).
print("Number of observations in each class in the training set:")
print(y_train.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1047
0     146
Name: count, dtype: int64


In [42]:
# Baseline: default decision tree on the current training split; metrics are
# recorded under 'original'.
# NOTE: despite the LGBM* variable name, training() fits a DecisionTreeClassifier.
LGBM1 = training(X_train, y_train)
y_pred = predict('original',LGBM1, X_test, y_test)


              precision    recall  f1-score   support

           0       0.90      0.69      0.78        65
           1       0.96      0.99      0.97       447

    accuracy                           0.95       512
   macro avg       0.93      0.84      0.88       512
weighted avg       0.95      0.95      0.95       512



In [43]:
# Grid-search the decision tree on the current training split and evaluate the
# best estimator on the test set; metrics are recorded under 'original_grid'.
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_LGBM1, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


{'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 10}
0.9379628001828347
              precision    recall  f1-score   support

           0       0.84      0.65      0.73        65
           1       0.95      0.98      0.97       447

    accuracy                           0.94       512
   macro avg       0.90      0.81      0.85       512
weighted avg       0.94      0.94      0.94       512



<h1> Decision Tree using SMOTE sampling </h1>

In [44]:
# Re-split the data and rebalance the training set with SMOTE; the test set is not resampled.
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTE')
# Feature scaling is currently disabled; the steps are kept here for reference:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [45]:
# Class balance of the current training target (SMOTE-resampled when run after the split above).
print("Number of observations in each class in the training set:")
print(y_train.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1047
0    1047
Name: count, dtype: int64


In [46]:
# Default decision tree on the current (SMOTE) training split; metrics are
# recorded under 'SMOTE'.
# NOTE: despite the LGBM* variable name, training() fits a DecisionTreeClassifier.
LGBM2 =training(X_train, y_train)
y_pred = predict('SMOTE',LGBM2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.82      0.77      0.79        65
           1       0.97      0.98      0.97       447

    accuracy                           0.95       512
   macro avg       0.89      0.87      0.88       512
weighted avg       0.95      0.95      0.95       512



In [47]:
# Grid-search the decision tree on the current (SMOTE) training split and
# evaluate the best estimator; metrics are recorded under 'SMOTE_grid'.
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_LGBM2, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'gini', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}
0.9541549142981125
              precision    recall  f1-score   support

           0       0.82      0.77      0.79        65
           1       0.97      0.98      0.97       447

    accuracy                           0.95       512
   macro avg       0.89      0.87      0.88       512
weighted avg       0.95      0.95      0.95       512



<h1> Decision Tree using SMOTEENN sampling </h1>

In [48]:
# Re-split the data and rebalance the training set with SMOTEENN; the test set is not resampled.
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTEENN')
# Feature scaling is currently disabled; the steps are kept here for reference:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [49]:
# Class balance of the current training target (SMOTEENN-resampled when run after the split above).
print("Number of observations in each class in the set:")
print(y_train.value_counts())

Number of observations in each class in the set:
diagnostic
1    895
0    880
Name: count, dtype: int64


In [50]:
# Default decision tree on the current (SMOTEENN) training split; metrics are
# recorded under 'SMOTEENN'.
# NOTE: despite the LGBM* variable name, training() fits a DecisionTreeClassifier.
LGBM3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',LGBM3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.68      0.77      0.72        65
           1       0.97      0.95      0.96       447

    accuracy                           0.92       512
   macro avg       0.82      0.86      0.84       512
weighted avg       0.93      0.92      0.93       512



In [51]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [52]:
# from google.colab import drive
# drive.mount('/content/drive')

In [53]:
# Grid-search the decision tree on the current (SMOTEENN) training split and
# evaluate the best estimator; metrics are recorded under 'SMOTEENN_grid'.
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_LGBM3, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}
0.9966197183098592
              precision    recall  f1-score   support

           0       0.70      0.77      0.74        65
           1       0.97      0.95      0.96       447

    accuracy                           0.93       512
   macro avg       0.84      0.86      0.85       512
weighted avg       0.93      0.93      0.93       512



<h1> DT on Random undersampling </h1>

In [54]:
# Re-split the data and rebalance the training set by random undersampling; the test set is not resampled.
X_train, X_test, y_train, y_test = splitting_data(df, 'under')
# Feature scaling is currently disabled; the steps are kept here for reference:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [55]:
# Class balance of the current training target (undersampled when run after the split above).
print("Number of observations in each class in the set:")
print(y_train.value_counts())

Number of observations in each class in the set:
diagnostic
0    146
1    146
Name: count, dtype: int64


In [56]:
# Default decision tree on the current (undersampled) training split; metrics
# are recorded under 'undersampling'.
# NOTE: despite the LGBM* variable name, training() fits a DecisionTreeClassifier.
LGBM4 =training(X_train, y_train)
y_pred = predict('undersampling',LGBM4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.49      0.85      0.62        65
           1       0.97      0.87      0.92       447

    accuracy                           0.87       512
   macro avg       0.73      0.86      0.77       512
weighted avg       0.91      0.87      0.88       512



In [57]:
# Grid-search the decision tree on the current (undersampled) training split and
# evaluate the best estimator; metrics are recorded under 'undersampling_grid'.
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_LGBM4, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_leaf': 4, 'dt__min_samples_split': 10}
0.8662770309760376
              precision    recall  f1-score   support

           0       0.55      0.75      0.64        65
           1       0.96      0.91      0.94       447

    accuracy                           0.89       512
   macro avg       0.76      0.83      0.79       512
weighted avg       0.91      0.89      0.90       512



<h1> DT on Random Oversampling </h1>

In [58]:
# Re-split the data and rebalance the training set by random oversampling; the test set is not resampled.
X_train, X_test, y_train, y_test = splitting_data(df, 'over')
# Feature scaling is currently disabled; the steps are kept here for reference:
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [59]:
# Class balance of the current training target (oversampled when run after the split above).
print("Number of observations in each class in the set:")
print(y_train.value_counts())

Number of observations in each class in the set:
diagnostic
1    1047
0    1047
Name: count, dtype: int64


In [60]:
# Default decision tree on the current (oversampled) training split; metrics
# are recorded under 'oversampling'.
# NOTE: despite the LGBM* variable name, training() fits a DecisionTreeClassifier.
LGBM5 =training(X_train, y_train)
y_pred = predict('oversampling',LGBM5, X_test, y_test)

              precision    recall  f1-score   support

           0       0.76      0.74      0.75        65
           1       0.96      0.97      0.96       447

    accuracy                           0.94       512
   macro avg       0.86      0.85      0.86       512
weighted avg       0.94      0.94      0.94       512



In [61]:
# Grid-search the decision tree on the current (oversampled) training split and
# evaluate the best estimator; metrics are recorded under 'oversampling_grid'.
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_LGBM5, X_test, y_test)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'dt__criterion': 'entropy', 'dt__max_depth': None, 'dt__min_samples_leaf': 1, 'dt__min_samples_split': 2}
0.9751652944467917
              precision    recall  f1-score   support

           0       0.75      0.72      0.73        65
           1       0.96      0.96      0.96       447

    accuracy                           0.93       512
   macro avg       0.85      0.84      0.85       512
weighted avg       0.93      0.93      0.93       512



In [62]:
# Tabulate every recorded run (one row per label), highest test accuracy first.
best_model_df = (
    pd.DataFrame.from_dict(best_models, orient='index')
    .sort_values(by='accuracy', ascending=False)
)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
original,0.951172,0.94951,0.951172,0.94839
SMOTE,0.949219,0.94807,0.949219,0.948526
SMOTE_grid,0.949219,0.94807,0.949219,0.948526
original_grid,0.939453,0.936224,0.939453,0.936004
oversampling,0.9375,0.936718,0.9375,0.937081
oversampling_grid,0.933594,0.932758,0.933594,0.933149
SMOTEENN_grid,0.929688,0.932755,0.929688,0.931001
SMOTEENN,0.923828,0.928927,0.923828,0.925907
undersampling_grid,0.890625,0.909919,0.890625,0.897639
undersampling,0.869141,0.913564,0.869141,0.882884
