In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%pip install lightgbm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Seed handed to the SVC built inside optimize_with_grid().
random_state = 42

# Experiment name -> {'accuracy', 'precision', 'recall', 'f1'}; filled in by best_model().
best_models = dict()

In [3]:
# Load the dataset and discard the 'drink' column before modelling.
df = pd.read_csv('../Data/Final_skin_cancer.csv').drop(columns='drink')

# New Section

In [4]:
from sklearn.model_selection import train_test_split

def splitting_data(df, sampling, test_size=0.2, random_state=123, stratify=False):
    """Split ``df`` into train/test sets and optionally resample the training set.

    The ``'diagnostic'`` column is used as the target and every other column as
    a feature. Resampling is applied to the training portion only; the test
    portion is always returned untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing a ``'diagnostic'`` column.
    sampling : str
        One of ``'none'``, ``'SMOTEENN'``, ``'SMOTE'``, ``'under'``
        (RandomUnderSampler) or ``'over'`` (RandomOverSampler).
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 123
        Seed used for both the split and the sampler.
    stratify : bool, default False
        When true, the split is stratified on the target. The default keeps
        the previous (unstratified) behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``, where the training pair has
        been resampled unless ``sampling == 'none'``.

    Raises
    ------
    ValueError
        If ``sampling`` is not one of the supported names. Previously the
        function silently returned ``None`` in that case.
    """
    supported = ('none', 'SMOTEENN', 'SMOTE', 'under', 'over')
    if sampling not in supported:
        raise ValueError(
            f"Unknown sampling method {sampling!r}; expected one of {supported}"
        )

    # Separate the features from the target variable
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    # Split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    if sampling == 'none':
        return X_train, X_test, y_train, y_test

    # The sampler imports stay local so that only the requested one is loaded.
    if sampling == 'SMOTEENN':
        from imblearn.combine import SMOTEENN
        sampler = SMOTEENN(random_state=random_state)
    elif sampling == 'SMOTE':
        from imblearn.over_sampling import SMOTE
        sampler = SMOTE(random_state=random_state)
    elif sampling == 'under':
        from imblearn.under_sampling import RandomUnderSampler
        sampler = RandomUnderSampler(random_state=random_state)
    else:  # sampling == 'over'
        from imblearn.over_sampling import RandomOverSampler
        sampler = RandomOverSampler(random_state=random_state)

    # Only the training data is resampled
    X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)
    return X_resampled, X_test, y_resampled, y_test

In [5]:

def training(X_train, y_train):
    """Fit an ``SVC`` with default hyperparameters and return the fitted model."""
    classifier = SVC()
    # fit() returns the estimator itself, so the fitted model is handed straight back
    return classifier.fit(X_train, y_train)

In [6]:
def best_model(modelName, accuracy, precision, recall, f1):
    """Store the four metrics for ``modelName`` in the module-level ``best_models`` dict.

    An existing entry with the same name is overwritten.
    """
    best_models[modelName] = dict(
        accuracy=accuracy,
        precision=precision,
        recall=recall,
        f1=f1,
    )

In [7]:
def predict(modleName, LGBM, X_test, y_test):
    """Evaluate a fitted classifier on the test set and record its metrics.

    Prints the text classification report and stores accuracy plus the
    weighted-average precision, recall and F1 under ``modleName`` via
    ``best_model``.

    Parameters
    ----------
    modleName : str
        Key under which the metrics are stored.
    LGBM : fitted estimator
        Any object exposing ``predict(X)`` (an SVC or SVC pipeline in this
        notebook, despite the parameter name).
    X_test, y_test
        Held-out features and true labels.

    Returns
    -------
    The predicted labels for ``X_test``. Previously the function returned
    ``None``, so callers that assigned its result never received the
    predictions.
    """
    # Predict the labels for the held-out test data
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Dict form of the report gives access to the weighted-average metrics
    weighted = classification_report(y_test, y_pred, output_dict=True)['weighted avg']
    best_model(
        modleName,
        accuracy,
        weighted['precision'],
        weighted['recall'],
        weighted['f1-score'],
    )

    # Text form of the report, for display
    print(classification_report(y_test, y_pred))
    return y_pred

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def optimize_with_grid(X_train, y_train):
    """Grid-search an SVC (preceded by standard scaling) and return the best pipeline.

    Runs a 5-fold ``GridSearchCV`` over ``C``, ``gamma`` and ``kernel``, prints
    the best parameter combination and its cross-validated score, and returns
    the refitted best ``Pipeline``. The SVC seed is read from the notebook-level
    ``random_state`` variable.
    """
    # Hyperparameter candidates; the 'svm__' prefix targets the pipeline's 'svm' step
    search_space = {
        'svm__C': [0.1, 1, 10, 100],                 # regularization strength
        'svm__gamma': [1, 0.1, 0.01, 0.001],         # kernel coefficient
        'svm__kernel': ['rbf', 'poly', 'sigmoid'],   # kernel type
    }

    # Scaling sits inside the pipeline so it is refitted on each CV training fold
    scaled_svm = Pipeline([
        ('scaler', StandardScaler()),
        ('svm', SVC(random_state=random_state)),
    ])

    search = GridSearchCV(scaled_svm, search_space, cv=5, verbose=1, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)
    print("Best Score:", search.best_score_)

    return search.best_estimator_

# Example usage (X_train, y_train and the notebook-level random_state must be defined before calling this function)
# best_svm_model = optimize_with_grid(X_train, y_train)

<h1> SVM on original data with optimization </h1>

In [9]:
# Baseline split: the training set is left un-resampled.
# splitting_data is called with its own default seed here.
X_train, X_test, y_train, y_test = splitting_data(df, 'none')

# Fit the scaler on the training features only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
# Class balance of the training labels
print(
    "Number of observations in each class in the training set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the training set:
diagnostic
1    1192
0     172
Name: count, dtype: int64


In [11]:
# Default SVC on the un-resampled training data; metrics are stored under 'original'
LGBM1 = training(X_train, y_train)
y_pred = predict('original', LGBM1, X_test=X_test, y_test=y_test)


              precision    recall  f1-score   support

           0       0.81      0.54      0.65        39
           1       0.94      0.98      0.96       302

    accuracy                           0.93       341
   macro avg       0.88      0.76      0.80       341
weighted avg       0.93      0.93      0.93       341



In [12]:
# Grid-searched SVC pipeline on the un-resampled data; metrics are stored under 'original_grid'.
# The inputs are already standardized, and the pipeline applies its own StandardScaler as well.
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid', best_LGBM1, X_test=X_test, y_test=y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Best Parameters: {'svm__C': 100, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}
Best Score: 0.9245017237664296
              precision    recall  f1-score   support

           0       0.79      0.59      0.68        39
           1       0.95      0.98      0.96       302

    accuracy                           0.94       341
   macro avg       0.87      0.78      0.82       341
weighted avg       0.93      0.94      0.93       341



<h1> SVM using SMOTE sampling </h1>

In [13]:
# SMOTE is applied to the training portion only; the test set stays untouched
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTE')

# Fit the scaler on the (resampled) training features, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Class balance of the training labels after SMOTE
print(
    "Number of observations in each class in the training set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the training set:
diagnostic
1    1192
0    1192
Name: count, dtype: int64


In [15]:
# Default SVC on the SMOTE-resampled training data; metrics are stored under 'SMOTE'
LGBM2 = training(X_train, y_train)
y_pred = predict('SMOTE', LGBM2, X_test=X_test, y_test=y_test)

              precision    recall  f1-score   support

           0       0.56      0.85      0.67        39
           1       0.98      0.91      0.95       302

    accuracy                           0.91       341
   macro avg       0.77      0.88      0.81       341
weighted avg       0.93      0.91      0.91       341



In [16]:
# Grid-searched SVC pipeline on the SMOTE-resampled data; metrics are stored under 'SMOTE_grid'.
# NOTE(review): cross-validation runs on data that was resampled before the folds were made,
# so the printed "Best Score" may be optimistic compared with the test-set report.
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid', best_LGBM2, X_test=X_test, y_test=y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 100, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Best Score: 0.9324753800891425
              precision    recall  f1-score   support

           0       0.66      0.79      0.72        39
           1       0.97      0.95      0.96       302

    accuracy                           0.93       341
   macro avg       0.82      0.87      0.84       341
weighted avg       0.94      0.93      0.93       341



<h1> SVM using SMOTEENN sampling </h1>

In [17]:
# SMOTEENN is applied to the training portion only; the test set stays untouched
X_train, X_test, y_train, y_test = splitting_data(df, 'SMOTEENN')

# Fit the scaler on the (resampled) training features, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
# Class balance of the training labels after SMOTEENN
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
1    1018
0     933
Name: count, dtype: int64


In [19]:
# Default SVC on the SMOTEENN-resampled training data; metrics are stored under 'SMOTEENN'
LGBM3 = training(X_train, y_train)
y_pred = predict('SMOTEENN', LGBM3, X_test=X_test, y_test=y_test)

              precision    recall  f1-score   support

           0       0.51      0.79      0.62        39
           1       0.97      0.90      0.93       302

    accuracy                           0.89       341
   macro avg       0.74      0.85      0.78       341
weighted avg       0.92      0.89      0.90       341



In [20]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [21]:
# from google.colab import drive
# drive.mount('/content/drive')

In [22]:
# Grid-searched SVC pipeline on the SMOTEENN-resampled data; metrics are stored under 'SMOTEENN_grid'.
# NOTE(review): cross-validation runs on data that was resampled before the folds were made,
# so the printed "Best Score" may be optimistic compared with the test-set report.
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid', best_LGBM3, X_test=X_test, y_test=y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 100, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Best Score: 0.9784838350055741
              precision    recall  f1-score   support

           0       0.59      0.82      0.69        39
           1       0.98      0.93      0.95       302

    accuracy                           0.91       341
   macro avg       0.78      0.87      0.82       341
weighted avg       0.93      0.91      0.92       341



<h1> SVM on Random undersampling </h1>

In [23]:
# Random undersampling is applied to the training portion only; the test set stays untouched
X_train, X_test, y_train, y_test = splitting_data(df, 'under')

# Fit the scaler on the (resampled) training features, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
# Class balance of the training labels after random undersampling
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
0    172
1    172
Name: count, dtype: int64


In [25]:
# Default SVC on the undersampled training data; metrics are stored under 'undersampling'
LGBM4 = training(X_train, y_train)
y_pred = predict('undersampling', LGBM4, X_test=X_test, y_test=y_test)

              precision    recall  f1-score   support

           0       0.49      0.95      0.65        39
           1       0.99      0.87      0.93       302

    accuracy                           0.88       341
   macro avg       0.74      0.91      0.79       341
weighted avg       0.94      0.88      0.90       341



In [26]:
# Grid-searched SVC pipeline on the undersampled data; metrics are stored under 'undersampling_grid'
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid', best_LGBM4, X_test=X_test, y_test=y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 10, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}
Best Score: 0.8837169650468883
              precision    recall  f1-score   support

           0       0.47      0.90      0.62        39
           1       0.99      0.87      0.92       302

    accuracy                           0.87       341
   macro avg       0.73      0.88      0.77       341
weighted avg       0.93      0.87      0.89       341



<h1> SVM on Random Oversampling </h1>

In [27]:
# Random oversampling is applied to the training portion only; the test set stays untouched
X_train, X_test, y_train, y_test = splitting_data(df, 'over')

# Fit the scaler on the (resampled) training features, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
# Class balance of the training labels after random oversampling
print(
    "Number of observations in each class in the set:",
    y_train.value_counts(),
    sep="\n",
)

Number of observations in each class in the set:
diagnostic
1    1192
0    1192
Name: count, dtype: int64


In [29]:
# Default SVC on the oversampled training data; metrics are stored under 'oversampling'
LGBM5 = training(X_train, y_train)
y_pred = predict('oversampling', LGBM5, X_test=X_test, y_test=y_test)

              precision    recall  f1-score   support

           0       0.54      0.92      0.68        39
           1       0.99      0.90      0.94       302

    accuracy                           0.90       341
   macro avg       0.76      0.91      0.81       341
weighted avg       0.94      0.90      0.91       341



In [30]:
# Grid-searched SVC pipeline on the oversampled data; metrics are stored under 'oversampling_grid'.
# NOTE(review): cross-validation runs on data that was resampled before the folds were made,
# so the printed "Best Score" may be optimistic compared with the test-set report.
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid', best_LGBM5, X_test=X_test, y_test=y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 10, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Best Score: 0.9286947483395874
              precision    recall  f1-score   support

           0       0.55      0.85      0.67        39
           1       0.98      0.91      0.94       302

    accuracy                           0.90       341
   macro avg       0.76      0.88      0.81       341
weighted avg       0.93      0.90      0.91       341



In [31]:
# One row per experiment, ranked from highest to lowest test accuracy
best_model_df = (
    pd.DataFrame.from_dict(best_models, orient='index')
    .sort_values(by='accuracy', ascending=False)
)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
original_grid,0.935484,0.93092,0.935484,0.931265
original,0.932551,0.927398,0.932551,0.926517
SMOTE_grid,0.929619,0.936967,0.929619,0.93242
SMOTEENN_grid,0.914956,0.931804,0.914956,0.920731
SMOTE,0.906158,0.930757,0.906158,0.914127
oversampling_grid,0.903226,0.929623,0.903226,0.911747
oversampling,0.900293,0.937386,0.900293,0.911039
SMOTEENN,0.888563,0.918449,0.888563,0.898715
undersampling,0.882698,0.935394,0.882698,0.897502
undersampling_grid,0.8739,0.926456,0.8739,0.889551
