In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.discriminant_analysis import StandardScaler
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
%pip install lightgbm

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Note: you may need to restart the kernel to use updated packages.


In [2]:
random_state=42
best_models = {}

In [3]:
# Read in the data
# Read in the data
df = pd.read_csv('../Data/Final_skin_cancer.csv')

df.drop('drink', axis=1, inplace=True)

# New Section

In [4]:
from imblearn.under_sampling import ClusterCentroids, TomekLinks
from imblearn.over_sampling import RandomOverSampler

def splitting_data(df, sampling):
    X = df.drop(['diagnostic'], axis=1)
    y = df['diagnostic']

    if sampling == 'none':
        return X, y
    elif sampling == 'SMOTEENN':
        sampler = SMOTEENN(random_state=random_state)
    elif sampling == 'SMOTE':
        sampler = SMOTE(random_state=random_state)
    elif sampling == 'under':
        sampler = RandomUnderSampler(random_state=random_state)
    elif sampling == 'over':
        sampler = RandomOverSampler(random_state=random_state)
    elif sampling == 'cluster_centroids':
        sampler = ClusterCentroids(random_state=random_state)
    elif sampling == 'tomek_links':
        sampler = TomekLinks()

    X_resampled, y_resampled = sampler.fit_resample(X, y)
    return X_resampled, y_resampled


In [5]:

def training(X_train, y_train):
    # Create a KNN classifier with 5 neighbors
    LGBM = SVC()
    # Fit the classifier to the data
    LGBM.fit(X_train, y_train)
    return LGBM

In [6]:
def best_model(modelName, accuracy, precision, recall, f1):
    best_models[modelName] = {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [7]:
def predict(modleName,LGBM, X_test ,y_test):
    # Predict the labels for the training data X
    y_pred = LGBM.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    cr=classification_report(y_test, y_pred, output_dict=True)
    precision = cr['weighted avg']['precision']
    recall = cr['weighted avg']['recall']
    f1 = cr['weighted avg']['f1-score']
    best_model(modleName,accuracy,precision,recall,f1)
    cr=classification_report(y_test, y_pred)
    print(cr)

In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def optimize_with_grid(X_train, y_train):
    # Define a pipeline that first scales the data and then applies the classifier
    # Scaling is more critical for SVM than for Decision Trees
    pipe = Pipeline([
        ('scaler', StandardScaler()),  # Uncommenting this as it's important for SVM
        ('svm', SVC(random_state=random_state))
    ])

    # Define the parameter grid to search
    param_grid = {
        'svm__C': [0.1, 1, 10, 100],  # Regularization parameter
        'svm__gamma': [1, 0.1, 0.01, 0.001],  # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
        'svm__kernel': ['rbf', 'poly', 'sigmoid']  # Type of kernel
    }

    # Create the GridSearchCV object
    svm_cv = GridSearchCV(pipe, param_grid, cv=5, verbose=1, n_jobs=-1)

    # Perform the grid search on the provided data
    svm_cv.fit(X_train, y_train)

    # Best parameters and best score
    best_params = svm_cv.best_params_
    best_score = svm_cv.best_score_
    best_estimator = svm_cv.best_estimator_
    print("Best Parameters:", best_params)
    print("Best Score:", best_score)

    return best_estimator

# Example usage (you need to define X_train, y_train, and random_state before calling this function)
# best_svm_model = optimize_with_grid_svm(X_train, y_train, random_state=42)

<h1> LGBM on original data with optimization </h1>

In [9]:
# using function with no sampling
X, y= splitting_data(df, 'none')
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [10]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0     211
Name: count, dtype: int64


In [11]:
LGBM1 = training(X_train, y_train)
y_pred = predict('original',LGBM1, X_test, y_test)


              precision    recall  f1-score   support

           0       0.95      0.57      0.71        37
           1       0.95      1.00      0.97       304

    accuracy                           0.95       341
   macro avg       0.95      0.78      0.84       341
weighted avg       0.95      0.95      0.94       341



In [12]:
best_LGBM1 = optimize_with_grid(X_train, y_train)
prediction = predict('original_grid',best_LGBM1, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Best Parameters: {'svm__C': 100, 'svm__gamma': 1, 'svm__kernel': 'poly'}
Best Score: 0.9215416936005172
              precision    recall  f1-score   support

           0       0.77      0.65      0.71        37
           1       0.96      0.98      0.97       304

    accuracy                           0.94       341
   macro avg       0.87      0.81      0.84       341
weighted avg       0.94      0.94      0.94       341



<h1> LGBM using SMOTE sampling </h1>

In [13]:
X,y = splitting_data(df, 'SMOTE')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
#check number of observations in each class in the set
print("Number of observations in each class in the training set:")
print(y.value_counts())

Number of observations in each class in the training set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [15]:
LGBM2 =training(X_train, y_train)
y_pred = predict('SMOTE',LGBM2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.90      0.95      0.93       284
           1       0.95      0.91      0.93       314

    accuracy                           0.93       598
   macro avg       0.93      0.93      0.93       598
weighted avg       0.93      0.93      0.93       598



In [16]:
best_LGBM2 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTE_grid',best_LGBM2, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Best Parameters: {'svm__C': 1, 'svm__gamma': 1, 'svm__kernel': 'poly'}
Best Score: 0.9330543933054394
              precision    recall  f1-score   support

           0       0.91      0.99      0.95       284
           1       0.99      0.91      0.95       314

    accuracy                           0.95       598
   macro avg       0.95      0.95      0.95       598
weighted avg       0.95      0.95      0.95       598



<h1> LGBM using SMOTEENN sampling </h1>

In [17]:
X,y = splitting_data(df, 'SMOTEENN')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1332
0    1244
Name: count, dtype: int64


In [19]:
LGBM3 =training(X_train, y_train)
y_pred = predict('SMOTEENN',LGBM3, X_test, y_test)

              precision    recall  f1-score   support

           0       0.96      0.98      0.97       249
           1       0.98      0.96      0.97       267

    accuracy                           0.97       516
   macro avg       0.97      0.97      0.97       516
weighted avg       0.97      0.97      0.97       516



In [20]:
# from joblib import dump
# dump(LGBM3,'/content/LGBM_SMOTEENN.joblib')

In [21]:
# from google.colab import drive
# drive.mount('/content/drive')

In [22]:
best_LGBM3 = optimize_with_grid(X_train, y_train)
prediction = predict('SMOTEENN_grid',best_LGBM3, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


Best Parameters: {'svm__C': 100, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Best Score: 0.9766990291262136
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       249
           1       0.99      0.97      0.98       267

    accuracy                           0.98       516
   macro avg       0.98      0.98      0.98       516
weighted avg       0.98      0.98      0.98       516



<h1> DT on Random undersampling </h1>

In [23]:
X,y = splitting_data(df, 'under')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [24]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [25]:
LGBM4 =training(X_train, y_train)
y_pred = predict('undersampling',LGBM4, X_test, y_test)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        35
           1       0.94      0.94      0.94        50

    accuracy                           0.93        85
   macro avg       0.93      0.93      0.93        85
weighted avg       0.93      0.93      0.93        85



In [26]:
best_LGBM4 = optimize_with_grid(X_train, y_train)
prediction = predict('undersampling_grid',best_LGBM4, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}
Best Score: 0.8992537313432836
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        35
           1       0.94      0.92      0.93        50

    accuracy                           0.92        85
   macro avg       0.91      0.92      0.92        85
weighted avg       0.92      0.92      0.92        85



<h1> DT on Random Oversampling </h1>

In [27]:
X,y = splitting_data(df, 'over')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [28]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1494
0    1494
Name: count, dtype: int64


In [29]:
LGBM5 =training(X_train, y_train)
y_pred = predict('oversampling',LGBM5, X_test, y_test)

              precision    recall  f1-score   support

           0       0.86      0.96      0.90       292
           1       0.95      0.85      0.90       306

    accuracy                           0.90       598
   macro avg       0.91      0.90      0.90       598
weighted avg       0.91      0.90      0.90       598



In [30]:
best_LGBM5 = optimize_with_grid(X_train, y_train)
prediction = predict('oversampling_grid',best_LGBM5, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 100, 'svm__gamma': 1, 'svm__kernel': 'rbf'}
Best Score: 0.9380753138075313
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       292
           1       0.97      0.87      0.92       306

    accuracy                           0.92       598
   macro avg       0.93      0.92      0.92       598
weighted avg       0.93      0.92      0.92       598



<h1> DT on Cluster Centroids </h1>

In [31]:
X,y = splitting_data(df, 'cluster_centroids')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [32]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
0    211
1    211
Name: count, dtype: int64


In [33]:
LGBM6 =training(X_train, y_train)
y_pred = predict('cluster_centroids',LGBM6, X_test, y_test)

              precision    recall  f1-score   support

           0       0.89      0.97      0.93        35
           1       0.98      0.92      0.95        50

    accuracy                           0.94        85
   macro avg       0.94      0.95      0.94        85
weighted avg       0.94      0.94      0.94        85



In [34]:
best_LGBM6 = optimize_with_grid(X_train, y_train)
prediction = predict('cluster_centroids_grid',best_LGBM6, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 1, 'svm__gamma': 0.1, 'svm__kernel': 'rbf'}
Best Score: 0.9466198419666373
              precision    recall  f1-score   support

           0       0.89      0.97      0.93        35
           1       0.98      0.92      0.95        50

    accuracy                           0.94        85
   macro avg       0.94      0.95      0.94        85
weighted avg       0.94      0.94      0.94        85



<h1> DT on Tomek Links </h1>

In [35]:
X,y = splitting_data(df, 'tomek_links')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
print("Number of observations in each class in the set:")
print(y.value_counts())

Number of observations in each class in the set:
diagnostic
1    1491
0     211
Name: count, dtype: int64


In [37]:
LGBM7 =training(X_train, y_train)
y_pred = predict('tomek_links',LGBM7, X_test, y_test)

              precision    recall  f1-score   support

           0       0.83      0.54      0.66        46
           1       0.93      0.98      0.96       295

    accuracy                           0.92       341
   macro avg       0.88      0.76      0.81       341
weighted avg       0.92      0.92      0.92       341



In [38]:
best_LGBM7 = optimize_with_grid(X_train, y_train)
prediction = predict('tomek_links_grid',best_LGBM7, X_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
Best Parameters: {'svm__C': 0.1, 'svm__gamma': 1, 'svm__kernel': 'poly'}
Best Score: 0.9324095022624433
              precision    recall  f1-score   support

           0       0.80      0.61      0.69        46
           1       0.94      0.98      0.96       295

    accuracy                           0.93       341
   macro avg       0.87      0.79      0.82       341
weighted avg       0.92      0.93      0.92       341



In [39]:
best_model_df = pd.DataFrame.from_dict(best_models, orient='index')
best_model_df.sort_values(by='accuracy', ascending=False, inplace=True)
best_model_df

Unnamed: 0,accuracy,precision,recall,f1
SMOTEENN_grid,0.98062,0.980746,0.98062,0.980624
SMOTEENN,0.968992,0.969271,0.968992,0.969001
original,0.950147,0.950353,0.950147,0.944409
SMOTE_grid,0.948161,0.951515,0.948161,0.948179
original_grid,0.941349,0.938114,0.941349,0.939048
cluster_centroids,0.941176,0.944141,0.941176,0.941476
cluster_centroids_grid,0.941176,0.944141,0.941176,0.941476
undersampling,0.929412,0.929412,0.929412,0.929412
SMOTE,0.928094,0.929352,0.928094,0.928139
tomek_links_grid,0.926686,0.922132,0.926686,0.922379
