In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import RFE
import itertools
import time
from sklearn.metrics import confusion_matrix, classification_report, f1_score
from sklearn.model_selection import cross_val_score
from tabulate import tabulate
import optuna
from joblib import dump
from joblib import load



#### Data Loading and Initial Exploration


In [14]:
# load Train data
train_data = pd.read_csv('../Data/cleaned_combined_skin_cancer.csv')

train_data = train_data.drop(['background_mother','background_father'], axis=1)

# hash encoding for region column
train_data['region'] = train_data['region'].apply(hash)

# save data
train_data.to_csv('../Data/FS_combined_skin_cancer.csv', index=False)

In [3]:
# we will use the following features to train our model
len(train_data.region.unique())


15

In [4]:
# General information and statistics about the train data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   smoke                1705 non-null   bool   
 1   drink                1705 non-null   bool   
 2   age                  1705 non-null   int64  
 3   pesticide            1705 non-null   bool   
 4   gender               1705 non-null   int64  
 5   skin_cancer_history  1705 non-null   bool   
 6   cancer_history       1705 non-null   bool   
 7   has_piped_water      1705 non-null   bool   
 8   has_sewage_system    1705 non-null   bool   
 9   fitspatrick          1705 non-null   float64
 10  region               1705 non-null   int64  
 11  diameter_1           1705 non-null   float64
 12  diameter_2           1705 non-null   float64
 13  diagnostic           1705 non-null   int64  
 14  itch                 1705 non-null   bool   
 15  grew                 1705 non-null   b

#### Feature Selection and Normalization

In [5]:
# Splitting data into features and target
X_train = train_data.drop(['diagnostic'], axis=1)
Y_train = train_data['diagnostic']

In [6]:
# What is happening in this code
# Random Forest Classifier is used to rank the importance of features
# RFE is used to select the most important features
# we will use 5 features in our model
# we will use the selected features to train our model
# we will use the selected features to test our model

# Feature selection using Random Forest Classifier
rfc = DecisionTreeClassifier()  # Using Decision Tree for feature selection
rfe = RFE(rfc, n_features_to_select=1)
rfe = rfe.fit(X_train, Y_train)

# Selecting important features
feature_map = [(i, v) for i, v in itertools.zip_longest(rfe.get_support(), X_train.columns)]
selected_features = [v for i, v in feature_map if i==True]
selected_features

['fitspatrick']

In [7]:
# Train data with selected features
X_train = X_train[selected_features]

In [8]:

# Splitting the dataset for training and testing
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train, train_size=0.70, random_state=42)

#### Decision Tree Model Training and Tuning

In [9]:

# Training a basic Decision Tree Classifier
# Time to train the model
clfd = DecisionTreeClassifier(criterion ="entropy", max_depth = 4)
start_time = time.time()
clfd.fit(x_train, y_train.values.ravel())
end_time = time.time()
print("Training time: ", end_time-start_time)

Training time:  0.0034551620483398438


In [10]:
# Time taken to test the model
start_time = time.time()
y_test_pred = clfd.predict(x_train)
end_time = time.time()
print("Testing time: ", end_time-start_time)

Testing time:  0.003838062286376953


In [11]:

# Hyperparameter Tuning for Decision Tree using Optuna
def objective(trial):
    dt_max_depth = trial.suggest_int('dt_max_depth', 2, 32, log=False)
    dt_max_features = trial.suggest_int('dt_max_features', 2, 5, log=False)
    classifier_obj = DecisionTreeClassifier(max_features=dt_max_features, max_depth=dt_max_depth)
    classifier_obj.fit(x_train, y_train)
    accuracy = classifier_obj.score(x_test, y_test)
    return accuracy


In [12]:
# start the optimization process
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective, n_trials=30)
print(study_dt.best_trial)

[I 2024-02-27 19:26:47,708] A new study created in memory with name: no-name-ab859009-7fb6-4fc9-bae8-12f9e6d29ded
[W 2024-02-27 19:26:47,715] Trial 0 failed with parameters: {'dt_max_depth': 7, 'dt_max_features': 5} because of the following error: ValueError('max_features must be in (0, n_features]').
Traceback (most recent call last):
  File "/Users/maryam/opt/anaconda3/lib/python3.9/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/1s/rn1cpwcs2d9bny0713mwf76w0000gn/T/ipykernel_21798/1459639200.py", line 6, in objective
    classifier_obj.fit(x_train, y_train)
  File "/Users/maryam/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 937, in fit
    super().fit(
  File "/Users/maryam/opt/anaconda3/lib/python3.9/site-packages/sklearn/tree/_classes.py", line 308, in fit
    raise ValueError("max_features must be in (0, n_features]")
ValueError: max_features must be in (0, n_features]
[W 2024-02-2

ValueError: max_features must be in (0, n_features]

In [None]:

# Training the Decision Tree model with the best parameters
dt = DecisionTreeClassifier(max_features=study_dt.best_trial.params['dt_max_features'], max_depth=study_dt.best_trial.params['dt_max_depth'])
dt.fit(x_train, y_train)

#### Model Evaluation

In [None]:
# Model Performance Metrics
dt_train, dt_test = dt.score(x_train, y_train), dt.score(x_test, y_test)
print(f"Train Score: {dt_train}")
print(f"Test Score: {dt_test}")


In [None]:

# Cross-Validation why we use cross-validation?
# Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
# The goal of cross-validation is to test the model’s ability to predict new data that was not used in estimating it
# we will use cross-validation to evaluate the performance of our model
# we will use 10 folds for cross-validation
# we will use the accuracy metric to evaluate the performance of our model

scores = cross_val_score(dt, x_train, y_train, cv=10, scoring='accuracy')
print(f"Cross-Validation Accuracy: {np.mean(scores)}")

In [None]:
# Confusion Matrix and Classification Report why we use these metrics?
# Confusion matrix is used to evaluate the performance of a classification model
# Classification report is used to measure the quality of predictions from a classification algorithm
y_pred = dt.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

# F1 Score
f1 = f1_score(y_test, y_pred)
print(f"F1 Score: {f1}")

#### Summary Table

In [None]:

# Creating a summary table of model performance
data = [["Decision Tree", dt_train, dt_test, np.mean(scores), f1]]
col_names = ["Model", "Train Score", "Test Score", "CV Accuracy", "F1 Score"]
print(tabulate(data, headers=col_names, tablefmt="fancy_grid"))

#### Saving the model

In [None]:
# confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


In [None]:
# # Save the model to a file
# model_filename = 'decision_tree_model.joblib'
# dump(dt, model_filename)
# print(f"Model saved to {model_filename}")

#### Load the model

In [None]:
# # Load the model from the file
# loaded_model = load(model_filename)
# print("Model loaded successfully")