## Step 0. configurations

In [11]:
import warnings
warnings.filterwarnings("ignore")

## Step 1. reading data and importing libraries

In [12]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")

In [13]:
df = pd.read_csv("./data.csv")
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave_points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave_points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## Step 2. understanding the data

In [None]:
df.shape

In [None]:
df.columns

In [None]:
df.index

In [None]:
df.info()

In [None]:
df.describe()

## Step 3. Visualizing

In [None]:
numerical_columns = df.select_dtypes(include="number").columns \
    .drop("id")

numerical_columns

In [None]:
categorical_columns = df.select_dtypes(include="object").columns

categorical_columns

In [None]:
boolean_columns = df.select_dtypes(include="bool").columns
boolean_columns

### **Distributions**

Numerical columns

In [None]:
plt.figure(figsize=(15, 60))

for i, column in enumerate(numerical_columns):
    plt.subplot(len(df.columns)//2 + len(df.columns)%2, 2, i+1)
    sns.histplot(df[column], kde=True)
    plt.title(f"Distribution of {column}", fontweight="bold")

plt.tight_layout()
plt.show()

Categorical columns

In [None]:
plt.figure(figsize=(15, 60))

for i, column in enumerate(categorical_columns):
    plt.subplot(len(df.columns)//2 + len(df.columns)%2, 2, i+1)
    sns.histplot(data=df, x=df[column])
    plt.title(f"Distribution of {column}", fontweight="bold")

plt.tight_layout()
plt.show()

Boolean columns

In [None]:
plt.figure(figsize=(15, 60))

for i, column in enumerate(boolean_columns):
    plt.subplot(len(df.columns)//2 + len(df.columns)%2, 2, i+1)
    sns.countplot(data=df, x=df[column])
    plt.title(f"Distribution of {column}", fontweight="bold")

plt.tight_layout()
plt.show()

### **Correlations**

In [None]:
correlation_matrix = df[numerical_columns].corr()

plt.figure(figsize=(20, 20))
sns.heatmap(correlation_matrix, 
    annot=True, 
    square=True, 
    cmap="coolwarm", 
    fmt=".2f",
    vmin=-1, vmax=1
)
plt.title("Correlation Matrix", fontweight="bold", fontsize=20, y=1.05)

plt.tight_layout()
plt.show()

## Step 4: Modelling

The goal for this dataset is to create a classification model that can predict the column `diagnosis`

### Import modelling libraries

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, classification_report


# Utility functions
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder


# Classification models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


### train_test_split

In [None]:
# # Use LabelEncoder to convert 'B' to 0 and 'M' to 1
# label_encoder = LabelEncoder()
# df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

In [None]:
X = df.drop(columns=["id", "diagnosis"])    # ID won't help in making a model
y = df["diagnosis"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

### **KNeighborsClassifier()**

#### 1. Get the best parameters with **RandomizedSearchCV**

In [40]:
knn = KNeighborsClassifier()

param_dist = {
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': stats.randint(1, 11),
    'metric': ['minkowski', 'chebyshev', 'manhattan', 'euclidean', 'mahalanobis'],
    'n_neighbors': stats.randint(3, 10),
    'p': [1, 2],  # 1 for Manhattan distance, 2 for Euclidean distance
    'weights': ['uniform', 'distance'],
}

random_search_knn = RandomizedSearchCV(knn, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
random_search_knn.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [41]:
best_params_knn = random_search_knn.best_params_
best_params_knn

{'algorithm': 'brute',
 'leaf_size': 5,
 'metric': 'minkowski',
 'n_neighbors': 6,
 'p': 1,
 'weights': 'distance'}

#### 2. Refined **GridSearchCV**

In [42]:
refined_param_grid = {
    'algorithm': [best_params_knn['algorithm']],
    'leaf_size': [best_params_knn['leaf_size'] - 1, best_params_knn['leaf_size'], best_params_knn['leaf_size'] + 1],
    'metric': [best_params_knn['metric']],
    'n_neighbors': [best_params_knn['n_neighbors'] - 1, best_params_knn['n_neighbors'], best_params_knn['n_neighbors'] + 1],
    'p': [best_params_knn['p']],
    'weights': [best_params_knn['weights']]
}

grid_search_knn_refined = GridSearchCV(knn, refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
grid_search_knn_refined.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


#### 3. Get the best model and make predictions on the test set

In [43]:
best_knn = grid_search_knn_refined.best_estimator_
y_pred_knn = best_knn.predict(X_test)

#### 4. Evaluation

In [None]:
accuracy_knn = accuracy_score(y_test, y_pred_knn)
precision_knn = precision_score(y_test, y_pred_knn, pos_label="M")
recall_knn = recall_score(y_test, y_pred_knn, pos_label="M")
f1_knn = f1_score(y_test, y_pred_knn, pos_label="M")

# pd.DataFrame({
#     "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
#     "Score": [accuracy_knn, precision_knn, recall_knn, f1_knn],
# })

In [44]:
print(f"Classification Report: \n {classification_report(y_test, y_pred_knn)}")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred_knn)}")

Classification Report: 
               precision    recall  f1-score   support

           B       0.94      0.96      0.95        69
           M       0.93      0.91      0.92        45

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.94       114
weighted avg       0.94      0.94      0.94       114

Confusion Matrix: 
 [[66  3]
 [ 4 41]]


### **RandomForestClassifier()**

#### 1. Get the best parameters with **RandomizedSearchCV**

In [35]:
rf = RandomForestClassifier()

param_dist = {
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None] + list(stats.randint(10, 30).rvs(2)),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'min_samples_leaf': stats.randint(1, 5),
    'min_samples_split': stats.randint(2, 10),
    'n_estimators': stats.randint(50, 200),
}

random_search_rf = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=100, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
random_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [36]:
best_params_rf = random_search_rf.best_params_
best_params_rf

{'bootstrap': True,
 'criterion': 'entropy',
 'max_depth': 22,
 'max_features': 'log2',
 'min_samples_leaf': 4,
 'min_samples_split': 7,
 'n_estimators': 122}

#### 2. Refined **GridSearchCV**

In [37]:
refined_param_grid = {
    'bootstrap': [best_params_rf['bootstrap']],
    'criterion': [best_params_rf['criterion']],
    'max_depth': [best_params_rf['max_depth'] - 2, best_params_rf['max_depth'], best_params_rf['max_depth'] + 2] if best_params_rf['max_depth'] is not None else [None],
    'max_features': [best_params_rf['max_features']],
    'min_samples_leaf': [best_params_rf['min_samples_leaf'] - 1, best_params_rf['min_samples_leaf'], best_params_rf['min_samples_leaf'] + 1],
    'min_samples_split': [best_params_rf['min_samples_split'] - 1, best_params_rf['min_samples_split'], best_params_rf['min_samples_split'] + 1],
    'n_estimators': [best_params_rf['n_estimators'] - 20, best_params_rf['n_estimators'], best_params_rf['n_estimators'] + 20],
}

grid_search_rf_refined = GridSearchCV(rf, refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
grid_search_rf_refined.fit(X_train, y_train)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


#### 3. Get the best model and make predictions on the test set

In [38]:
best_rf = grid_search_rf_refined.best_estimator_
y_pred_rf = best_rf.predict(X_test)

#### 4. Evaluation

In [None]:
accuracy_rf = accuracy_score(y_test, y_pred_rf)
precision_rf = precision_score(y_test, y_pred_rf, pos_label="M")
recall_rf = recall_score(y_test, y_pred_rf, pos_label="M")
f1_rf = f1_score(y_test, y_pred_rf, pos_label="M")

# pd.DataFrame({
#     "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
#     "Score": [accuracy_rf, precision_rf, recall_rf, f1_rf],
# })

In [39]:
print(f"Classification Report: \n {classification_report(y_test, y_pred_rf)}")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred_rf)}")

Classification Report: 
               precision    recall  f1-score   support

           B       0.89      0.97      0.93        69
           M       0.95      0.82      0.88        45

    accuracy                           0.91       114
   macro avg       0.92      0.90      0.91       114
weighted avg       0.92      0.91      0.91       114

Confusion Matrix: 
 [[67  2]
 [ 8 37]]


### **DecisionTreeClassifier()**

#### 1. Get the best parameters with **RandomizedSearchCV**

In [29]:
dt = DecisionTreeClassifier()

param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None] + list(stats.randint(1, 50).rvs(2)),
    'max_features': ['auto', 'sqrt', 'log2', None],
    'min_samples_leaf': stats.randint(1, 5),
    'min_samples_split': stats.randint(2, 10),
}

random_search_dt = RandomizedSearchCV(dt, param_distributions=param_dist, n_iter=1000, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
random_search_dt.fit(X_train, y_train)

Fitting 5 folds for each of 1000 candidates, totalling 5000 fits


In [30]:
best_params_dt = random_search_dt    .best_params_
best_params_dt

{'criterion': 'entropy',
 'max_depth': 21,
 'max_features': 'sqrt',
 'min_samples_leaf': 4,
 'min_samples_split': 9}

#### 2. Refined **GridSearchCV**

In [32]:
refined_param_grid = {
    'criterion': [best_params_dt['criterion']],
    'max_depth': [best_params_dt['max_depth'] - 2, best_params_dt['max_depth'], best_params_dt['max_depth'] + 2] if best_params_dt['max_depth'] is not None else [None],
    'max_features': [best_params_dt['max_features']],
    'min_samples_leaf': [best_params_dt['min_samples_leaf'] - 1, best_params_dt['min_samples_leaf'], best_params_dt['min_samples_leaf'] + 1],
    'min_samples_split': [best_params_dt['min_samples_split'] - 1, best_params_dt['min_samples_split'], best_params_dt['min_samples_split'] + 1],
}

grid_search_dt_refined = GridSearchCV(dt, refined_param_grid, cv=5, verbose=1, scoring='accuracy', n_jobs=-1)
grid_search_dt_refined.fit(X_train, y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


#### 3. Get the best model and make predictions on the test set

In [33]:
best_dt = grid_search_dt_refined.best_estimator_
y_pred_dt = best_dt.predict(X_test)

#### 4. Evaluation

In [None]:
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt, pos_label="M")
recall_dt = recall_score(y_test, y_pred_dt, pos_label="M")
f1_dt = f1_score(y_test, y_pred_dt, pos_label="M")

# pd.DataFrame({
#     "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
#     "Score": [accuracy_dt, precision_dt, recall_dt, f1_dt],
# })

In [34]:
print(f"Classification Report: \n {classification_report(y_test, y_pred_dt)}")
print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred_dt)}")

Classification Report: 
               precision    recall  f1-score   support

           B       0.93      0.97      0.95        69
           M       0.95      0.89      0.92        45

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114

Confusion Matrix: 
 [[67  2]
 [ 5 40]]
