<b>Installing Libraries</b>

In [None]:
%pip install matplotlib
%pip install seaborn
%pip install plotly
%pip install scipy
%pip install scikit-learn

<b>Importing Libraries</b>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                           confusion_matrix, classification_report, roc_auc_score, roc_curve)

''' Different ML algorithms '''
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

''' Hyperparameter Distributions '''
from scipy.stats import uniform, randint
import time
import warnings
warnings.filterwarnings('ignore')

<b>Loading the data and Exploring it</b>

In [3]:
# Loading breast cancer dataset (binary classification)
data = load_breast_cancer()
X, y = data.data, data.target

print(f"Dataset: {data.filename}")
print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Target classes: {data.target_names}")
print(f"Class distribution: {np.bincount(y)}")

Dataset: breast_cancer.csv
Number of samples: 569
Number of features: 30
Target classes: ['malignant' 'benign']
Class distribution: [212 357]


<b>Data preprocessing</b>

In [4]:
# Splitting data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature scaling (important for algorithms like SVM, KNN, Logistic Regression)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print("Features have been standardized using StandardScaler")

Training set size: (455, 30)
Test set size: (114, 30)
Features have been standardized using StandardScaler


<b>Initialiing multiple ML models</b>

In [5]:
# Dictionary to store all models
models = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC(random_state=42, probability=True),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

print(f"Total models to compare: {len(models)}")
for model_name in models.keys():
    print(f"✓ {model_name}")

Total models to compare: 7
✓ Logistic Regression
✓ Random Forest
✓ Support Vector Machine
✓ K-Nearest Neighbors
✓ Gradient Boosting
✓ Naive Bayes
✓ Decision Tree


<b>Train and evaluate baseline models</b>

In [6]:
# Dictionary to store results
baseline_results = {}

# Function to calculate comprehensive metrics
def calculate_metrics(y_true, y_pred, y_proba=None):
    """Calculate all evaluation metrics"""
    metrics = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1-Score': f1_score(y_true, y_pred, average='weighted')
    }
    
    # Adding AUC for binary classification
    if y_proba is not None:
        metrics['AUC'] = roc_auc_score(y_true, y_proba[:, 1])
    
    return metrics

# Training and evaluating each model
for model_name, model in models.items():
    print(f"\nTraining {model_name}...")
    
    # Start timing
    start_time = time.time()
    
    # Training the model
    if model_name in ['Support Vector Machine', 'K-Nearest Neighbors', 'Logistic Regression']:
        # These models benefit from scaled features
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        y_proba = model.predict_proba(X_test_scaled) if hasattr(model, 'predict_proba') else None
    else:
        # Tree-based models don't require scaling
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test) if hasattr(model, 'predict_proba') else None
    
    # Calculating training time
    training_time = time.time() - start_time
    
    # Calculating metrics
    metrics = calculate_metrics(y_test, y_pred, y_proba)
    metrics['Training Time'] = training_time
    
    # Storing results
    baseline_results[model_name] = metrics
    
    print(f"  Accuracy: {metrics['Accuracy']:.4f}")
    print(f"  F1-Score: {metrics['F1-Score']:.4f}")
    print(f"  Training Time: {training_time:.4f}s")


Training Logistic Regression...
  Accuracy: 0.9825
  F1-Score: 0.9825
  Training Time: 0.0346s

Training Random Forest...
  Accuracy: 0.9561
  F1-Score: 0.9560
  Training Time: 0.1953s

Training Support Vector Machine...
  Accuracy: 0.9825
  F1-Score: 0.9825
  Training Time: 0.0228s

Training K-Nearest Neighbors...
  Accuracy: 0.9561
  F1-Score: 0.9560
  Training Time: 2.9685s

Training Gradient Boosting...
  Accuracy: 0.9561
  F1-Score: 0.9558
  Training Time: 0.6200s

Training Naive Bayes...
  Accuracy: 0.9386
  F1-Score: 0.9384
  Training Time: 0.0000s

Training Decision Tree...
  Accuracy: 0.9123
  F1-Score: 0.9130
  Training Time: 0.0147s


<b>Display baseline results</b>

In [7]:
# Creating results DataFrame for better visualization
results_df = pd.DataFrame(baseline_results).T
results_df = results_df.round(4)

print("Baseline Model Performance:")
print(results_df)

# Finding best performing model
best_model_name = results_df['F1-Score'].idxmax()
print(f"\nBest performing baseline model: {best_model_name}")
print(f"Best F1-Score: {results_df.loc[best_model_name, 'F1-Score']:.4f}")

Baseline Model Performance:
                        Accuracy  Precision  Recall  F1-Score     AUC  \
Logistic Regression       0.9825     0.9825  0.9825    0.9825  0.9954   
Random Forest             0.9561     0.9561  0.9561    0.9560  0.9937   
Support Vector Machine    0.9825     0.9825  0.9825    0.9825  0.9950   
K-Nearest Neighbors       0.9561     0.9561  0.9561    0.9560  0.9788   
Gradient Boosting         0.9561     0.9569  0.9561    0.9558  0.9907   
Naive Bayes               0.9386     0.9384  0.9386    0.9384  0.9878   
Decision Tree             0.9123     0.9161  0.9123    0.9130  0.9157   

                        Training Time  
Logistic Regression            0.0346  
Random Forest                  0.1953  
Support Vector Machine         0.0228  
K-Nearest Neighbors            2.9685  
Gradient Boosting              0.6200  
Naive Bayes                    0.0000  
Decision Tree                  0.0147  

Best performing baseline model: Logistic Regression
Best F1-Score:

<b>Hyperparameter tuning with GridSearchCV</b>

In [8]:
# Defining parameter grids for top 3 models
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    },
    'Support Vector Machine': {
        'C': [0.1, 1, 10, 100],
        'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
        'kernel': ['rbf', 'poly', 'linear']
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.8, 1.0]
    }
}

# Performing GridSearchCV for selected models
grid_search_results = {}

for model_name, param_grid in param_grids.items():
    print(f"\nPerforming GridSearchCV for {model_name}...")
    
    # Get the base model
    base_model = models[model_name]
    
    # Creating GridSearchCV object
    grid_search = GridSearchCV(
        estimator=base_model,
        param_grid=param_grid,
        cv=5,  # 5-fold cross-validation
        scoring='f1_weighted',
        n_jobs=-1,  # Use all available processors
        verbose=1
    )
    
    # Fit GridSearchCV
    start_time = time.time()
    if model_name == 'Support Vector Machine':
        grid_search.fit(X_train_scaled, y_train)
    else:
        grid_search.fit(X_train, y_train)
    
    tuning_time = time.time() - start_time
    
    # Get best model and evaluate
    best_model = grid_search.best_estimator_
    
    # Making predictions
    if model_name == 'Support Vector Machine':
        y_pred = best_model.predict(X_test_scaled)
        y_proba = best_model.predict_proba(X_test_scaled)
    else:
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)
    
    # Calculating metrics
    metrics = calculate_metrics(y_test, y_pred, y_proba)
    metrics['Tuning Time'] = tuning_time
    metrics['Best Params'] = grid_search.best_params_
    metrics['CV Score'] = grid_search.best_score_
    
    grid_search_results[model_name] = metrics
    
    print(f"  Best CV Score: {grid_search.best_score_:.4f}")
    print(f"  Test F1-Score: {metrics['F1-Score']:.4f}")
    print(f"  Best Parameters: {grid_search.best_params_}")


Performing GridSearchCV for Random Forest...
Fitting 5 folds for each of 81 candidates, totalling 405 fits


<b>Hyperparameter tuning with RandomizedSearchCV</b>

In [None]:
# Defining parameter distributions for RandomizedSearchCV
param_distributions = {
    'Random Forest': {
        'n_estimators': randint(100, 500),
        'max_depth': [10, 20, 30, None],
        'min_samples_split': randint(2, 11),
        'min_samples_leaf': randint(1, 5),
        'max_features': ['auto', 'sqrt', 'log2']
    },
    'Support Vector Machine': {
        'C': uniform(0.1, 100),
        'gamma': uniform(0.001, 1),
        'kernel': ['rbf', 'poly', 'linear']
    },
    'Gradient Boosting': {
        'n_estimators': randint(100, 300),
        'learning_rate': uniform(0.01, 0.3),
        'max_depth': randint(3, 10),
        'subsample': uniform(0.6, 0.4)
    }
}

# Performing RandomizedSearchCV
random_search_results = {}

for model_name, param_dist in param_distributions.items():
    print(f"\nPerforming RandomizedSearchCV for {model_name}...")
    
    # Get the base model
    base_model = models[model_name]
    
    # Creating RandomizedSearchCV object
    random_search = RandomizedSearchCV(
        estimator=base_model,
        param_distributions=param_dist,
        n_iter=50,  # Number of parameter settings sampled
        cv=5,
        scoring='f1_weighted',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    
    # Fit RandomizedSearchCV
    start_time = time.time()
    if model_name == 'Support Vector Machine':
        random_search.fit(X_train_scaled, y_train)
    else:
        random_search.fit(X_train, y_train)
    
    tuning_time = time.time() - start_time
    
    # Get best model and evaluate
    best_model = random_search.best_estimator_
    
    # Making predictions
    if model_name == 'Support Vector Machine':
        y_pred = best_model.predict(X_test_scaled)
        y_proba = best_model.predict_proba(X_test_scaled)
    else:
        y_pred = best_model.predict(X_test)
        y_proba = best_model.predict_proba(X_test)
    
    # Calculating metrics
    metrics = calculate_metrics(y_test, y_pred, y_proba)
    metrics['Tuning Time'] = tuning_time
    metrics['Best Params'] = random_search.best_params_
    metrics['CV Score'] = random_search.best_score_
    
    random_search_results[model_name] = metrics
    
    print(f"  Best CV Score: {random_search.best_score_:.4f}")
    print(f"  Test F1-Score: {metrics['F1-Score']:.4f}")
    print(f"  Best Parameters: {random_search.best_params_}")


Performing RandomizedSearchCV for Random Forest...
Fitting 5 folds for each of 50 candidates, totalling 250 fits
