# Objective: Apply and compare Grid Search , Random Search and Bayesian optimisation

In [26]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
from sklearn.model_selection import train_test_split , GridSearchCV , RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score , classification_report
import numpy as np
import os
import shutil
import pandas as pd
import optuna

In [27]:
source_dir = kagglehub.dataset_download("yasserh/titanic-dataset")
target_dir = "../../datasets/traditional/"

os.makedirs(target_dir, exist_ok=True)

for item in os.listdir(source_dir):
    src_path = os.path.join(source_dir, item)
    dst_path = os.path.join(target_dir, item)
    
    if os.path.exists(dst_path):
        print(f"Warning: {dst_path} already exists. Skipping or overwriting?")
    
    shutil.move(src_path, dst_path)

os.rmdir(source_dir)
print(f"Dataset files merged into: {target_dir}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/yasserh/titanic-dataset?dataset_version_number=1...


100%|██████████| 22.0k/22.0k [00:00<00:00, 4.60MB/s]

Extracting files...
Dataset files merged into: ../../datasets/traditional/





In [28]:
df = pd.read_csv("../../datasets/traditional/Titanic-Dataset.csv")
df = df.drop(['Name','PassengerId'], axis=1)
le = LabelEncoder()
for col in ['Sex', 'Cabin','Ticket','Embarked']:
    df[col] = le.fit_transform(df[col].astype(str))

In [29]:
x , y = df.drop(columns=['Survived']) , df['Survived']
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.2,random_state=42)
x_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
331,1,1,45.5,0,0,20,28.5,56,2
733,2,1,23.0,0,0,228,13.0,147,2
382,3,1,32.0,0,0,665,7.925,147,2
704,3,1,26.0,1,0,398,7.8542,147,2
813,3,0,6.0,4,2,333,31.275,147,2


In [30]:
model = RandomForestClassifier(random_state=42)
model.fit(x_train,y_train)
y_pred_base = model.predict(x_test)

## Grid Search Optimisation

- Evalutes all combination
- computationally expensive for large grids
- limited to predefined grid
- good for smaller param space

In [31]:
param_grid = {
    'n_estimators': [50,100,150],
    'max_depth': [None,5,10],
    'min_samples_split':[2,5,10]
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
grid_search.fit(x_train,y_train)
model.fit(x_train,y_train)

best_grid_search_model = grid_search.best_estimator_

y_pred_grid_search = best_grid_search_model.predict(x_test)

## Random Search Optimisation

- Randomly samples combinations
- faster for larger param spaces
- explores more diverse ranges
- good for large parameter spaces with time constraints

In [32]:
param_dist = {
    'n_estimators': np.arange(50,200,10),
    'max_depth': [None,5,10,15],
    'min_samples_split':[2,5,10,20]
}

random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)
random_search.fit(x_train,y_train)
model.fit(x_train,y_train)

best_model_random_search = random_search.best_estimator_

y_pred_random_search = best_model_random_search.predict(x_test)

## Bayesian Optimisation

- Balances Exploration and Exploitation
- Exploration samples parameters from unexplored regrions
- Exploitation refine search around regions with know high performance
- Balances of these approaches is done using acquisition function to minimise unncessary evaluation while improving results

In [33]:
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators',50,500),
        'max_depth': trial.suggest_int('max_depth',3,100),
        'min_samples_split':trial.suggest_int('min_samples_split',2,100)
    }

    model = RandomForestClassifier(random_state=42,**params)
    model.fit(x_train,y_train)
    pred = model.predict(x_test)
    res = accuracy_score(y_test,pred)
    return res

study = optuna.create_study(direction='maximize')
study.optimize(objective,n_trials=30,n_jobs=-1)
print('Best Hyperparameters: ', study.best_params)
print('Best Accuracy: ',study.best_value)

best_bayesian_model = RandomForestClassifier(random_state=42,**study.best_params)
best_bayesian_model.fit(x_train,y_train)
y_pred_bayesian_search = best_bayesian_model.predict(x_test)

[I 2025-11-21 09:45:53,558] A new study created in memory with name: no-name-9045fb0e-7a49-4e17-829e-fedd8971c4cb
[I 2025-11-21 09:45:53,927] Trial 6 finished with value: 0.776536312849162 and parameters: {'n_estimators': 52, 'max_depth': 9, 'min_samples_split': 94}. Best is trial 6 with value: 0.776536312849162.
[I 2025-11-21 09:45:54,225] Trial 1 finished with value: 0.7877094972067039 and parameters: {'n_estimators': 118, 'max_depth': 79, 'min_samples_split': 81}. Best is trial 1 with value: 0.7877094972067039.
[I 2025-11-21 09:45:54,265] Trial 4 finished with value: 0.8100558659217877 and parameters: {'n_estimators': 120, 'max_depth': 30, 'min_samples_split': 69}. Best is trial 4 with value: 0.8100558659217877.
[I 2025-11-21 09:45:54,310] Trial 8 finished with value: 0.8156424581005587 and parameters: {'n_estimators': 78, 'max_depth': 69, 'min_samples_split': 19}. Best is trial 8 with value: 0.8156424581005587.
[I 2025-11-21 09:45:54,349] Trial 3 finished with value: 0.804469273743

Best Hyperparameters:  {'n_estimators': 135, 'max_depth': 15, 'min_samples_split': 4}
Best Accuracy:  0.8268156424581006


## Results

In [34]:
print(f"Base Model Parameters: {model.get_params()}")
print(classification_report(y_test,y_pred_base))

print(f"Best Model hyperpameters (Grid Search): {grid_search.best_params_}")
print(classification_report(y_test,y_pred_grid_search))

print(f"Best Model hyperpameters (Random Search): {random_search.best_params_}")
print(classification_report(y_test,y_pred_random_search))

print(f"Best Model hyperpameters (Bayesian Search): {study.best_params}")
print(classification_report(y_test,y_pred_bayesian_search))

Base Model Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}
              precision    recall  f1-score   support

           0       0.82      0.89      0.85       105
           1       0.82      0.72      0.76        74

    accuracy                           0.82       179
   macro avg       0.82      0.80      0.81       179
weighted avg       0.82      0.82      0.81       179

Best Model hyperpameters (Grid Search): {'max_depth': None, 'min_samples_split': 10, 'n_estimators': 100}
              precision    recall  f1-score   support

           0       0.83      0.89      0.86       105
           1       0.

## Considerations
- use random search to narrow search space
- then use grid search to get the best params