# Hyper Parameter Tuning

#### Comparing GridSearchCV and RandomizedSearchCV

# Importing Libraries and Classes

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import numpy as np

# Import Dataset

In [32]:
df = pd.read_csv('datasets/Titanic-Dataset.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Creating preprocessing pipelines

In [33]:
cat_features = ['Sex', 'Cabin', 'Embarked']
num_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']

cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore")
)

num_pipeline = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler()
)

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_pipeline, cat_features),
        ('num', num_pipeline, num_features)
    ]
)

# Data split

In [34]:
x = df.drop(['Survived', 'Name', 'PassengerId', 'Ticket'], axis=1)
y = df.Survived

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# Cross valiadtion, Model and Final Pipeline

In [35]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rf = RandomForestClassifier()

rf_pipeline = make_pipeline(
    preprocessor,
    rf
)

# rf_pipeline.named_steps

cv_results = cross_validate(
    rf_pipeline,
    x_train,
    y_train,
    scoring = ['accuracy'],
    cv=cv,
    n_jobs=-1,
    return_train_score=True
)

print("Baseline Test Accuracy: ", np.mean(cv_results['test_accuracy']))


Baseline Test Accuracy:  0.7963459076135133


# Grid Search CV evaluation

In [36]:
param_grid = {
    "randomforestclassifier__n_estimators": [1, 10, 100, 1000],
    "randomforestclassifier__max_depth": [None, 5, 10, 20],
    "randomforestclassifier__min_samples_split": [2, 5, 10],
    "randomforestclassifier__max_features": [None, "sqrt", "log2"]
}

grid_cv = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1
)

grid_cv.fit(x_train, y_train)

print("Grid best params", grid_cv.best_params_)
print("Grid best accuracy", grid_cv.best_score_)

Grid best params {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__max_features': 'sqrt', 'randomforestclassifier__min_samples_split': 10, 'randomforestclassifier__n_estimators': 100}
Grid best accuracy 0.8300502314586822


# RandomizedSearchCV

In [37]:
rand_cv = RandomizedSearchCV(
    rf_pipeline,
    param_distributions=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=1,
    n_iter=20,
    random_state=42
)

rand_cv.fit(x_train, y_train)

print("Random Search best params", rand_cv.best_params_)
print("Random Search best accuracy", rand_cv.best_score_)

Random Search best params {'randomforestclassifier__n_estimators': 100, 'randomforestclassifier__min_samples_split': 5, 'randomforestclassifier__max_features': None, 'randomforestclassifier__max_depth': 10}
Random Search best accuracy 0.8258642765685019


# Final Comparison

In [40]:
comparison = pd.DataFrame({
    "Method": [
        "Baseline RF (CV)",
        "GridSearchCV RF",
        "RandomizedSearchCV RF"
    ],
    "CV Accuracy": [
        np.mean(cv_results["test_accuracy"]),
        grid_cv.best_score_,
        rand_cv.best_score_
    ],
    "Best Parameters": [
        "Default",
        grid_cv.best_params_,
        rand_cv.best_params_
    ]
})

def format_params(params):
    if params == "Default":
        return "Default"
    return ", ".join([f"{k.split('__')[-1]}={v}" for k, v in params.items()])

comparison["Best Parameters"] = comparison["Best Parameters"].apply(format_params)

comparison


Unnamed: 0,Method,CV Accuracy,Best Parameters
0,Baseline RF (CV),0.796346,Default
1,GridSearchCV RF,0.83005,"max_depth=10, max_features=sqrt, min_samples_s..."
2,RandomizedSearchCV RF,0.825864,"n_estimators=100, min_samples_split=5, max_fea..."
