<a href="https://colab.research.google.com/github/Hassan7838/titanic-hyperparameter-optimization/blob/main/titanic-hyperparameter-optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Hyperparameter Optimization of ML Model for Survival Prediction

# Upload Dataset
# Only prompt for an upload when train.csv is not already in the working
# directory. An unconditional upload on re-run makes Colab save the new file
# under a different name (e.g. "train (1).csv"), while the loading cell keeps
# reading the older "train.csv".
import os

from google.colab import files

# `upload` stays a dict in both branches (empty when nothing was uploaded).
upload = files.upload() if not os.path.exists("train.csv") else {}

Saving train.csv to train (1).csv


In [None]:
# load and explore dataset
import pandas as pd

# Read the uploaded training file from the working directory.
df = pd.read_csv("train.csv")

print("----First Five Rows----")
print(df.head())
print("----Information----")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that appends a stray "None" line).
df.info()
print("----Null Values----")
# Per-column count of missing values.
print(df.isnull().sum())

----First Five Rows----
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.

In [None]:
# remove the columns that are not used as model features
unused_columns = ["PassengerId", "Name", "Ticket", "Cabin"]
df = df.drop(columns=unused_columns)

In [None]:
# missing values
# Assign the filled Series back to the column instead of calling
# fillna(..., inplace=True) on df[col]: the chained in-place form acts on an
# intermediate object, emits a FutureWarning, and no longer updates df in
# pandas 3.0.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

# NOTE(review): the median/mode are computed on the full frame, before the
# train/test split performed later in the notebook — confirm this is intended.

# checking... (the filled columns should now report zero nulls)
print(df.isnull().sum())

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].median(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna(df["Embarked"].mode()[0],inplace=True)


In [None]:
# feature engineering
# FamilySize = SibSp + Parch + 1 (the +1 presumably counts the passenger
# themself — confirm against the dataset description).
df = df.assign(FamilySize=df["SibSp"].add(df["Parch"]).add(1))

# quick look at the first three rows to confirm the new column
print(df.head(3))

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked  FamilySize
0         0       3    male  22.0      1      0   7.2500        S           2
1         1       1  female  38.0      1      0  71.2833        C           2
2         1       3  female  26.0      0      0   7.9250        S           1


In [None]:
# encode categorical data
# Sex becomes a 0/1 integer code; Embarked is one-hot encoded with the first
# level dropped.
df = pd.get_dummies(
    df.assign(Sex=df["Sex"].map({'male': 0, 'female': 1})),
    columns=['Embarked'],
    drop_first=True,
)

# checking...
print(df.head(3))

   Survived  Pclass  Sex   Age  SibSp  Parch     Fare  FamilySize  Embarked_Q  \
0         0       3    0  22.0      1      0   7.2500           2       False   
1         1       1    1  38.0      1      0  71.2833           2       False   
2         1       3    1  26.0      0      0   7.9250           1       False   

   Embarked_S  
0        True  
1       False  
2        True  


In [None]:
# Separate the target column (y) from the predictor columns (X)
y = df['Survived']
X = df.drop(columns=['Survived'])

In [None]:
# Train & test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# train decision tree
from sklearn.tree import DecisionTreeClassifier

# Baseline model: all hyperparameters left at their defaults, fixed seed only.
dt_default = DecisionTreeClassifier(random_state=1)
dt_default.fit(X=X_train, y=y_train)

In [None]:
# evaluate default model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

y_pred_default = dt_default.predict(X_test)

# Each entry pairs an output template with the metric that fills it in;
# scores are reported to two decimal places.
for template, score_fn in (
    ("Accuracy: {:.2f}", accuracy_score),
    ("Precision: {:.2f}", precision_score),
    ("Recall: {:.2f}", recall_score),
    ("F1 Score: {:.2f}", f1_score),
):
    print(template.format(score_fn(y_test, y_pred_default)))

Accuracy: 0.73
Precision: 0.69
Recall: 0.60
F1 Score: 0.64


In [None]:
# Grid Search
from sklearn.model_selection import GridSearchCV

# Candidate values tried exhaustively (every combination is evaluated).
param_grid = {
    'max_depth': [None, 3, 5, 7, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
}

# create model
dt = DecisionTreeClassifier(random_state=1)

# 5-fold cross-validation on the training split, scored by accuracy;
# n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
).fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation accuracy:", grid_search.best_score_)

Best Parameters: {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Cross-Validation accuracy: 0.8329360780065006


In [None]:
# Test Grid Search
# Score the best estimator found by the grid search on the held-out test split.
best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

print("Grid Search Model Performance:")
for label, score_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, score_fn(y_test, y_pred_grid))

Grid Search Model Performance:
Accuracy: 0.8044692737430168
Precision: 0.8166666666666667
Recall: 0.6712328767123288
F1 Score: 0.7368421052631579


In [None]:
# Random Search
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Lists are sampled from directly; the randint distributions draw integers
# from [low, high) for each sampled candidate.
param_dist = {
    'max_depth': [3, 5, 7, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'criterion': ['gini', 'entropy'],
}

dt_model = DecisionTreeClassifier(random_state=1)

# 20 sampled candidates, 5-fold cross-validation, scored by accuracy;
# random_state fixes which candidates are drawn.
random_search = RandomizedSearchCV(
    estimator=dt_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=1,
    n_jobs=-1,
).fit(X_train, y_train)

print("Best Hyperparameters (Randomized Search):", random_search.best_params_)
print("Best Cross-Validation Score:", random_search.best_score_)


Best Hyperparameters (Randomized Search): {'criterion': 'entropy', 'max_depth': 3, 'min_samples_leaf': 8, 'min_samples_split': 15}
Best Cross-Validation Score: 0.8329360780065006


In [None]:
# Test Random Search
# Score the best estimator found by the randomized search on the held-out test split.
best_rand_model = random_search.best_estimator_
y_pred_rand = best_rand_model.predict(X_test)

print("Random Search Model Performance:")
for label, score_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(label, score_fn(y_test, y_pred_rand))

Random Search Model Performance:
Accuracy: 0.8044692737430168
Precision: 0.8166666666666667
Recall: 0.6712328767123288
F1 Score: 0.7368421052631579


In [None]:
# Summary Table
import pandas as pd

# Test-set predictions of each model, in the row order of the table.
model_predictions = {
    "Default": y_pred_default,
    "Grid Search": y_pred_grid,
    "Randomized Search": y_pred_rand,
}

# Column name -> metric function, in the column order of the table.
metric_functions = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1 Score": f1_score,
}

# One list per column: the model names first, then each metric evaluated
# against y_test for every model.
results = {"Model": list(model_predictions)}
for column, score_fn in metric_functions.items():
    results[column] = [
        score_fn(y_test, predicted) for predicted in model_predictions.values()
    ]

df_results = pd.DataFrame(results)
print(df_results)

               Model  Accuracy  Precision    Recall  F1 Score
0            Default  0.726257   0.687500  0.602740  0.642336
1        Grid Search  0.804469   0.816667  0.671233  0.736842
2  Randomized Search  0.804469   0.816667  0.671233  0.736842
