# Grid Search

- Titanic Dataset

### Import necessary libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

### Load the Titanic dataset

In [2]:
titanic_data = pd.read_csv('titanic (1).csv')  # Replace 'path_to_titanic.csv' with the actual file path

### Preprocess the dataset

In [3]:
# Drop 'Name,' 'Ticket,' and 'Cabin' columns
titanic_data = titanic_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)


In [4]:
# Handle missing values (e.g., fill missing ages with the mean age)
titanic_data['Age'].fillna(titanic_data['Age'].mean(), inplace=True)
titanic_data['Embarked'].fillna(titanic_data['Embarked'].mode()[0], inplace=True)


In [5]:
# Encode categorical variables ('Sex' and 'Embarked')
label_encoder = LabelEncoder()
titanic_data['Sex'] = label_encoder.fit_transform(titanic_data['Sex'])
titanic_data['Embarked'] = label_encoder.fit_transform(titanic_data['Embarked'])


In [6]:
# Define features (X) and target (y)
X = titanic_data.drop('Survived', axis=1)  # Assuming 'Survived' is the target variable
y = titanic_data['Survived']

In [7]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)


In [8]:
# Create a Decision Tree classifier
clf = DecisionTreeClassifier(random_state=42)

In [9]:
clf.fit(X_train,y_train)

DecisionTreeClassifier(random_state=42)

In [11]:
y_pred = clf.predict(X_test)

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
Accuracy = accuracy_score(y_test,y_pred)
Accuracy

0.7597765363128491

### Hyperparameter grid to search

In [19]:
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_features':[5,7,8,10],
    'splitter':['best', 'random'],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


In [20]:
# Perform Grid Search with cross-validation (e.g., K=5)
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


 0.77384024 0.80056141 0.79076135 0.78784596 0.79209101 0.7850389
 0.81039102 0.79915296 0.81039102 0.79915296 0.79066286 0.7808628
 0.75696838 0.79214025 0.76545849 0.79918251 0.75559933 0.79349946
 0.76819659 0.7878755  0.78646705 0.78792475 0.7809022  0.79913326
 0.80473752 0.80337831 0.80473752 0.80337831 0.80754457 0.7780459
 0.75415148 0.7879149  0.76403034 0.81033192 0.76683739 0.80617551
 0.77943465 0.7921698  0.7794445  0.77242194 0.76959519 0.80475721
 0.79348961 0.80758397 0.79348961 0.80758397 0.79630651 0.80332907
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
 0.73738796 0.73458091 0.76402049 0.78227125 0.77101349 0.80055156
 0.77387964 0.76683739 0.79494731 0.7864572  0.7892938  0.79766571
 0.79210086 0.7921304  0.79210086 0.7921304  0.76676844 0.7907318
 0.75140353 0.73315276 0.75987393 0.78794445 0.75140353 0.78650645

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [10, 20, 30, 40, 50],
                         'max_features': [5, 7, 8, 10],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'splitter': ['best', 'random']},
             scoring='accuracy')

In [21]:
# Get the best hyperparameters
best_params = grid_search.best_params_
best_params

{'criterion': 'entropy',
 'max_depth': 10,
 'max_features': 5,
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'splitter': 'random'}

In [22]:
# Train a Decision Tree classifier with the best hyperparameters
best_clf = DecisionTreeClassifier(random_state=42, **best_params)
best_clf.fit(X_train, y_train)


DecisionTreeClassifier(criterion='entropy', max_depth=10, max_features=5,
                       min_samples_leaf=2, min_samples_split=10,
                       random_state=42, splitter='random')

In [25]:
# Evaluate the model on the test set
y_pred = best_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8156424581005587

In [24]:
# Print the best hyperparameters and model accuracy
print(f"Best Hyperparameters: {best_params}")
print(f"Model Accuracy on Test Data: {accuracy:.2f}")

Best Hyperparameters: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 5, 'min_samples_leaf': 2, 'min_samples_split': 10, 'splitter': 'random'}
Model Accuracy on Test Data: 0.82
