# **MODEL EVALUATION**

# Step-1 : Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Step-2 : Data Collection

In [2]:
# Read the data

df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


# Step-3 : Exploratory Data Analysis (EDA)

In [3]:
# Shape of the data

df.shape

(891, 12)

In [4]:
# Description of numerical data

df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
# Description of categorical data

df.describe(include='object')

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Dooley, Mr. Patrick",male,347082,G6,S
freq,1,577,7,4,644


In [6]:
# Check for duplicated values

df.duplicated().sum()

np.int64(0)

In [7]:
# Check for missing values

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

# Step-4 : Data Preprocessing

In [8]:
# Drop the columns that are not useful

df.drop(columns=["Cabin", "Name", "Ticket"], inplace=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [9]:
# Categories of the Embarked column

df["Embarked"].value_counts()

Embarked
S    644
C    168
Q     77
Name: count, dtype: int64

In [10]:
# Fill the missing values in the Embarked column with the most frequent value

df["Embarked"].fillna("S", inplace=True)
df["Embarked"].value_counts()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Embarked"].fillna("S", inplace=True)


Embarked
S    646
C    168
Q     77
Name: count, dtype: int64

In [11]:
# Fill the missing values in the Age column with the mean

df["Age"].fillna(df["Age"].mean(), inplace=True)
df["Age"].describe()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)


count    891.000000
mean      29.699118
std       13.002015
min        0.420000
25%       22.000000
50%       29.699118
75%       35.000000
max       80.000000
Name: Age, dtype: float64

In [12]:
# Check for Age below 1 and replace with 1

df[df["Age"] < 1] = 1
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [13]:
# Round the Age column as it is not possible to have Age in decimal

df["Age"].round().astype(int)

0      22
1      38
2      26
3      35
4      35
       ..
886    27
887    19
888    30
889    26
890    32
Name: Age, Length: 891, dtype: int64

In [14]:
# Convert the categorical columns to numerical columns using one-hot encoding

df = pd.get_dummies(df, drop_first=True)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,False,True,False,False,True
1,2,1,1,38.0,1,0,71.2833,True,False,True,False,False
2,3,1,3,26.0,0,0,7.925,True,False,False,False,True
3,4,1,1,35.0,1,0,53.1,True,False,False,False,True
4,5,0,3,35.0,0,0,8.05,False,True,False,False,True


In [15]:
# Check the data types

df.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Age            float64
SibSp            int64
Parch            int64
Fare           float64
Sex_female        bool
Sex_male          bool
Embarked_C        bool
Embarked_Q        bool
Embarked_S        bool
dtype: object

# **CROSS-VALIDATION : K-FOLD**

### Step-5 : Data Splitting

In [16]:
# Split the data into features and target using cross validation score

x = df.drop(columns= ["Survived"])
y = df["Survived"]

### Step-6 : Model Development

In [17]:
# Model selection

model = DecisionTreeClassifier()
model.fit(x, y)

### Step-7 : Model Evaluation

In [18]:
# Perform K-Fold cross-validation with k = 5

k = 5
score = cross_val_score(model, x, y, cv=k)

In [19]:
# Cross-validation score (accuracy)

print(f"Cross-validation score: {score}")
print(f"Average cross-validation score: {score.mean()}")
print(f"Standard deviation of the score: {score.std()}")

Cross-validation score: [0.70949721 0.79775281 0.76966292 0.75842697 0.84831461]
Average cross-validation score: 0.7767309020149394
Standard deviation of the score: 0.04576353909834903


# **HYPERPARAMETER TUNING : GRID SEARCH**

### Step-5 : Data Splitting

In [20]:
# Split the data into features and target using train_test_split

x = df.drop(columns= ["Survived"])
y = df["Survived"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

### Step-6 : Model Development

In [21]:
# Model selection

model = DecisionTreeClassifier()
model.fit(x_train, y_train)

Hyperparameter Grid Search

In [22]:
# Select the parameters and Perform Grid Search with cross-validation

grid_parameter = {
                    'criterion': ['gini', 'entropy'],
                    'max_depth': [None, 10, 20, 30, 40, 50],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4]
                }
            
grid_search = GridSearchCV(model, grid_parameter, cv=5)
grid_search.fit(x_train, y_train)

In [23]:
# Find the best parameters

best_parameter = grid_search.best_params_
best_parameter

{'criterion': 'entropy',
 'max_depth': 10,
 'min_samples_leaf': 4,
 'min_samples_split': 5}

In [24]:
# Fit the model with the best parameters

best_model = DecisionTreeClassifier(**best_parameter)
best_model.fit(x_train, y_train)

### Step-7 : Model Evaluation

In [25]:
# Predict the target values using the best model

y_pred = best_model.predict(x_test)
y_pred

array([0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0])

In [26]:
# Classification report of the model with the best parameters

classification = classification_report(y_test, y_pred)
print("Classification Report:\n", classification)

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.84      0.83       105
           1       0.76      0.73      0.74        74

    accuracy                           0.79       179
   macro avg       0.79      0.78      0.79       179
weighted avg       0.79      0.79      0.79       179

