## Training the Model without using Pipelines

In [101]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

import pickle 

In [102]:
# Load the dataset
df = pd.read_csv('Titanic-Dataset.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [103]:
# Dropping unnecessary features
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
df.sample(3)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
37,0,3,male,21.0,0,0,8.05,S
355,0,3,male,28.0,0,0,9.5,S
505,0,1,male,18.0,1,0,108.9,C


In [104]:
X = df.drop(['Survived'], axis=1)
y = df['Survived']

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [105]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 331 to 102
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Pclass    712 non-null    int64  
 1   Sex       712 non-null    object 
 2   Age       572 non-null    float64
 3   SibSp     712 non-null    int64  
 4   Parch     712 non-null    int64  
 5   Fare      712 non-null    float64
 6   Embarked  710 non-null    object 
dtypes: float64(2), int64(3), object(2)
memory usage: 44.5+ KB


In [106]:
# Impute missing values in Age and Embarked
si_age = SimpleImputer(strategy='mean')
si_embarked = SimpleImputer(strategy="most_frequent")

X_train_ageImp = si_age.fit_transform(X_train[['Age']])
X_train_embarkedImp = si_embarked.fit_transform(X_train[['Embarked']])

X_test_ageImp = si_age.transform(X_test[['Age']])
X_test_embarkedImp = si_embarked.transform(X_test[['Embarked']])


In [107]:
# Ohe-Hot encoding Sex and Embarked
ohe_sex = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
ohe_embarked = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

X_train_sexEnc = ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarkedEnc = ohe_embarked.fit_transform(X_train_embarkedImp)

X_test_sexEnc = ohe_sex.transform(X_test[['Sex']])
X_test_embarkedEnc = ohe_embarked.transform(X_test_embarkedImp)


In [108]:
X_train.sample(3)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
14,3,female,14.0,0,0,7.8542,S
627,1,female,21.0,0,0,77.9583,S
162,3,male,26.0,0,0,7.775,S


In [109]:
# Extract the remaining columns from X_train and X_test
X_train_remaining = X_train.drop(columns=['Sex', 'Age', 'Embarked'])
X_test_remaining = X_test.drop(columns=['Sex', 'Age', 'Embarked'])

In [110]:
X_train_remaining.sample(3)

Unnamed: 0,Pclass,SibSp,Parch,Fare
516,2,0,0,10.5
470,3,0,0,7.25
818,3,0,0,6.45


In [111]:
# Now merge both X_train_remaining and preprocessed features for both train and test
X_train_transformed = np.concatenate((X_train_remaining, X_train_ageImp, X_train_sexEnc, X_train_embarkedEnc), axis=1)
X_test_transformed = np.concatenate((X_test_remaining, X_test_ageImp, X_test_sexEnc, X_test_embarkedEnc), axis=1)

In [112]:
X_train_transformed.shape

(712, 8)

In [113]:
# Call the model
clf = DecisionTreeClassifier()

clf.fit(X_train_transformed, y_train)

In [114]:
# Predict for X_test_transformed
y_pred = clf.predict(X_test_transformed)
y_pred

array([0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [115]:
# Evaluate the model and rounding odd upto 5 decimal points
np.round(accuracy_score(y_test, y_pred), 5)

0.77654

## Saving the model

In [116]:
pickle.dump(ohe_sex, open('models/ohe_sex.pkl', 'wb'))
pickle.dump(ohe_embarked, open('models/ohe_embarked.pkl', 'wb'))
pickle.dump(clf, open('models/clf.pkl', 'wb'))