In [73]:
import numpy as np
import pandas as pd

In [74]:
df = pd.read_csv('../../../data/titanic.csv')

In [75]:
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
223,224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S
690,691,1,1,"Dick, Mr. Albert Adrian",male,31.0,1,0,17474,57.0,B20,S
688,689,0,3,"Fischer, Mr. Eberhard Thelander",male,18.0,0,0,350036,7.7958,,S
129,130,0,3,"Ekstrom, Mr. Johan",male,45.0,0,0,347061,6.975,,S
821,822,1,3,"Lulic, Mr. Nikola",male,27.0,0,0,315098,8.6625,,S


In [76]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)

In [77]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


# Train Test Split

In [78]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['Survived']),
                                                    df['Survived'],
                                                    test_size=0.2,
                                                    random_state=42)

X_train.shape, X_test.shape

((712, 7), (179, 7))

In [79]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

# Applying Imputation

In [80]:
# Applying Imputation on Age and Embarked 
from sklearn.impute import SimpleImputer
si_age = SimpleImputer() # --> replace missing values with mean
si_embarked = SimpleImputer(strategy='most_frequent') # replace missing values with most frequent value

X_train_age = si_age.fit_transform(X_train[['Age']])
X_test_age = si_age.transform(X_test[['Age']]) # --> these are new dataframes the original is same as before

X_train_embarked = si_embarked.fit_transform(X_train[['Embarked']])
X_test_embarked = si_embarked.transform(X_test[['Embarked']])

# One Hot Encoding

In [81]:
# One Hot Encoding Sex and Embarked
from sklearn.preprocessing import OneHotEncoder

ohe_sex = OneHotEncoder(sparse_output=False,handle_unknown='ignore')# --> handle unknown ignore will make the entries on a new category 0 which was not present during fit.
ohe_embarked = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_test_sex = ohe_sex.transform(X_test[['Sex']])

X_train_embarked = ohe_embarked.fit_transform(X_train_embarked)# --> X_trian_embarked is used because the original dataframe has missing values for embarked column
X_test_embarked = ohe_embarked.transform(X_test_embarked)

# Combining the columns


In [82]:
X_train_rem = X_train.drop(columns=['Age','Sex','Embarked'])
X_test_rem = X_test.drop(columns=['Age','Sex','Embarked'])

In [83]:
X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_embarked,X_train_sex),axis=1)
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_embarked,X_test_sex),axis=1)

# Training Model

In [84]:
from sklearn.tree import DecisionTreeClassifier

In [85]:
clf = DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [86]:
y_pred = clf.predict(X_test_transformed)

In [87]:
from sklearn.metrics import accuracy_score

In [88]:
accuracy_score(y_test,y_pred)

0.7988826815642458

# Importing Model

In [89]:
import pickle

In [91]:
pickle.dump(ohe_sex,open('models/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked,open('models/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('models/clf.pkl','wb'))