In [250]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [251]:
df = pd.read_csv('train.csv')
df.sample(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
653,654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q
381,382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1.0,0,2,2653,15.7417,,C
160,161,0,3,"Cribb, Mr. John Hatfield",male,44.0,0,1,371362,16.1,,S
39,40,1,3,"Nicola-Yarred, Miss. Jamila",female,14.0,1,0,2651,11.2417,,C
254,255,0,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",female,41.0,0,2,370129,20.2125,,S


In [252]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True)
df.sample(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
845,0,3,male,42.0,0,0,7.55,S
96,0,1,male,71.0,0,0,34.6542,C
618,1,2,female,4.0,2,1,39.0,S
212,0,3,male,22.0,0,0,7.25,S
13,0,3,male,39.0,1,5,31.275,S


In [253]:
from sklearn.model_selection import train_test_split

X_train , X_test , y_train , y_test = train_test_split(df.iloc[:,1:8],df.iloc[:,0],test_size=0.2)

In [254]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [255]:
#Applying imputation to fill up null values

si_age = SimpleImputer()
si_emb = SimpleImputer(strategy="most_frequent")

X_train_age = si_age.fit_transform(X_train[['Age']])
X_train_emb = si_emb.fit_transform(X_train[['Embarked']])

X_test_age = si_age.transform(X_test[['Age']])
X_test_emb = si_emb.transform(X_test[['Embarked']])


In [256]:
#One hot encoding to encode categorical value

ohe_sex = OneHotEncoder(sparse_output=False ,handle_unknown='ignore')
ohe_emb = OneHotEncoder(sparse_output=False,handle_unknown='ignore')

X_train_sex = ohe_sex.fit_transform(X_train[['Sex']])
X_train_emb = ohe_emb.fit_transform(X_train_emb)

X_test_sex = ohe_sex.transform(X_test[['Sex']])
X_test_emb = ohe_emb.transform(X_test_emb)



In [257]:
X_train_rem = X_train.drop(columns=['Age','Sex','Embarked'])
X_test_rem = X_test.drop(columns=['Age','Sex','Embarked'])

In [258]:
X_train_transformed = np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_emb),axis=1)
X_test_transformed = np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_emb),axis=1)

In [259]:
clf = DecisionTreeClassifier()

clf.fit(X_train_transformed,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [260]:
y_pred = clf.predict(X_test_transformed)
y_pred

array([0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1])

In [261]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test,y_pred)

0.7653631284916201

In [262]:
import pickle

pickle.dump(ohe_sex,open('model/ohe_sex.pkl','wb'))
pickle.dump(ohe_emb,open('model/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('model/clf.pkl','wb'))