In [34]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import KBinsDiscretizer
from sklearn.compose import ColumnTransformer

In [35]:
df = pd.read_csv('train.csv', usecols=['Age','Fare','Survived'])
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [36]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [37]:
df.dropna(inplace=True)

In [38]:
X= df.iloc[:,1:3]
y = df[['Survived']]

X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.3)

In [39]:
clf = DecisionTreeClassifier()

clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)

accuracy_score(y_pred,y_test)

0.6558139534883721

In [40]:
np.mean(cross_val_score(DecisionTreeClassifier(),X,y,cv=10,scoring='accuracy'))

np.float64(0.6359154929577465)

In [41]:
kbin_age = KBinsDiscretizer(n_bins=5,encode='ordinal',strategy='kmeans')
kbin_fare = KBinsDiscretizer(n_bins=5,encode='ordinal',strategy='kmeans')

In [42]:
trf = ColumnTransformer([
    ('first',kbin_age,[0]),
    ('second',kbin_fare,[1])
])

In [43]:
X_train_trans = trf.fit_transform(X_train)
X_test_trans = trf.transform(X_test)
X_test_trans

array([[0., 0.],
       [1., 0.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [4., 2.],
       [2., 2.],
       [2., 0.],
       [2., 0.],
       [0., 0.],
       [1., 0.],
       [2., 0.],
       [0., 2.],
       [3., 0.],
       [2., 3.],
       [1., 1.],
       [1., 1.],
       [3., 0.],
       [2., 0.],
       [2., 1.],
       [1., 0.],
       [4., 0.],
       [1., 0.],
       [1., 1.],
       [2., 0.],
       [1., 0.],
       [1., 0.],
       [3., 0.],
       [0., 0.],
       [1., 0.],
       [1., 0.],
       [4., 0.],
       [2., 0.],
       [2., 0.],
       [2., 0.],
       [1., 0.],
       [2., 0.],
       [2., 0.],
       [1., 0.],
       [1., 0.],
       [0., 0.],
       [2., 0.],
       [2., 0.],
       [0., 0.],
       [3., 0.],
       [0., 0.],
       [2., 0.],
       [1., 0.],
       [2., 1.],
       [3., 0.],
       [3., 0.],
       [3., 0.],
       [2., 1.],
       [3., 3.],
       [4., 0.],
       [3., 0.],
       [3., 2.],
       [3., 0.],
       [1., 0.

In [44]:
output = pd.DataFrame(
{
    'age':X_train['Age'],
    'age_trf':X_train_trans[:,0],
    'fare':X_train['Fare'],
    'fare_trf':X_train_trans[:,1]
}
)

output.head()

Unnamed: 0,age,age_trf,fare,fare_trf
242,29.0,2.0,10.5,0.0
144,18.0,1.0,11.5,0.0
847,35.0,2.0,7.8958,0.0
874,28.0,2.0,24.0,0.0
391,21.0,1.0,7.7958,0.0


In [45]:
output['age_label'] =pd.cut(x=X_train['Age'],bins=trf.named_transformers_['first'].bin_edges_[0].tolist())
output['fare_label'] =pd.cut(x=X_train['Fare'],bins=trf.named_transformers_['second'].bin_edges_[0].tolist())

In [46]:
output.sample(5)

Unnamed: 0,age,age_trf,fare,fare_trf,age_label,fare_label
642,2.0,0.0,27.9,0.0,"(0.67, 13.068]","(0.0, 42.189]"
118,24.0,1.0,247.5208,3.0,"(13.068, 26.649]","(187.93, 378.114]"
365,30.0,2.0,7.25,0.0,"(26.649, 39.258]","(0.0, 42.189]"
9,14.0,1.0,30.0708,0.0,"(13.068, 26.649]","(0.0, 42.189]"
556,48.0,3.0,39.6,0.0,"(39.258, 54.224]","(0.0, 42.189]"


In [47]:
output.isnull().sum()

age           0
age_trf       0
fare          0
fare_trf      0
age_label     1
fare_label    6
dtype: int64

In [48]:
clf  =DecisionTreeClassifier()

clf.fit(X_train_trans,y_train)

y_pred = clf.predict(X_test_trans)
accuracy_score(y_pred,y_test)

0.6790697674418604