In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer

In [13]:
# titanic dataset
df = pd.read_csv('../../Machine_Learning/tested.csv', usecols = ['Age', 'Fare', 'SibSp', 'Parch', 'Survived'])

In [14]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,34.5,0,0,7.8292
1,1,47.0,1,0,7.0
2,0,62.0,0,0,9.6875
3,0,27.0,0,0,8.6625
4,1,22.0,1,1,12.2875


In [15]:
df.dropna(inplace = True)

In [16]:
df.isna().sum()

Survived    0
Age         0
SibSp       0
Parch       0
Fare        0
dtype: int64

In [17]:
df['family'] = df['SibSp'] + df['Parch']

In [18]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,family
0,0,34.5,0,0,7.8292,0
1,1,47.0,1,0,7.0,1
2,0,62.0,0,0,9.6875,0
3,0,27.0,0,0,8.6625,0
4,1,22.0,1,1,12.2875,2


In [32]:
df.drop(columns = ['SibSp', 'Parch'], inplace=True)

In [33]:
X = df.drop(columns = ['Survived'])
y = df['Survived']

In [34]:
y

0      0
1      1
2      0
3      0
4      1
      ..
409    1
411    1
412    1
414    1
415    0
Name: Survived, Length: 331, dtype: int64

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
X_train.head()

Unnamed: 0,Age,Fare,family
281,0.75,13.775,2
96,76.0,78.85,1
341,32.0,7.5792,0
18,27.0,7.925,1
26,22.0,61.9792,1


## Without Binarization

In [38]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

In [39]:
y_pred = clf.predict(X_test)

In [40]:
accuracy_score(y_test, y_pred)

0.5970149253731343

In [41]:
np.mean(cross_val_score(DecisionTreeClassifier(), X,y, cv=10, scoring = 'accuracy'))

0.6103386809269162

## Applying Binarization

In [42]:
from sklearn.preprocessing import Binarizer

In [43]:
trf = ColumnTransformer([
    ('bin', Binarizer(copy = False), ['family'])
], remainder = 'passthrough')

In [44]:
X_train_trf = trf.fit_transform(X_train)
X_test_trf = trf.transform(X_test)

In [46]:
pd.DataFrame(X_train_trf, columns = ['family', 'Age', 'Fare'])

Unnamed: 0,family,Age,Fare
0,1.0,0.75,13.7750
1,1.0,76.00,78.8500
2,0.0,32.00,7.5792
3,1.0,27.00,7.9250
4,1.0,22.00,61.9792
...,...,...,...
259,0.0,20.00,7.2250
260,0.0,27.00,7.8792
261,0.0,43.00,7.8958
262,0.0,16.00,7.6500


In [52]:
clf = DecisionTreeClassifier()
clf.fit(X_train_trf, y_train)
y_pred2 = clf.predict(X_test_trf)
accuracy_score(y_test, y_pred2)

0.6268656716417911

In [53]:
X_trf = trf.fit_transform(X)

## Cross validation is a technique used in machine learning to evaluate the performance of a model on unseen data.

In [54]:
np.mean(cross_val_score(DecisionTreeClassifier(), X_trf,y,cv=10,scoring='accuracy'))

0.6132798573975045