In [1]:
import numpy as np
import pandas as pd

In [2]:
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import KBinsDiscretizer

In [3]:
df=pd.read_csv('Titanic-Dataset.csv',usecols=['Age','Fare','Survived'])

In [4]:
df.head()

Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


In [5]:
df.isnull().sum()

Survived      0
Age         177
Fare          0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Age       714 non-null    float64
 2   Fare      891 non-null    float64
dtypes: float64(2), int64(1)
memory usage: 21.0 KB


In [9]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


In [10]:
df.isnull().sum()

Survived    0
Age         0
Fare        0
dtype: int64

In [13]:
X=df.iloc[:,1:]
Y=df.iloc[:,0]

In [17]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [18]:
clf=DecisionTreeClassifier()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
accuracy_score(y_test,y_pred)

0.664804469273743

In [19]:
np.mean(cross_val_score(clf,X,Y,cv=10,scoring='accuracy'))

0.6588389513108615

In [22]:
kbin_age= KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')
kbin_fare=KBinsDiscretizer(n_bins=10,encode='ordinal',strategy='quantile')

In [24]:
trf=ColumnTransformer(transformers=[('age',kbin_age,[0]),('Fare',kbin_fare,[1])],remainder='passthrough')

In [25]:
x_train_tf=trf.fit_transform(x_train)
x_test_tf=trf.transform(x_test)



In [27]:
trf.named_transformers_['age'].n_bins_

array([9])

In [28]:
trf.named_transformers_['Fare'].n_bins_

array([10])

In [29]:
trf.named_transformers_['Fare'].bin_edges_

array([array([  0.     ,   7.55   ,   7.88916,   8.05   ,  10.5    ,  14.4542 ,
               21.045  ,  26.55   ,  39.6875 ,  77.2875 , 512.3292 ])          ],
      dtype=object)

In [30]:
output=pd.DataFrame({'age':x_train['Age'],'new_age':x_train_tf[:,0],'fare':x_train['Fare'],'new_fair':x_train_tf[:,-1]})

In [34]:
output['age_labels'] = pd.cut(x=x_train['Age'],
                                    bins=trf.named_transformers_['age'].bin_edges_[0].tolist())
output['fare_labels'] = pd.cut(x=x_train['Fare'],
                                    bins=trf.named_transformers_['Fare'].bin_edges_[0].tolist())

In [35]:
output.sample(5)

Unnamed: 0,age,new_age,fare,new_fair,age_labels,fare_labels
118,24.0,3.0,247.5208,9.0,"(21.0, 24.0]","(77.288, 512.329]"
811,39.0,7.0,24.15,6.0,"(38.0, 47.0]","(21.045, 26.55]"
413,29.699118,5.0,0.0,0.0,"(28.0, 29.699]",
423,28.0,4.0,14.4,4.0,"(24.0, 28.0]","(10.5, 14.454]"
467,56.0,8.0,26.55,7.0,"(47.0, 80.0]","(21.045, 26.55]"


In [32]:
clf=DecisionTreeClassifier()
clf.fit(x_train_tf,y_train)
y_pred=clf.predict(x_test_tf)
accuracy_score(y_test,y_pred)

0.6871508379888268

In [36]:
df1=pd.read_csv('Titanic-Dataset.csv',usecols=['Age','Fare','SibSp','Parch','Survived'])

In [37]:
df1.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare
0,0,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,1,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,0,35.0,0,0,8.05


In [38]:
df.dropna(inplace=True)

In [40]:
df1['family']=df1['SibSp']+df1['Parch']

In [41]:
df1=df1.drop(['SibSp','Parch'],axis=1)

In [42]:
df1.head()

Unnamed: 0,Survived,Age,Fare,family
0,0,22.0,7.25,1
1,1,38.0,71.2833,1
2,1,26.0,7.925,0
3,1,35.0,53.1,1
4,0,35.0,8.05,0


In [48]:
X=df1.iloc[:,1:]
Y=df1.iloc[:,0]
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.2,random_state=42)

In [44]:
from sklearn.preprocessing import Binarizer

In [45]:
trf=ColumnTransformer(transformers=[('bin',Binarizer(copy=False),['family'])],remainder='passthrough')

In [49]:
x_train_t=trf.fit_transform(x_train)
x_test_t=trf.transform(x_test)

In [54]:
pd.DataFrame(x_train_t,columns=['family','age','fare'])

Unnamed: 0,family,age,fare
0,0.0,45.5,28.5000
1,0.0,23.0,13.0000
2,0.0,32.0,7.9250
3,1.0,26.0,7.8542
4,1.0,6.0,31.2750
...,...,...,...
707,0.0,21.0,7.6500
708,0.0,,31.0000
709,1.0,41.0,14.1083
710,1.0,14.0,120.0000
