In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score

In [2]:
# Read the passenger table from the CSV file next to the notebook
# and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='titanic.csv')
df.head(5)

Unnamed: 0,PassengerId,Name,Pclass,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
# Columns that are not used by the rest of the notebook.
unused_columns = ['Name', 'PassengerId', 'SibSp', 'Parch', 'Cabin', 'Embarked', 'Ticket']

# Reassign instead of dropping in place, and ignore columns that are already
# gone, so this cell can be re-run without raising KeyError.
df = df.drop(columns=unused_columns, errors='ignore')
df.head()

Unnamed: 0,Pclass,Gender,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [4]:
# One-hot encode Gender; drop_first=True removes the first category's
# indicator column so only the remaining indicator column(s) are kept.
gender_column = df['Gender']
dummies = pd.get_dummies(gender_column, drop_first=True)
dummies

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [5]:
# Overwrite the Gender column with its indicator values.
# Take the indicator column as a Series (rather than assigning the whole
# DataFrame) and cast it to int so the encoding is 0/1 regardless of the
# dtype get_dummies produced (newer pandas versions return booleans).
gender_indicator = dummies.iloc[:, 0].astype(int)
df['Gender'] = gender_indicator
df

Unnamed: 0,Pclass,Gender,Age,Fare,Survived
0,3,1,22.0,7.2500,0
1,1,0,38.0,71.2833,1
2,3,0,26.0,7.9250,1
3,1,0,35.0,53.1000,1
4,3,1,35.0,8.0500,0
...,...,...,...,...,...
886,2,1,27.0,13.0000,0
887,1,0,19.0,30.0000,1
888,3,0,,23.4500,0
889,1,1,26.0,30.0000,1


In [6]:
# Count the missing values in each column.
missing_counts = df.isna().sum()
missing_counts

Pclass        0
Gender        0
Age         177
Fare          0
Survived      0
dtype: int64

In [7]:
# List the distinct Age values (NaN shows up as one of them if present).
age_values = df['Age'].unique()
age_values

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [8]:
# Replace missing ages with the mean of the observed ages.
# NOTE(review): the mean is taken over every row of df, i.e. before the
# train/test split performed later, so test rows influence the fill value.
# Consider computing it from the training rows only.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean)
df['Age'].unique()

array([22.        , 38.        , 26.        , 35.        , 29.69911765,
       54.        ,  2.        , 27.        , 14.        ,  4.        ,
       58.        , 20.        , 39.        , 55.        , 31.        ,
       34.        , 15.        , 28.        ,  8.        , 19.        ,
       40.        , 66.        , 42.        , 21.        , 18.        ,
        3.        ,  7.        , 49.        , 29.        , 65.        ,
       28.5       ,  5.        , 11.        , 45.        , 17.        ,
       32.        , 16.        , 25.        ,  0.83      , 30.        ,
       33.        , 23.        , 24.        , 46.        , 59.        ,
       71.        , 37.        , 47.        , 14.5       , 70.5       ,
       32.5       , 12.        ,  9.        , 36.5       , 51.        ,
       55.5       , 40.5       , 44.        ,  1.        , 61.        ,
       56.        , 50.        , 36.        , 45.5       , 20.5       ,
       62.        , 41.        , 52.        , 63.        , 23.5 

In [9]:
# Fit a mean imputer on every column of df and print the imputed array.
# NOTE(review): `modifiedata` is not referenced by any later cell visible
# here, and the imputer is fitted on the target column as well — confirm
# whether this cell is still needed.
imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df)
modifiedata = imp.transform(df)
print(modifiedata)

[[ 3.          1.         22.          7.25        0.        ]
 [ 1.          0.         38.         71.2833      1.        ]
 [ 3.          0.         26.          7.925       1.        ]
 ...
 [ 3.          0.         29.69911765 23.45        0.        ]
 [ 1.          1.         26.         30.          1.        ]
 [ 3.          1.         32.          7.75        0.        ]]


In [10]:
# Re-check the per-column count of missing values in df.
remaining_missing = df.isna().sum()
remaining_missing

Pclass      0
Gender      0
Age         0
Fare        0
Survived    0
dtype: int64

In [11]:
# Separate the features from the target.
# Select by column name rather than by position so the split does not
# silently depend on 'Survived' being the last column of df.
x = df.drop(columns='Survived')
y = df['Survived']
x

Unnamed: 0,Pclass,Gender,Age,Fare
0,3,1,22.000000,7.2500
1,1,0,38.000000,71.2833
2,3,0,26.000000,7.9250
3,1,0,35.000000,53.1000
4,3,1,35.000000,8.0500
...,...,...,...,...
886,2,1,27.000000,13.0000
887,1,0,19.000000,30.0000
888,3,0,29.699118,23.4500
889,1,1,26.000000,30.0000


In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training features only, then apply
# that same fitted scaler to both the training and the test features.
scaler = StandardScaler()
scaler.fit(x_train)
x_train, x_test = scaler.transform(x_train), scaler.transform(x_test)

In [14]:
# Train a Gini-criterion decision tree and predict on the held-out features.
# random_state is pinned because the tree's split search involves randomness;
# without a seed the fitted tree (and the accuracy reported below) can vary
# between runs. The value matches the seed used for the train/test split.
clf = DecisionTreeClassifier(criterion="gini", random_state=42)
clf = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

In [15]:
# Share of test rows predicted correctly, expressed as a percentage.
fraction_correct = accuracy_score(y_test, y_pred)
acc = fraction_correct * 100
acc

74.30167597765363