In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [33]:
# Load the Titanic passenger table; the relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(filepath_or_buffer="titanic.csv")

In [34]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [59]:
# Print column names, non-null counts and dtypes for the current frame.
# NOTE(review): the saved output below lists only 5 columns with an int32 `Sex`,
# i.e. it was captured (as In [59]) after the drop/encode cells further down had
# already run. On a top-to-bottom run this cell summarises the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    int32  
 3   Age       714 non-null    float64
 4   Fare      891 non-null    float64
dtypes: float64(2), int32(1), int64(2)
memory usage: 31.4 KB


In [60]:
# List the distinct values of "Age" (NaN shows up as its own entry).
df.loc[:, "Age"].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

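The "Age" column has missing values (only 714 of the 891 rows are non-null, and `nan` appears among its unique values), so drawing inference from it would require imputing ages, i.e. making unverifiable assumptions. The column is therefore dropped.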

In [61]:
# Remove "Age" from the frame in place (raises KeyError if it is already gone,
# e.g. when this cell is run twice).
df.drop(columns=["Age"], inplace=True)

In [35]:
# Pairwise Pearson correlation of the numeric columns, used to judge how each
# feature relates to "Survived".
# NOTE(review): the saved output still contains an "Age" row/column, so it was
# produced before the Age-drop cell above was executed.
df.corr(method="pearson", numeric_only=True)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.005007,-0.035144,0.036847,-0.057527,-0.001652,0.012658
Survived,-0.005007,1.0,-0.338481,-0.077221,-0.035322,0.081629,0.257307
Pclass,-0.035144,-0.338481,1.0,-0.369226,0.083081,0.018443,-0.5495
Age,0.036847,-0.077221,-0.369226,1.0,-0.308247,-0.189119,0.096067
SibSp,-0.057527,-0.035322,0.083081,-0.308247,1.0,0.414838,0.159651
Parch,-0.001652,0.081629,0.018443,-0.189119,0.414838,1.0,0.216225
Fare,0.012658,0.257307,-0.5495,0.096067,0.159651,0.216225,1.0


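The "PassengerId", "Parch" and "SibSp" columns show almost no correlation with the target column "Survived" (-0.005, 0.082 and -0.035 respectively), so they are dropped. "Pclass" is kept: at -0.338 it has the strongest correlation with "Survived" in the table.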

In [36]:
# Drop, in place, the three columns judged uninformative from the correlation table.
df.drop(columns=["PassengerId", "Parch", "SibSp"], inplace=True)

In [37]:
# Count the distinct ticket identifiers (missing values are not counted).
df["Ticket"].nunique(dropna=True)

681

With 681 unique values across 891 rows, the "Ticket" column has too many distinct values to draw meaningful inference from all of them.

In [38]:
from sklearn.preprocessing import LabelEncoder

In [39]:
# Create the encoder instance that the later `fit_transform` cells call.
# NOTE(review): this rebinds the name `LabelEncoder` from the imported class to
# an instance, so re-running this cell without first re-running the import cell
# fails (the instance is not callable). Renaming the instance would be cleaner,
# but every later cell that calls `LabelEncoder.fit_transform` must change with it.
LabelEncoder = LabelEncoder()

In [40]:
# Integer-encode the "Embarked" categories so the column can be passed to `.corr()`.
# NOTE(review): the resulting codes are labels without a meaningful order, so a
# correlation computed on them is only a rough signal. Also confirm how missing
# "Embarked" values (if any) are encoded.
l_embarked = LabelEncoder.fit_transform(df["Embarked"])

In [41]:
# Attach the encoded port codes as a temporary column for the correlation check.
df["L_embarked"] = l_embarked

In [48]:
# Preview the first five rows of the current frame.
# NOTE(review): the saved output below was captured (as In [48]) after the later
# drop/encode cells had run, so it does not match the frame at this point of a
# top-to-bottom run.
df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,Fare
0,0,3,1,22.0,7.25
1,1,1,0,38.0,71.2833
2,1,3,0,26.0,7.925
3,1,1,0,35.0,53.1
4,0,3,1,35.0,8.05


In [43]:
# Pearson correlation between the target and the encoded embarkation port.
df.loc[:, ["Survived", "L_embarked"]].corr(method="pearson")

Unnamed: 0,Survived,L_embarked
Survived,1.0,-0.163517
L_embarked,-0.163517,1.0


The "Embarked" column also shows only a weak correlation (-0.16) with the "Survived" column, so it is dropped as well.

In [44]:
# Remove both the original port column and its temporary encoded copy, in place.
df.drop(columns=["Embarked", "L_embarked"], inplace=True)

In [45]:
# Replace the textual "Sex" values with integer codes; per the saved preview
# output, "female" becomes 0 and "male" becomes 1.
sex_codes = LabelEncoder.fit_transform(df["Sex"])
df["Sex"] = sex_codes

In [46]:
# Drop the free-text passenger name column in place.
df.drop(columns=["Name"], inplace=True)

In [47]:
# Drop the remaining identifier-like columns ("Ticket" and "Cabin") in place.
df.drop(columns=["Ticket", "Cabin"], inplace=True)

In [57]:
# Flag, per column, whether any value is missing.
# NOTE(review): the saved output still lists "Age", so it was captured before the
# Age-drop cell (placed earlier in the file) was executed.
df.isnull().any(axis=0)

Survived    False
Pclass      False
Sex         False
Age          True
Fare        False
dtype: bool

In [49]:
from sklearn.model_selection import train_test_split

In [62]:
# Target vector is the survival flag; the feature matrix is every other column
# still present in the frame.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [63]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=101,
)

In [53]:
from sklearn import tree

In [54]:
# Build the decision-tree classifier with a fixed seed. scikit-learn randomly
# permutes the candidate features at each split, so without `random_state` ties
# between equally good splits can be broken differently on each run and the
# reported accuracy would not be reproducible. 101 matches the seed used for the
# train/test split.
model = tree.DecisionTreeClassifier(random_state=101)

In [64]:
# Train the tree on the training partition; `fit` returns the estimator, which
# the notebook displays as the cell output.
model.fit(X=X_train, y=y_train)

In [68]:
# Evaluate on the held-out partition; for a classifier `score` is the mean accuracy.
model.score(X=X_test, y=y_test)

0.7932203389830509