In [17]:
#importing the necessary libraries

import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
#Getting the shape of the dataset

df.shape

(891, 15)

In [6]:
#Checking for null values

df.isna().sum()

survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [7]:
#Choosing the columns to work with

df= df[['survived','sex','age','sibsp','class','who','adult_male','embark_town','alive']]
df.head()

Unnamed: 0,survived,sex,age,sibsp,class,who,adult_male,embark_town,alive
0,0,male,22.0,1,Third,man,True,Southampton,no
1,1,female,38.0,1,First,woman,False,Cherbourg,yes
2,1,female,26.0,0,Third,woman,False,Southampton,yes
3,1,female,35.0,1,First,woman,False,Southampton,yes
4,0,male,35.0,0,Third,man,True,Southampton,no


In [9]:
#Filling the null values with the median age

df['age'].fillna(df['age'].median(), inplace=True)

df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df['age'].median(), inplace=True)


survived       0
sex            0
age            0
sibsp          0
class          0
who            0
adult_male     0
embark_town    2
alive          0
dtype: int64

In [10]:
#Finding the mode of embark town

df['embark_town'].mode()

0    Southampton
Name: embark_town, dtype: object

In [11]:
#Filling the null values in the embark town column with the mode

df['embark_town'].fillna(df['embark_town'].mode()[0], inplace=True)

df.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['embark_town'].fillna(df['embark_town'].mode()[0], inplace=True)


survived       0
sex            0
age            0
sibsp          0
class          0
who            0
adult_male     0
embark_town    0
alive          0
dtype: int64

In [18]:
#Machine learning modelling

df['sex'] = df['sex'].astype('category').cat.codes #Encoding the categorical variables
df['class'] = df['class'].astype('category').cat.codes
df['who'] = df['who'].astype('category').cat.codes
df['adult_male'] = df['adult_male'].astype('category').cat.codes
df['embark_town'] = df['embark_town'].astype('category').cat.codes
df['alive'] = df['alive'].astype('category').cat.codes


#Feature engineering

x = df[['survived','sex','age','sibsp','class','who','adult_male','embark_town']]
y = df['alive']


#Splitting the dataset

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=42,test_size=0.2)


#Geting the model

model = DecisionTreeClassifier()


#Fitting and training the model

model.fit(x_train, y_train)


#Making predictions

y_predict = model.predict(x_test)


#Obtaining the accuracy score

accuracy = accuracy_score(y_predict, y_test)
print('Accuracy of the model = ', accuracy)

#Getting a confusion matrix

matrix = confusion_matrix(y_predict, y_test)
print('\nThe confusion matrix of the model is: \n',matrix)

Accuracy of the model =  1.0

The confusion matrix of the model is: 
 [[105   0]
 [  0  74]]


The model has a perfect accuracy