<h2>Titanic Survival Prediction</h2>

In [162]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import joblib

In [163]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [164]:
drop_list = ['sibsp','parch','embarked','class','fare','who','adult_male','deck','alive']

In [165]:
df.drop(drop_list,inplace=True,axis=1)

In [166]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   pclass       891 non-null    int64  
 2   sex          891 non-null    object 
 3   age          714 non-null    float64
 4   embark_town  889 non-null    object 
 5   alone        891 non-null    bool   
dtypes: bool(1), float64(1), int64(2), object(2)
memory usage: 35.8+ KB


In [167]:
df.isna().sum()

survived         0
pclass           0
sex              0
age            177
embark_town      2
alone            0
dtype: int64

In [168]:
df.dropna(inplace=True)

In [169]:
df.isna().sum()

survived       0
pclass         0
sex            0
age            0
embark_town    0
alone          0
dtype: int64

In [170]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 712 entries, 0 to 890
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     712 non-null    int64  
 1   pclass       712 non-null    int64  
 2   sex          712 non-null    object 
 3   age          712 non-null    float64
 4   embark_town  712 non-null    object 
 5   alone        712 non-null    bool   
dtypes: bool(1), float64(1), int64(2), object(2)
memory usage: 34.1+ KB


In [171]:
df.survived.value_counts()

survived
0    424
1    288
Name: count, dtype: int64

In [172]:
df.sex.value_counts()

sex
male      453
female    259
Name: count, dtype: int64

In [173]:
df.head()

Unnamed: 0,survived,pclass,sex,age,embark_town,alone
0,0,3,male,22.0,Southampton,False
1,1,1,female,38.0,Cherbourg,False
2,1,3,female,26.0,Southampton,True
3,1,1,female,35.0,Southampton,False
4,0,3,male,35.0,Southampton,True


In [174]:
sex_dt = {'male':1,'female':0}

In [175]:
df['sex'] = df['sex'].map(sex_dt)

In [176]:
df.head()

Unnamed: 0,survived,pclass,sex,age,embark_town,alone
0,0,3,1,22.0,Southampton,False
1,1,1,0,38.0,Cherbourg,False
2,1,3,0,26.0,Southampton,True
3,1,1,0,35.0,Southampton,False
4,0,3,1,35.0,Southampton,True


In [177]:
df.embark_town.value_counts()

embark_town
Southampton    554
Cherbourg      130
Queenstown      28
Name: count, dtype: int64

In [178]:
embark_town_dt = {'Southampton':1,
                  'Cherbourg':2,
                  'Queenstown':3}

In [179]:
df.embark_town = df.embark_town.map(embark_town_dt)

In [180]:
df.head()

Unnamed: 0,survived,pclass,sex,age,embark_town,alone
0,0,3,1,22.0,1,False
1,1,1,0,38.0,2,False
2,1,3,0,26.0,1,True
3,1,1,0,35.0,1,False
4,0,3,1,35.0,1,True


In [181]:
alone_dt = {True:1,
            False:0}

df.alone = df.alone.map(alone_dt)

In [182]:
df.head()

Unnamed: 0,survived,pclass,sex,age,embark_town,alone
0,0,3,1,22.0,1,0
1,1,1,0,38.0,2,0
2,1,3,0,26.0,1,1
3,1,1,0,35.0,1,0
4,0,3,1,35.0,1,1


In [183]:
x = df.drop('survived',axis=1)
y = df.survived

In [184]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.15)

In [185]:
lgr = LogisticRegression()
lgr.fit(x_train,y_train)
print("Logistic Regressor :")
print("Train :",lgr.score(x_train,y_train)*100)
print("Test :",lgr.score(x_test,y_test)*100)


rfc = RandomForestClassifier(n_estimators=250)
rfc.fit(x_train,y_train)
print("Random Forest :")
print("Train :",rfc.score(x_train,y_train)*100)
print("Test :",rfc.score(x_test,y_test)*100)

dtc = DecisionTreeClassifier()
dtc.fit(x_train,y_train)
print("Decision Tree :")
print("Train :",dtc.score(x_train,y_train)*100)
print("Test :",dtc.score(x_test,y_test)*100)

Logistic Regressor :
Train : 79.33884297520662
Test : 77.57009345794393


Random Forest :
Train : 92.89256198347108
Test : 80.37383177570094
Decision Tree :
Train : 92.89256198347108
Test : 79.43925233644859


In [186]:
joblib.dump(dtc,'model.lb')

['model.lb']

In [187]:
model = joblib.load('model.lb')

In [188]:
model.predict([[1,0,8.0,1,0]])[0]



1