In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('titanic.csv')
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [3]:
df.drop(['Name','PassengerId','SibSp','Parch','Cabin','Embarked','Ticket'],axis=1,inplace=True)
df.head()

Unnamed: 0,Pclass,Gender,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [4]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1]
x

Unnamed: 0,Pclass,Gender,Age,Fare
0,3,male,22.0,7.2500
1,1,female,38.0,71.2833
2,3,female,26.0,7.9250
3,1,female,35.0,53.1000
4,3,male,35.0,8.0500
...,...,...,...,...
886,2,male,27.0,13.0000
887,1,female,19.0,30.0000
888,3,female,,23.4500
889,1,male,26.0,30.0000


In [5]:
dummies = pd.get_dummies(x.Gender,drop_first=True)
dummies

Unnamed: 0,male
0,1
1,0
2,0
3,0
4,1
...,...
886,1
887,0
888,0
889,1


In [6]:
x = pd.concat([x,dummies],axis='columns')
x

Unnamed: 0,Pclass,Gender,Age,Fare,male
0,3,male,22.0,7.2500,1
1,1,female,38.0,71.2833,0
2,3,female,26.0,7.9250,0
3,1,female,35.0,53.1000,0
4,3,male,35.0,8.0500,1
...,...,...,...,...,...
886,2,male,27.0,13.0000,1
887,1,female,19.0,30.0000,0
888,3,female,,23.4500,0
889,1,male,26.0,30.0000,1


In [7]:
x.drop('Gender',axis='columns',inplace=True)
x

Unnamed: 0,Pclass,Age,Fare,male
0,3,22.0,7.2500,1
1,1,38.0,71.2833,0
2,3,26.0,7.9250,0
3,1,35.0,53.1000,0
4,3,35.0,8.0500,1
...,...,...,...,...
886,2,27.0,13.0000,1
887,1,19.0,30.0000,0
888,3,,23.4500,0
889,1,26.0,30.0000,1


In [8]:
x.isna().sum()

Pclass      0
Age       177
Fare        0
male        0
dtype: int64

In [9]:
x['Age'].unique()

array([22.  , 38.  , 26.  , 35.  ,   nan, 54.  ,  2.  , 27.  , 14.  ,
        4.  , 58.  , 20.  , 39.  , 55.  , 31.  , 34.  , 15.  , 28.  ,
        8.  , 19.  , 40.  , 66.  , 42.  , 21.  , 18.  ,  3.  ,  7.  ,
       49.  , 29.  , 65.  , 28.5 ,  5.  , 11.  , 45.  , 17.  , 32.  ,
       16.  , 25.  ,  0.83, 30.  , 33.  , 23.  , 24.  , 46.  , 59.  ,
       71.  , 37.  , 47.  , 14.5 , 70.5 , 32.5 , 12.  ,  9.  , 36.5 ,
       51.  , 55.5 , 40.5 , 44.  ,  1.  , 61.  , 56.  , 50.  , 36.  ,
       45.5 , 20.5 , 62.  , 41.  , 52.  , 63.  , 23.5 ,  0.92, 43.  ,
       60.  , 10.  , 64.  , 13.  , 48.  ,  0.75, 53.  , 57.  , 80.  ,
       70.  , 24.5 ,  6.  ,  0.67, 30.5 ,  0.42, 34.5 , 74.  ])

In [10]:
x['Age'] = x['Age'].fillna(x["Age"].mean())
x['Age'].unique

<bound method Series.unique of 0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64>

In [11]:
x['Age']

0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64

In [12]:
X_train,X_test,Y_train,Y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [13]:
model = LogisticRegression(solver='liblinear')
model.fit(X_train,Y_train)

In [14]:
y_pred = model.predict(X_test)
acc = accuracy_score(y_pred,Y_test)*100 
print("Accuracy Score:",acc)

Accuracy Score: 78.2122905027933


In [26]:
##########################################################################################

In [27]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()

In [28]:
model.fit(X_train,Y_train)
model.score(X_test,Y_test)
y_pred = model.predict(X_test)
model.predict(X_test[:4])

array([0, 0, 0, 1], dtype=int64)

In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(GaussianNB(),X_train,Y_train,cv=5)

array([0.8041958 , 0.81818182, 0.73943662, 0.73239437, 0.80985915])