In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB   #import Gaussian Bayes modeling function
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [2]:
filepath = r"datasets\titanic.xls"   

Titanic_df = pd.read_excel(filepath)
Titanic_df.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [3]:
Titanic_df.count()

pclass       1309
survived     1309
name         1309
sex          1309
age          1046
sibsp        1309
parch        1309
ticket       1309
fare         1308
cabin         295
embarked     1307
boat          486
body          121
home.dest     745
dtype: int64

In [4]:
#drop columns not important

modeldf = Titanic_df.drop(['name','ticket','fare','cabin','boat','body','home.dest'],axis = 1)

In [5]:
modeldf.head()

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,embarked
0,1,1,female,29.0,0,0,S
1,1,1,male,0.9167,1,2,S
2,1,0,female,2.0,1,2,S
3,1,0,male,30.0,1,2,S
4,1,0,female,25.0,1,2,S


In [6]:
#convert the 'sex' column into binary
modeldf['sex'] = modeldf['sex'].map({'female':0,'male':1})

In [7]:
#merge 'sibsp' and 'parch' columns
modeldf['family_num'] = modeldf['sibsp'] + modeldf['parch']
modeldf.drop(['sibsp','parch'], axis=1, inplace=True)

In [8]:
#travel alone:0; travel with sb:1
modeldf['travelalone'] = np.where((modeldf['family_num']>0),0,1)
modeldf.head()

Unnamed: 0,pclass,survived,sex,age,embarked,family_num,travelalone
0,1,1,0,29.0,S,0,1
1,1,1,1,0.9167,S,3,0
2,1,0,0,2.0,S,3,0
3,1,0,1,30.0,S,3,0
4,1,0,0,25.0,S,3,0


In [9]:
modeldf.count()

pclass         1309
survived       1309
sex            1309
age            1046
embarked       1307
family_num     1309
travelalone    1309
dtype: int64

In [10]:
modeldf.groupby(['pclass','survived','sex'])['age'].mean()

pclass  survived  sex
1       0         0      35.200000
                  1      43.658163
        1         0      37.109375
                  1      36.168240
2       0         0      34.090909
                  1      33.092593
        1         0      26.711051
                  1      17.449274
3       0         0      23.418750
                  1      26.679598
        1         0      20.814815
                  1      22.436441
Name: age, dtype: float64

In [11]:
modeldf['age'].fillna(modeldf.groupby(['pclass','survived','sex'])['age'].transform('mean'), inplace = True)

In [12]:
modeldf['embarked'].value_counts()

S    914
C    270
Q    123
Name: embarked, dtype: int64

In [13]:
#don't forget 'inplace=True'
modeldf['embarked'].fillna('S', inplace=True)

In [14]:
modeldf.count()

pclass         1309
survived       1309
sex            1309
age            1309
embarked       1309
family_num     1309
travelalone    1309
dtype: int64

In [15]:
modeldf.corr()

Unnamed: 0,pclass,survived,sex,age,family_num,travelalone
pclass,1.0,-0.312469,0.124617,-0.444002,0.050027,0.147393
survived,-0.312469,1.0,-0.528693,-0.060032,0.026876,-0.201719
sex,0.124617,-0.528693,1.0,0.080752,-0.188583,0.284537
age,-0.444002,-0.060032,0.080752,1.0,-0.206087,0.116266
family_num,0.050027,0.026876,-0.188583,-0.206087,1.0,-0.688864
travelalone,0.147393,-0.201719,0.284537,0.116266,-0.688864,1.0


In [16]:
modeldf = pd.get_dummies(data=modeldf, columns=['pclass','embarked'])
modeldf.head()

Unnamed: 0,survived,sex,age,family_num,travelalone,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
0,1,0,29.0,0,1,1,0,0,0,0,1
1,1,1,0.9167,3,0,1,0,0,0,0,1
2,0,0,2.0,3,0,1,0,0,0,0,1
3,0,1,30.0,3,0,1,0,0,0,0,1
4,0,0,25.0,3,0,1,0,0,0,0,1


In [17]:
modeldf.corr()

Unnamed: 0,survived,sex,age,family_num,travelalone,pclass_1,pclass_2,pclass_3,embarked_C,embarked_Q,embarked_S
survived,1.0,-0.528693,-0.060032,0.026876,-0.201719,0.279449,0.05079,-0.283428,0.182123,-0.016071,-0.150542
sex,-0.528693,1.0,0.080752,-0.188583,0.284537,-0.107371,-0.028862,0.116562,-0.066564,-0.088651,0.115193
age,-0.060032,0.080752,1.0,-0.206087,0.116266,0.428501,0.005843,-0.375549,0.082706,-0.085716,-0.018446
family_num,0.026876,-0.188583,-0.206087,1.0,-0.688864,-0.029656,-0.039976,0.05843,-0.036553,-0.08719,0.087771
travelalone,-0.201719,0.284537,0.116266,-0.688864,1.0,-0.126551,-0.035075,0.13825,-0.107874,0.127214,0.014246
pclass_1,0.279449,-0.107371,0.428501,-0.029656,-0.126551,1.0,-0.296526,-0.622172,0.325722,-0.166101,-0.1818
pclass_2,0.05079,-0.028862,0.005843,-0.039976,-0.035075,-0.296526,1.0,-0.56318,-0.134675,-0.121973,0.196532
pclass_3,-0.283428,0.116562,-0.375549,0.05843,0.13825,-0.622172,-0.56318,1.0,-0.17143,0.243706,-0.003805
embarked_C,0.182123,-0.066564,0.082706,-0.036553,-0.107874,0.325722,-0.134675,-0.17143,1.0,-0.164166,-0.778262
embarked_Q,-0.016071,-0.088651,-0.085716,-0.08719,0.127214,-0.166101,-0.121973,0.243706,-0.164166,1.0,-0.491656


In [18]:
#dataframe with predicting features
X = modeldf.drop('survived', axis=1)

#column of predictive target values
y = modeldf['survived']

In [19]:
#create training and test data
#will leave test size at 20%
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=15)

## Gaussian Naïve Bayes

In [20]:
#initialize Gaussian Bayes classifier
gnb = GaussianNB()

In [21]:
#train the model to learn trends
gnb.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [22]:
#predictive score of the model on the training data
gnb.score(X_train, y_train)

0.766953199617956

In [23]:
#test the model on unseen data
#score predictive values in variable
y_pred = gnb.predict(X_test)

In [24]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survived', 'Predicted Survived'],
    index=['True Not Survived', 'True Survived']
)

cm

Unnamed: 0,Predicted Not Survived,Predicted Survived
True Not Survived,134,33
True Survived,29,66


In [25]:
#frequency of passed students to failed students in the test dataset
y_test.value_counts()

0    167
1     95
Name: survived, dtype: int64

In [26]:
#predictive score of the model on the test data
gnb.score(X_test, y_test)

0.7633587786259542

In [27]:
#predictive score of the model for each predictive category
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       167
           1       0.67      0.69      0.68        95

   micro avg       0.76      0.76      0.76       262
   macro avg       0.74      0.75      0.75       262
weighted avg       0.77      0.76      0.76       262



## Bernoulli's Naïve Bayes

In [28]:
#import Bernoulli Naïve Bayes function from scikit-learn library
from sklearn.naive_bayes import BernoulliNB

In [29]:
#initialize Bernoulli Naïve Bayes function to a variable
bnb = BernoulliNB()

In [30]:
#build the model with training data
bnb.fit(X_train, y_train)

BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)

In [31]:
#model's predictive score on the training data
bnb.score(X_train, y_train)

0.7554918815663801

In [32]:
#test the model on unseen data
#score predictive values in variable
y_pred = gnb.predict(X_test)

In [33]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survived', 'Predicted Survived'],
    index=['True Not Survived', 'True Survived']
)

cm

Unnamed: 0,Predicted Not Survived,Predicted Survived
True Not Survived,134,33
True Survived,29,66


In [34]:
#predictive score of the model on the test data
gnb.score(X_test, y_test)

0.7633587786259542

In [35]:
#predictive score of the model for each predictive category
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81       167
           1       0.67      0.69      0.68        95

   micro avg       0.76      0.76      0.76       262
   macro avg       0.74      0.75      0.75       262
weighted avg       0.77      0.76      0.76       262



## Logistic Regression

In [36]:
#build logistic regression model
LogReg = LogisticRegression()
LogReg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [37]:
#accuracy score of model using training data
LogReg.score(X_train, y_train)

0.7994269340974212

In [38]:
#generate prediction values
y_pred = LogReg.predict(X_test)

In [39]:
#Confusion matrix shows which values model predicted correctly vs incorrectly

cm = pd.DataFrame(
    confusion_matrix(y_test, y_pred),
    columns=['Predicted Not Survival', 'Predicted Survival'],
    index=['True Not Survival', 'True Survival']
)

cm

Unnamed: 0,Predicted Not Survival,Predicted Survival
True Not Survival,139,28
True Survival,28,67


In [40]:
#accuracy score of model on test data
LogReg.score(X_test, y_test)

0.7862595419847328

In [41]:
#from precision column, model is better at predicting passengers that do not survive
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83       167
           1       0.71      0.71      0.71        95

   micro avg       0.79      0.79      0.79       262
   macro avg       0.77      0.77      0.77       262
weighted avg       0.79      0.79      0.79       262



Conclusion:
Based on the results from Gaussian and Bernoulli Naive Bayes models and Logistic Regression, there is no big difference. Gaussian and Bernoulli Naive Bayes models have the exactly same results. Logistic Regression is a little better than them.