# Assignment for Day-25

## Apply Naive Bayes Algorithms to Titanic DataSet

Construct the naive bayes for the titanic dataset with the following dependent variables:

___Survived, Pclass, Sex, SibSp, Parch & Embarked___

__Import Libraries__

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

## Load the dataset train.csv

In [2]:
DS_PATH="./datasets/train.csv"
def load_titanic_dataset(path=DS_PATH):
    return pd.read_csv(path)

In [3]:
titanic_ds = load_titanic_dataset()
titanic_ds.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
titanic_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.5+ KB


## Drop Name and PassengerId from the dataset

In [5]:
unwanted_cols=["PassengerId", "Name", "Cabin"]
titanic_ds.drop(unwanted_cols, axis=1, inplace=True)
titanic_ds.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,0,3,male,22.0,1,0,A/5 21171,7.25,S
1,1,1,female,38.0,1,0,PC 17599,71.2833,C
2,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,S
3,1,1,female,35.0,1,0,113803,53.1,S
4,0,3,male,35.0,0,0,373450,8.05,S


In [6]:
titanic_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  889 non-null    int64  
 1   Pclass    889 non-null    int64  
 2   Sex       889 non-null    object 
 3   Age       889 non-null    float64
 4   SibSp     889 non-null    int64  
 5   Parch     889 non-null    int64  
 6   Ticket    889 non-null    object 
 7   Fare      889 non-null    float64
 8   Embarked  889 non-null    object 
dtypes: float64(2), int64(4), object(3)
memory usage: 62.6+ KB


In [7]:
titanic_ds.Sex.value_counts()

male      577
female    312
Name: Sex, dtype: int64

In [8]:
titanic_ds.Ticket.value_counts()

CA. 2343      7
347082        7
1601          7
CA 2144       6
347088        6
             ..
36963         1
2674          1
C.A. 17248    1
3474          1
PC 17595      1
Name: Ticket, Length: 680, dtype: int64

In [9]:
titanic_ds.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

## Define the pipeline for conversion

In [10]:
def prepare_titanic_pipeline(data, dv):
    cat_cols = ["Sex", "Ticket", "Embarked"]
    num_cols = ["Survived", "Pclass", "Age", "SibSp", "Parch", "Fare"]
    
    ## Remove the dv from the list
    if dv in cat_cols:
        cat_cols.remove(dv)
    elif dv in num_cols:
        num_cols.remove(dv)
    else:
        print("Is it valid Dependent Variable")
        return None
    
    np = Pipeline([
        ('std scaler', StandardScaler()), 
    ])
    
    fp = ColumnTransformer([
        ('num', np, num_cols),
        ('cat', OrdinalEncoder(), cat_cols),
    ])
    
    return fp.fit_transform(data)

## Routine to apply Naive Bayes

In [11]:
def SpecNBData(data, dv, nbType):
    if nbType == 'bermouli':
        model = BernoulliNB()
    elif nbType == 'gaussian':
        model = GaussianNB()
    else:
        print("Unsupport Naive Bayes Model Type: {0}".format(nbType))
        return None
        
    X = data.drop(dv, axis=1)
    y = data[dv]
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=2)
    
    clf = model.fit(prepare_titanic_pipeline(X_train, dv), y_train)
    y_pred = clf.predict(prepare_titanic_pipeline(X_test, dv))
    
    print("Accuracy score for dependent variable '{0}' on '{1}' NB Type:\n{2}".format(dv, nbType, accuracy_score(y_test, y_pred, normalize=True)))
    print("Confusion Matrix for dependent variable '{0}' on '{1}' NB Type:\n{2}".format(dv, nbType, confusion_matrix(y_test, y_pred)))

In [12]:
SpecNBData(titanic_ds, 'Survived', 'bermouli')

Accuracy score for dependent variable 'Survived' on 'bermouli' NB Type:
0.7696629213483146
Confusion Matrix for dependent variable 'Survived' on 'bermouli' NB Type:
[[91 20]
 [21 46]]


In [13]:
dv_list = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for dv in dv_list:
    SpecNBData(titanic_ds, dv, 'bermouli')
    print()

Accuracy score for dependent variable 'Survived' on 'bermouli' NB Type:
0.7696629213483146
Confusion Matrix for dependent variable 'Survived' on 'bermouli' NB Type:
[[91 20]
 [21 46]]

Accuracy score for dependent variable 'Pclass' on 'bermouli' NB Type:
0.6123595505617978
Confusion Matrix for dependent variable 'Pclass' on 'bermouli' NB Type:
[[31  0 11]
 [ 3  6 40]
 [ 8  7 72]]

Accuracy score for dependent variable 'Sex' on 'bermouli' NB Type:
0.7808988764044944
Confusion Matrix for dependent variable 'Sex' on 'bermouli' NB Type:
[[44 20]
 [19 95]]

Accuracy score for dependent variable 'SibSp' on 'bermouli' NB Type:
0.6629213483146067
Confusion Matrix for dependent variable 'SibSp' on 'bermouli' NB Type:
[[106  22   0   0   1   0   0]
 [ 19  12   0   0   2   0   0]
 [  3   4   0   0   1   0   0]
 [  1   1   0   0   2   0   0]
 [  0   1   0   0   0   0   0]
 [  0   0   0   0   1   0   0]
 [  0   2   0   0   0   0   0]]

Accuracy score for dependent variable 'Parch' on 'bermouli' NB 

In [14]:
dv_list = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']
for dv in dv_list:
    SpecNBData(titanic_ds, dv, 'gaussian')
    print()

Accuracy score for dependent variable 'Survived' on 'gaussian' NB Type:
0.7696629213483146
Confusion Matrix for dependent variable 'Survived' on 'gaussian' NB Type:
[[87 24]
 [17 50]]

Accuracy score for dependent variable 'Pclass' on 'gaussian' NB Type:
0.7359550561797753
Confusion Matrix for dependent variable 'Pclass' on 'gaussian' NB Type:
[[32  9  1]
 [ 4 39  6]
 [ 9 18 60]]

Accuracy score for dependent variable 'Sex' on 'gaussian' NB Type:
0.7359550561797753
Confusion Matrix for dependent variable 'Sex' on 'gaussian' NB Type:
[[39 25]
 [22 92]]

Accuracy score for dependent variable 'SibSp' on 'gaussian' NB Type:
0.6404494382022472
Confusion Matrix for dependent variable 'SibSp' on 'gaussian' NB Type:
[[103  12   2  12   0   0   0]
 [ 19   8   0   6   0   0   0]
 [  3   3   0   2   0   0   0]
 [  1   0   0   3   0   0   0]
 [  0   0   0   1   0   0   0]
 [  0   0   0   1   0   0   0]
 [  0   0   0   2   0   0   0]]

Accuracy score for dependent variable 'Parch' on 'gaussian' NB 