In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix

In [4]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    object 
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.5+ KB


#Encoding of categorical variables

In [6]:
enc = LabelEncoder()
df['Sex'] = enc.fit_transform(df.Sex)
df['Embarked'] = enc.fit_transform(df.Embarked)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 889 entries, 0 to 888
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  889 non-null    int64  
 1   Survived     889 non-null    int64  
 2   Pclass       889 non-null    int64  
 3   Name         889 non-null    object 
 4   Sex          889 non-null    int64  
 5   Age          889 non-null    float64
 6   SibSp        889 non-null    int64  
 7   Parch        889 non-null    int64  
 8   Ticket       889 non-null    object 
 9   Fare         889 non-null    float64
 10  Cabin        202 non-null    object 
 11  Embarked     889 non-null    int64  
dtypes: float64(2), int64(7), object(3)
memory usage: 83.5+ KB


In [7]:
df.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

#Data Cleaning

In [8]:
df = df.drop(columns = ['Cabin', 'PassengerId', 'Name', 'Ticket'])
df.columns

Index(['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
       'Embarked'],
      dtype='object')

# Model Function

In [10]:
clf=BernoulliNB()

def model(x, y, rstate):
    X_train, X_test, y_train, y_test=train_test_split(x, y,test_size=0.3, random_state=rstate)
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    acc = accuracy_score(y_test, y_pred, normalize=True)
    confm = confusion_matrix(y_test, y_pred)
    return [acc, confm]

#Splitting the data to feed into the model

In [11]:
rstate=-1
models = []

for enum, i in enumerate(df.columns[1:]):
    if i == 'Age' or i == 'Fare':
        continue
    y = df.iloc[:, enum+1:enum+2]
    x = df.drop(columns = y.columns)
    rstate += 1
    models.append([model(x, y, rstate), y.columns[0]])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


#Model Outcome

In [12]:
for _ in models:
    print(f'{_[1]} vs All\n')
    print(f'Accuracy: {_[0][0]}')
    print(f'Confusion Matrix: \n {_[0][1]}\n')
    print(f"Out of total 267 records only {np.trace(_[0][1])} are successfully classified, giving a model accuracy of {int(_[0][0]*100)}%.")
    print('\n-----------------------\n')

Pclass vs All

Accuracy: 0.5917602996254682
Confusion Matrix: 
 [[ 30   8  32]
 [  9  10  30]
 [ 24   6 118]]

Out of total 267 records only 158 are successfully classified, giving a model accuracy of 59%.

-----------------------

Sex vs All

Accuracy: 0.7602996254681648
Confusion Matrix: 
 [[ 48  49]
 [ 15 155]]

Out of total 267 records only 203 are successfully classified, giving a model accuracy of 76%.

-----------------------

SibSp vs All

Accuracy: 0.6891385767790262
Confusion Matrix: 
 [[167  25   0   0   0   0   0]
 [ 31  17   0   0   0   0   0]
 [  5   7   0   0   0   0   0]
 [  4   3   0   0   0   0   0]
 [  2   2   0   0   0   0   0]
 [  0   1   0   0   0   0   0]
 [  1   2   0   0   0   0   0]]

Out of total 267 records only 184 are successfully classified, giving a model accuracy of 68%.

-----------------------

Parch vs All

Accuracy: 0.7191011235955056
Confusion Matrix: 
 [[183  16   0   0   0]
 [ 34   9   0   0   0]
 [ 18   3   0   0   0]
 [  2   0   0   0   0]
 [  

#**Naive Bayes**

#Importing modules

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import * 
from sklearn.metrics import accuracy_score, confusion_matrix

**Training & Predicting Accuracy, Confusion Matrix of the model using Naive Bayes**
1. Naive Bayes is used to classify Record with the help of Probability
2. As we Know, Naive Bayes is only applicable for Categorical variables.
3. Only for Categorical Variables we are predicting Accuracy and Confusion Matrix :

Categorical Columns are: Survived, Pclass, Sex, SibSp, Parch, Embarked

#By Using Function:

In [20]:
def prediction(col):
    
    clf = BernoulliNB()

    for i in col:
        x = df.drop([i], axis=1)
        print("--------------------------------------------------------------------------------------")
        print(f"Here Dependent Variable is {i} & remaining are Independent Variables")
        print("x :\n", x.head())
        y = df[i]
        print("\ny:\n", y.head())
    
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size =0.3, random_state=0)
        model = clf.fit(x_train,y_train)
        pred = model.predict(x_test)
    
        print(f"\nAcuuracy for {i} is: ", accuracy_score(y_test, pred, normalize=True))
        print(f"Confusion matrix for {i} is \n", confusion_matrix(y_test, pred))

column = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch','Embarked']        
prediction(column)

--------------------------------------------------------------------------------------
Here Dependent Variable is Survived & remaining are Independent Variables
x :
    Pclass  Sex   Age  SibSp  Parch     Fare  Embarked
0       3    1  22.0      1      0   7.2500         2
1       1    0  38.0      1      0  71.2833         0
2       3    0  26.0      0      0   7.9250         2
3       1    0  35.0      1      0  53.1000         2
4       3    1  35.0      0      0   8.0500         2

y:
 0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

Acuuracy for Survived is:  0.7715355805243446
Confusion matrix for Survived is 
 [[131  26]
 [ 35  75]]
--------------------------------------------------------------------------------------
Here Dependent Variable is Pclass & remaining are Independent Variables
x :
    Survived  Sex   Age  SibSp  Parch     Fare  Embarked
0         0    1  22.0      1      0   7.2500         2
1         1    0  38.0      1      0  71.2833         0
2   

#Resultant Accuracy Score

In [21]:
score = {'Dependent Variable': ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch','Embarked'],
         'Accuracy Score': [0.7835,0.5932,0.7835,0.6791,0.7425,0.6977],
         'Total ':[268,268,268,268,268,268],
         'Correct Prediction':[210,159,210,182,199,187],
         'Incorrect Prediction':[58,109,58,86,69,81]}
score = pd.DataFrame(score)
score

Unnamed: 0,Dependent Variable,Accuracy Score,Total,Correct Prediction,Incorrect Prediction
0,Survived,0.7835,268,210,58
1,Pclass,0.5932,268,159,109
2,Sex,0.7835,268,210,58
3,SibSp,0.6791,268,182,86
4,Parch,0.7425,268,199,69
5,Embarked,0.6977,268,187,81


#Conclusion:

1. The Survived Column and Gender column has same accuracy score
2. Pclass has the least accuracy