### In this file, I have built classifier models, 1 is Naive Bayesian and the other is Vanilla Neural Network classifier. The dataset used is titanic dataset.

In [89]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import KNNImputer
import polars as pl


* Polars is also being used as it has the ability to read data types accurately in scenerios where pandas sometimes breaks down.


* So the normal workflow for me is to load the dataset with polars and then convert to pandas. It reduces chances of error.

In [90]:
df= sns.load_dataset('titanic')
df_polars= pl.from_pandas(df)

Here is an important hack:
* If you are working with excel files, they are very fussy in terms of the inconsistency in datatypes when loading the file etc.
* The hack is to convert the excel files to csv with `df.to_csv()` function and then work on that csv file
* This makes life easier as the errors don't occur with .csv.

In [91]:
df_polars.dtypes

[Int64,
 Int64,
 String,
 Float64,
 Int64,
 Int64,
 Float64,
 String,
 Categorical(ordering='physical'),
 String,
 Boolean,
 Categorical(ordering='physical'),
 String,
 String,
 Boolean]

In [92]:
df.to_csv('Titanic_Dataset.csv', )

In [93]:
df= pd.read_csv('Titanic_Dataset.csv', index_col=False)

## Ordinal Encoding to handle missing as well as for Classification

In [94]:
oencoder= OrdinalEncoder()
encoded= oencoder.fit_transform(df)

In [95]:
df=pd.DataFrame(data=encoded, columns= df.columns)

In [96]:
df

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,0.0,2.0,1.0,28.0,1.0,0.0,18.0,2.0,2.0,1.0,1.0,,2.0,0.0,0.0
1,1.0,1.0,0.0,0.0,51.0,1.0,0.0,207.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0
2,2.0,1.0,2.0,0.0,34.0,0.0,0.0,41.0,2.0,2.0,2.0,0.0,,2.0,1.0,1.0
3,3.0,1.0,0.0,0.0,47.0,1.0,0.0,189.0,2.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0
4,4.0,0.0,2.0,1.0,47.0,0.0,0.0,43.0,2.0,2.0,1.0,1.0,,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886.0,0.0,1.0,1.0,35.0,0.0,0.0,85.0,2.0,1.0,1.0,1.0,,2.0,0.0,1.0
887,887.0,1.0,0.0,0.0,24.0,0.0,0.0,153.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0
888,888.0,0.0,2.0,0.0,,1.0,2.0,131.0,2.0,2.0,2.0,0.0,,2.0,0.0,0.0
889,889.0,1.0,0.0,1.0,34.0,0.0,0.0,153.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,1.0


In [97]:
df.isna().sum()

Unnamed: 0       0
survived         0
pclass           0
sex              0
age            177
sibsp            0
parch            0
fare             0
embarked         2
class            0
who              0
adult_male       0
deck           688
embark_town      2
alive            0
alone            0
dtype: int64

In [98]:
imputer= KNNImputer()
imputed_data= imputer.fit_transform(df)

In [99]:
imputed_data

array([[  0.,   0.,   2., ...,   2.,   0.,   0.],
       [  1.,   1.,   0., ...,   0.,   1.,   0.],
       [  2.,   1.,   2., ...,   2.,   1.,   1.],
       ...,
       [888.,   0.,   2., ...,   2.,   0.,   0.],
       [889.,   1.,   0., ...,   0.,   1.,   1.],
       [890.,   0.,   2., ...,   1.,   0.,   1.]])

In [100]:
imputed_data= pd.DataFrame(data= imputed_data, columns= df.columns)

In [101]:
imputed_data

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,0.0,2.0,1.0,28.0,1.0,0.0,18.0,2.0,2.0,1.0,1.0,4.2,2.0,0.0,0.0
1,1.0,1.0,0.0,0.0,51.0,1.0,0.0,207.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0
2,2.0,1.0,2.0,0.0,34.0,0.0,0.0,41.0,2.0,2.0,2.0,0.0,4.2,2.0,1.0,1.0
3,3.0,1.0,0.0,0.0,47.0,1.0,0.0,189.0,2.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0
4,4.0,0.0,2.0,1.0,47.0,0.0,0.0,43.0,2.0,2.0,1.0,1.0,4.2,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886.0,0.0,1.0,1.0,35.0,0.0,0.0,85.0,2.0,1.0,1.0,1.0,2.8,2.0,0.0,1.0
887,887.0,1.0,0.0,0.0,24.0,0.0,0.0,153.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0
888,888.0,0.0,2.0,0.0,35.8,1.0,2.0,131.0,2.0,2.0,2.0,0.0,2.4,2.0,0.0,0.0
889,889.0,1.0,0.0,1.0,34.0,0.0,0.0,153.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,1.0


In [102]:
imputed_data.isna().sum()

Unnamed: 0     0
survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           0
embark_town    0
alive          0
alone          0
dtype: int64

# Gaussian Naive Bayes Classifier

In [103]:
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

nb_classifier= GaussianNB()

X= imputed_data.drop('survived', axis=1)
Y= imputed_data['survived']
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size= 0.2, random_state=42)


nothing =nb_classifier.fit(X_train, Y_train)
predictions= nb_classifier.predict(X_test)


accuracy= accuracy_score(predictions, Y_test)
print("Accuracy of the model is: ", accuracy)

Accuracy of the model is:  1.0


# Neural Network Classifier

For Neural Network, we will also scale the dataset.

In [104]:
from sklearn.preprocessing import MinMaxScaler

# scaler= MinMaxScaler()
# scaled_dataset= scaler.fit_transform(imputed_data)
scaled_dataset= imputed_data

In [105]:
dataset= pd.DataFrame(data= scaled_dataset, columns=imputed_data.columns)
dataset['survived']=dataset['survived'].astype(int)

X=dataset.drop('survived', axis=1)
Y=dataset['survived']

X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size= 0.1, random_state=42)


dataset

Unnamed: 0.1,Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0.0,0,2.0,1.0,28.0,1.0,0.0,18.0,2.0,2.0,1.0,1.0,4.2,2.0,0.0,0.0
1,1.0,1,0.0,0.0,51.0,1.0,0.0,207.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0,0.0
2,2.0,1,2.0,0.0,34.0,0.0,0.0,41.0,2.0,2.0,2.0,0.0,4.2,2.0,1.0,1.0
3,3.0,1,0.0,0.0,47.0,1.0,0.0,189.0,2.0,0.0,2.0,0.0,2.0,2.0,1.0,0.0
4,4.0,0,2.0,1.0,47.0,0.0,0.0,43.0,2.0,2.0,1.0,1.0,4.2,2.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886.0,0,1.0,1.0,35.0,0.0,0.0,85.0,2.0,1.0,1.0,1.0,2.8,2.0,0.0,1.0
887,887.0,1,0.0,0.0,24.0,0.0,0.0,153.0,2.0,0.0,2.0,0.0,1.0,2.0,1.0,1.0
888,888.0,0,2.0,0.0,35.8,1.0,2.0,131.0,2.0,2.0,2.0,0.0,2.4,2.0,0.0,0.0
889,889.0,1,0.0,1.0,34.0,0.0,0.0,153.0,0.0,0.0,1.0,1.0,2.0,0.0,1.0,1.0


In [106]:
print(f"Size of X_train: {X_train.shape}")
print(f"Size of X_test: {X_test.shape}")
print(f"Size of Y_train: {Y_train.shape}")
print(f"Size of Y_test: {Y_test.shape}")


Size of X_train: (801, 15)
Size of X_test: (90, 15)
Size of Y_train: (801,)
Size of Y_test: (90,)


In [108]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout


model = Sequential([
    Dense(128, input_shape=(X_train.shape[1],), activation='relu'),  # Input layer with 16 units
      
    Dense(64, activation='relu'),  
    Dense(1, activation='sigmoid')  ])


model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', 'recall', 'precision'])

model.fit(X_train, Y_train, epochs=50, batch_size=30)

predicted=model.predict(X_test)
predicted = (predicted > 0.5).astype(int)


accuracy= accuracy_score(Y_test, predicted)
print('The accuracy is: ', accuracy)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.5550 - loss: 3.5950 - precision: 0.4260 - recall: 0.4440
Epoch 2/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6010 - loss: 1.0692 - precision: 0.4612 - recall: 0.4845
Epoch 3/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6094 - loss: 1.1198 - precision: 0.4990 - recall: 0.5391
Epoch 4/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6570 - loss: 0.9923 - precision: 0.6006 - recall: 0.4843
Epoch 5/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6768 - loss: 0.6943 - precision: 0.5901 - recall: 0.6105
Epoch 6/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.7772 - loss: 0.5305 - precision: 0.7567 - recall: 0.6099
Epoch 7/50
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

# Conclusion:

The performance of both the models has been good although the Neural Network lacks performance due to lack of tuning. Repeated trials can help us achieve better results.The Naive Bayesian Classifer has 100% accuracy while Vanilla Neural Network showed 96-97% accuracy with 2 layers (1st of 128 units and 2nd of 64 units). The metrics included with the model that are precision, recall, loss seem erratic at points as the model size and batch size is small.