In [1]:
import numpy as np
import pandas as pd

class NaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.n_classes = len(self.classes)
        self.means = np.zeros((self.n_classes, X.shape[1]))
        self.variances = np.zeros((self.n_classes, X.shape[1]))
        self.class_prior = np.zeros(self.n_classes)
        
        for idx, c in enumerate(self.classes):
            X_c = X[y == c]
            self.means[idx, :] = X_c.mean(axis=0)
            self.variances[idx, :] = X_c.var(axis=0)
            self.class_prior[idx] = X_c.shape[0] / X.shape[0]

    def _calculate_probability(self, X):
        probabilities = []
        for idx, c in enumerate(self.classes):
            mean = self.means[idx, :]
            var = self.variances[idx, :]
            prior = np.log(self.class_prior[idx])
            
            likelihood = -0.5 * np.sum(np.log(2 * np.pi * var))
            likelihood -= 0.5 * np.sum(((X - mean) ** 2) / var, axis=1)
            posterior = prior + likelihood
            probabilities.append(posterior)
        
        return np.array(probabilities).T

    def predict(self, X):
        probs = self._calculate_probability(X)
        return self.classes[np.argmax(probs, axis=1)]
    
    def predict_proba(self, X):
        probs = self._calculate_probability(X)
        return np.exp(probs) / np.sum(np.exp(probs), axis=1, keepdims=True)

# Load the data
df = pd.read_csv("titanic.csv")

# Drop unnecessary columns
df.drop(['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'], axis='columns', inplace=True)

# Handle categorical data
dummies = pd.get_dummies(df.Sex)
df = pd.concat([df, dummies], axis='columns')
df.drop(['Sex', 'male'], axis='columns', inplace=True)

# Fill missing values
df.Age = df.Age.fillna(df.Age.mean())

# Separate features and target
inputs = df.drop('Survived', axis='columns')
target = df.Survived

# Split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(inputs, target, test_size=0.3, random_state=42)

# Initialize and train the Naive Bayes model
model = NaiveBayes()
model.fit(X_train, y_train)

# Evaluate the model
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy on test set: {accuracy}")

# Display the first 10 rows of the test set
print("First 10 rows of the test set:")
print(X_test.head(10))

# Display the true labels of the first 10 rows of the test set
print("True labels of the first 10 rows of the test set:")
print(y_test.head(10))

# Predict survival probabilities for the first 10 rows of the test set
print("Predictions for the first 10 rows of the test set:")
print(model.predict(X_test.head(10)))
print("Prediction probabilities for the first 10 rows of the test set:")
print(model.predict_proba(X_test.head(10)))


Accuracy on test set: 0.7761194029850746
First 10 rows of the test set:
     Pclass        Age     Fare  female
709       3  29.699118  15.2458   False
439       2  31.000000  10.5000   False
840       3  20.000000   7.9250   False
720       2   6.000000  33.0000    True
39        3  14.000000  11.2417    True
290       1  26.000000  78.8500    True
300       3  29.699118   7.7500    True
333       3  16.000000  18.0000   False
208       3  16.000000   7.7500    True
136       1  19.000000  26.2833    True
True labels of the first 10 rows of the test set:
709    1
439    0
840    0
720    1
39     1
290    1
300    1
333    0
208    1
136    1
Name: Survived, dtype: int64
Predictions for the first 10 rows of the test set:
[0 0 0 1 1 1 1 0 1 1]
Prediction probabilities for the first 10 rows of the test set:
[[0.96926072 0.03073928]
 [0.93714839 0.06285161]
 [0.96210901 0.03789099]
 [0.15460801 0.84539199]
 [0.37853196 0.62146804]
 [0.02101401 0.97898599]
 [0.46477178 0.53522822]
 [0.958