# Titanic Survival Prediction using Naive Bayes and K-Nearest Neighbours
Implemented from scratch using NumPy, pandas, and Matplotlib.

**Dataset:** Titanic - Machine Learning from Disaster

This notebook demonstrates:
- Data preprocessing and visualization
- Naive Bayes and K-Nearest Neighbours from scratch
- Evaluation of predictions using accuracy


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Read the three CSV files (paths are relative to the working directory)
train, test, gender_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "gender_submission.csv")
)

## Data Preprocessing
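- Drop unused columns: `Cabin`, `Ticket`, `Name`, `PassengerId`
- Fill missing `Age` and `Fare` with the column median, and missing `Embarked` with the most frequent value
- Encode `Sex` and `Embarked` as integer labels
- Standardize `Age` and `Fare` (zero mean, unit variance) so that large-valued features do not dominate the KNN distance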

In [None]:
from sklearn.preprocessing import LabelEncoder

def preprocess(df):
    """Clean and encode a Titanic passenger table for the classifiers.

    The input frame is copied first, so the caller's data is not modified.

    Steps:
      1. Drop ``Cabin``, ``Ticket``, ``Name`` and ``PassengerId`` if present.
      2. Fill missing ``Age`` / ``Fare`` with the column median and missing
         ``Embarked`` with the most frequent value.
      3. Encode ``Sex`` and ``Embarked`` as integer codes ``0..n-1``, assigned
         in sorted order of the distinct values.
      4. Standardize ``Age`` and ``Fare`` to zero mean and unit sample
         standard deviation, using the statistics of ``df`` itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Fare``, ``Embarked`` and ``Sex``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    df = df.copy()
    df = df.drop(columns=["Cabin", "Ticket", "Name", "PassengerId"], errors="ignore")

    # Assign the filled column back instead of calling
    # ``df[col].fillna(..., inplace=True)``: the in-place form operates on a
    # temporary column selection, which pandas warns about and which has no
    # effect on ``df`` once Copy-on-Write is active.
    for col in ("Age", "Fare"):
        df[col] = df[col].fillna(df[col].median())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])

    # sort=True makes the codes depend only on the set of distinct values,
    # not on the order in which they happen to appear in the rows.
    for col in ("Sex", "Embarked"):
        df[col] = pd.factorize(df[col], sort=True)[0]

    for col in ("Age", "Fare"):
        centered = df[col] - df[col].mean()
        spread = df[col].std()
        # A constant column has std 0 and a single-row frame has std NaN;
        # dividing by either would fill the column with NaN/inf, so leave the
        # centered values unscaled in that case.
        if not spread > 0:
            spread = 1.0
        df[col] = centered / spread

    return df

# Run the preprocessing pipeline on the training set, then separate the
# "Survived" target column from the remaining feature columns.
train_processed = preprocess(train)
y_train = train_processed["Survived"].to_numpy()
X_train = train_processed.drop(columns=["Survived"]).to_numpy()

## Naive Bayes Classifier (from scratch)


In [None]:
def naive_bayes_train(X, y):
    """Estimate per-class Gaussian parameters and class priors.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix with one row per sample.
    y : numpy.ndarray
        Class label of each row of ``X``.

    Returns
    -------
    dict
        Maps each distinct label in ``y`` to a dict with keys ``"mean"`` and
        ``"var"`` (per-feature statistics of that class's rows) and
        ``"prior"`` (fraction of all rows belonging to the class).
    """
    total_rows = X.shape[0]
    summaries = {}
    for label in np.unique(y):
        members = X[y == label]
        summaries[label] = dict(
            mean=np.mean(members, axis=0),
            # Small constant keeps the variance strictly positive
            var=np.var(members, axis=0) + 1e-9,
            prior=members.shape[0] / total_rows,
        )
    return summaries

def gaussian_prob(x, mean, var):
    """Evaluate the Gaussian density with the given mean and variance at ``x``.

    Works elementwise on NumPy arrays as well as on scalars.
    """
    squared_deviation = (x - mean) ** 2
    normaliser = 1 / np.sqrt(2 * np.pi * var)
    return normaliser * np.exp(-squared_deviation / (2 * var))

def naive_bayes_predict(X, summaries):
    """Predict the most probable class for every row of ``X``.

    Parameters
    ----------
    X : array-like
        2-D feature matrix with one row per sample.
    summaries : dict
        Mapping ``label -> {"mean", "var", "prior"}`` as returned by
        ``naive_bayes_train``.

    Returns
    -------
    numpy.ndarray
        Predicted label for each row. When several classes tie, the one that
        comes first in ``summaries`` is chosen.
    """
    X = np.asarray(X, dtype=float)
    classes = list(summaries)
    log_posteriors = np.empty((X.shape[0], len(classes)))

    for column, label in enumerate(classes):
        params = summaries[label]
        mean, var = params["mean"], params["var"]
        # Evaluate the Gaussian log-density directly. Taking np.log of the
        # density itself underflows to log(0) = -inf for points far from the
        # mean (or for very small variances), which makes every class score
        # -inf and the prediction arbitrary.
        log_density = -0.5 * (np.log(2 * np.pi * var) + (X - mean) ** 2 / var)
        log_posteriors[:, column] = np.log(params["prior"]) + log_density.sum(axis=1)

    # argmax returns the first maximum, so ties go to the earliest class
    return np.asarray(classes)[np.argmax(log_posteriors, axis=1)]

# Fit Gaussian Naive Bayes on the training set and score it on the same data
nb_model = naive_bayes_train(X_train, y_train)
nb_predictions = naive_bayes_predict(X_train, nb_model)
nb_accuracy = np.mean(nb_predictions == y_train)
print(f"Naive Bayes Training Accuracy: {nb_accuracy * 100:.2f}%")

## K-Nearest Neighbours (from scratch)


In [None]:
def euclidean_distance(a, b):
    """Return the Euclidean (L2) distance between vectors ``a`` and ``b``."""
    delta = a - b
    return np.sqrt(np.sum(delta * delta))

def knn_predict(X_train, y_train, X_test, k=5):
    """Classify each row of ``X_test`` by majority vote of its nearest neighbours.

    Parameters
    ----------
    X_train : array-like
        2-D matrix of training features, one row per sample.
    y_train : array-like
        Label of each training row.
    X_test : array-like
        2-D matrix of query points with the same number of columns.
    k : int, default 5
        Number of neighbours that vote. If ``k`` exceeds the number of
        training rows, all training rows vote.

    Returns
    -------
    numpy.ndarray
        Predicted label for each query row. If the vote is tied, the smallest
        label among the tied ones is returned (``np.unique`` sorts labels).

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    # Coerce once so plain lists work and fancy indexing on y_train is valid
    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train)
    X_test = np.asarray(X_test, dtype=float)

    predictions = []
    for query in X_test:
        # Squared Euclidean distance to every training row in one vectorised
        # step; the square root is skipped because it does not change the
        # ordering of the neighbours.
        squared_distances = ((X_train - query) ** 2).sum(axis=1)
        # Stable sort: equidistant neighbours are taken in training order
        nearest = np.argsort(squared_distances, kind="stable")[:k]
        labels, votes = np.unique(y_train[nearest], return_counts=True)
        predictions.append(labels[np.argmax(votes)])
    return np.array(predictions)

# Classify the training rows against the training set itself and report the
# fraction of labels recovered.
knn_preds = knn_predict(X_train, y_train, X_train, k=5)
knn_accuracy = np.mean(knn_preds == y_train)
print(f"KNN Training Accuracy (k=5): {knn_accuracy * 100:.2f}%")

## Conclusion
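- Naive Bayes performed well despite its assumption that the features are conditionally independent given the class.
- KNN is sensitive to feature scaling and to the choice of `k`.

Both classifiers (training and prediction) were implemented from scratch with NumPy rather than with an ML library. Note that the accuracies reported above are measured on the training data, so they are optimistic estimates of performance on unseen passengers.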
