# Implementing the NaiveBayes Algorithm

In [14]:
import pandas as pd
import numpy as np

# Load the Titanic training CSV from a path relative to the working directory.
df= pd.read_csv('../day5_titanic/train.csv')
# Preview the first two rows to confirm the expected columns are present.
df.head(2)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C


In [15]:
# Target: the Survived column.
y=df['Survived']
# Candidate features. Cabin is kept here so the next cell can reduce it to a
# "cabin recorded" 0/1 flag.
X=df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']]

In [16]:
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# --- Encode features --------------------------------------------------------
X_numeric = X[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
X_categorical = X[['Sex', 'Embarked']]
X_neither = X['Cabin']  # raw cabin strings; not fed to the model directly

# One-hot encode the categorical columns (prefix passed by keyword so the
# call does not rely on positional argument order).
X_onehot = pd.get_dummies(X_categorical, prefix=['Sex', 'Embarked']).astype(int)

# Reduce Cabin to a flag: 1 when a cabin is recorded, 0 when it is missing.
X_cabin_nonna = X['Cabin'].notna().astype(int)

# Every piece is derived from X, so they share its index and the
# index-on-index merges line the rows up one-to-one.
X_encoded0 = pd.merge(X_numeric, X_cabin_nonna, left_index=True, right_index=True)
X_encoded1 = pd.merge(X_encoded0, X_onehot, left_index=True, right_index=True)

# --- Train / test split -----------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded1, y, test_size=0.2, random_state=42, stratify=y
)

# Work on explicit copies: assigning a column into the frames handed back by
# the split can otherwise trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# --- Impute missing Age -----------------------------------------------------
imputer = SimpleImputer(strategy='median')  # or 'mean' if you prefer

# Fit the imputer on the training rows only, then reuse the same statistic
# for the test rows so no test information leaks into training.
X_train['Age'] = imputer.fit_transform(X_train[['Age']])
X_test['Age'] = imputer.transform(X_test[['Age']])

# --- Standardize ------------------------------------------------------------
# The scaler is likewise fitted on the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [17]:
#### Application from sklearn library
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Initialize the reference model; var_smoothing=1e-9 is spelled out so the
# value being used is visible in the notebook.
nb = GaussianNB(var_smoothing=1e-9)

# Fit on the standardized training features.
nb.fit(X_train_scaled, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_train_pred = nb.predict(X_train_scaled)
y_test_pred = nb.predict(X_test_scaled)

# Evaluate accuracy
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy:  {test_acc:.4f}")


Train Accuracy: 0.7809
Test Accuracy:  0.7430


In [19]:
# Keep the variances learned by the fitted sklearn model for later comparison.
skvar = nb.var_

In [28]:
# Keep the means learned by the fitted sklearn model; they are the reference
# that the from-scratch class means are compared against below.
skmean = nb.theta_

In [21]:
# Check which container type the fitted variances are stored in.
type(skvar)

numpy.ndarray

In [22]:
# Check the layout of the variances (the output shows 2 rows and 11 columns).
skvar.shape

(2, 11)

In [23]:
# Class labels seen by the fitted model.
nb.classes_

array([0, 1])

### The features are not conditionally independent; TODO: drop the one-hot encoded columns and compare performance

##### Tuning hyperparameter

In [9]:
from sklearn.model_selection import GridSearchCV

# Try 7 log-spaced var_smoothing values from 1e-9 to 1e-6 with 5-fold
# cross-validation on the scaled training data.
param_grid = {'var_smoothing': np.logspace(-9, -6, 7)}
grid = GridSearchCV(GaussianNB(), param_grid, cv=5)
grid.fit(X_train_scaled, y_train)

# Report the selected value and the score it achieved in cross-validation.
print("Best var_smoothing:", grid.best_params_['var_smoothing'])
print("Best CV score:", grid.best_score_)


Best var_smoothing: 1e-09
Best CV score: 0.7725007386979218


In [6]:
## Implementation from Scratch

# Class priors from the training labels: summing the labels counts the
# class-1 rows, and dividing by the number of rows gives the fraction.
n_train_rows = np.shape(y_train)[0]
p_1 = np.sum(y_train, axis=0) / n_train_rows
p_0 = 1 - p_1
for prior in (p_1, p_0):
    print(prior)

0.38342696629213485
0.6165730337078652


In [7]:
# Cross-check of the class-0 prior: the fraction of training labels equal to 0.
# The displayed value matches p_0 printed in the previous cell.
p0 = np.mean(y_train == 0)
p0

np.float64(0.6165730337078652)

In [16]:
# Number of training labels.
np.shape(y_train)

(712,)

In [8]:
# Per-class feature means for the training rows.
# NOTE: these are computed on the *unscaled* X_train, whereas the sklearn
# model above was fitted on X_train_scaled, so they are not directly
# comparable with nb.theta_ / nb.var_.
X_mean_1 = np.mean(X_train[y_train==1],axis = 0)
X_mean_0 = np.mean(X_train[y_train==0],axis = 0)

# Per-class feature standard deviations; ddof=0 requests the population form.
X_std_1 = np.std(X_train[y_train==1],axis = 0, ddof=0)
X_std_0 = np.std(X_train[y_train==0],axis = 0, ddof=0)

In [25]:
# Check the container type of the unscaled training features.
type(X_train)

pandas.core.frame.DataFrame

In [24]:
# Check the container type of the per-class mean computed from X_train.
type(X_mean_1)

pandas.core.series.Series

In [33]:
def get_distance_vectors(skmean, X_mean_0, X_mean_1):
    """Measure how far two hand-computed class means are from reference means.

    Parameters
    ----------
    skmean : 2-D array
        Reference means; row 0 is compared with ``X_mean_0`` and row 1 with
        ``X_mean_1``.
    X_mean_0, X_mean_1 : 1-D array-like
        Hand-computed mean vectors for the first and second class.

    Returns
    -------
    tuple
        ``(d0, d1)``: the Euclidean norm of each difference vector.
    """
    reference_rows = (skmean[0, :], skmean[1, :])
    candidates = (X_mean_0, X_mean_1)
    d0, d1 = [
        np.linalg.norm(reference - candidate)
        for reference, candidate in zip(reference_rows, candidates)
    ]
    return (d0, d1)

In [30]:
# Compare sklearn's means with the means computed from the unscaled X_train;
# the printed distances are far from zero, which exposes the mismatch.
print(get_distance_vectors(skmean, X_mean_0, X_mean_1))

(np.float64(37.258430732215736), np.float64(56.08012391456066))


In [31]:
# Recompute the per-class statistics on the standardized training matrix,
# i.e. on the same data the sklearn model was fitted on.
scaled_rows_1 = X_train_scaled[y_train==1]
scaled_rows_0 = X_train_scaled[y_train==0]

X_mean_11 = scaled_rows_1.mean(axis=0)
X_mean_01 = scaled_rows_0.mean(axis=0)

# ddof=0 gives the population standard deviation.
X_std_11 = scaled_rows_1.std(axis=0, ddof=0)
X_std_01 = scaled_rows_0.std(axis=0, ddof=0)

In [34]:
# Same comparison using the means computed from X_train_scaled; the printed
# distances are both zero, so these match sklearn's means.
print(get_distance_vectors(skmean, X_mean_01, X_mean_11))

(np.float64(0.0), np.float64(0.0))


In [None]:

def predict(X, mean1, std1, mean0, std0, p0):
    '''
    Placeholder signature only: this version has no body and returns None.
    A later cell in this notebook redefines predict with a working body.

    X has n_examples rows and n_features columns.
    p0 is the probability of observing class 0 in the training data.
    Goal: determine, for each example in X, which of the two class-conditional
    Gaussians it more likely came from.
    '''

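### Gotcha!! We are doing naive Bayes: the naive assumption is that the features $x_i$ are conditionally independent given the class $y$, so each class-conditional density is a product of per-feature Gaussians rather than a full multivariate Gaussian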

In [35]:
# Rebind the per-class statistics so they are estimated from the standardized
# training matrix, the same representation that is passed to predict().
survived_scaled = X_train_scaled[y_train==1]
perished_scaled = X_train_scaled[y_train==0]

# Class 1: feature means and population standard deviations (ddof=0).
X_mean_1, X_std_1 = survived_scaled.mean(axis=0), survived_scaled.std(axis=0, ddof=0)
# Class 0: same statistics.
X_mean_0, X_std_0 = perished_scaled.mean(axis=0), perished_scaled.std(axis=0, ddof=0)

In [36]:
def predict(X, mean1, std1, mean0, std0, p0, eps=1e-9):
    """
    Predict binary class labels using Gaussian Naive Bayes.

    Each feature is modelled as an independent Gaussian given the class, so
    the class log-likelihood of a sample is the sum of per-feature Gaussian
    log-densities. The class with the larger log-posterior wins; ties go to
    class 0.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features) or (n_features,)
        Data to classify. A 1-D input is treated as a single sample.
    mean1, std1 : array-like of shape (n_features,)
        Mean and std for features given class 1.
    mean0, std0 : array-like of shape (n_features,)
        Mean and std for features given class 0.
    p0 : float
        Prior probability of class 0 (P(y=0)); P(y=1) is taken as 1 - p0.
    eps : float, default 1e-9
        Constant added to every variance so a zero standard deviation does
        not cause a division by zero.

    Returns
    -------
    y_pred : ndarray of shape (n_samples,)
        Predicted class labels (0 or 1).
    """
    # Promote a single 1-D sample to one row so the axis=1 reductions work.
    X = np.atleast_2d(np.asarray(X, dtype=float))

    def _log_likelihood(mean, std):
        # Sum of per-feature Gaussian log-densities for every row of X.
        variance = np.asarray(std, dtype=float) ** 2 + eps
        centered = X - np.asarray(mean, dtype=float)
        return -0.5 * np.sum(np.log(2 * np.pi * variance)) \
               - 0.5 * np.sum((centered ** 2) / variance, axis=1)

    # Small floor inside the logs so a prior of exactly 0 or 1 stays finite.
    prior_floor = 1e-9
    log_p0 = np.log(p0 + prior_floor)
    log_p1 = np.log(1 - p0 + prior_floor)

    # Posterior log-probabilities, up to the shared evidence term.
    log_posterior_0 = log_p0 + _log_likelihood(mean0, std0)
    log_posterior_1 = log_p1 + _log_likelihood(mean1, std1)

    # Strict inequality: equal posteriors resolve to class 0.
    y_pred = (log_posterior_1 > log_posterior_0).astype(int)

    return y_pred


# Gaussian parameters and class-0 prior, in the positional order that
# predict() expects after the data argument.
scratch_params = (X_mean_1, X_std_1, X_mean_0, X_std_0, p_0)

y_train_pred = predict(X_train_scaled, *scratch_params)
y_test_pred = predict(X_test_scaled, *scratch_params)

# ----------------------------
# Evaluate accuracy
# ----------------------------
train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print(f"Train Accuracy: {train_acc:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")


Train Accuracy: 0.7809
Test Accuracy: 0.7430


#### Plotting 

#### Gaussian naive Bayes from sklearn gives a test accuracy of about 0.74, but my implementation only reached about 0.6: find what differs and what needs to be fixed

### My hunch: these statistics should be computed on X_train_scaled, not on X_train

X_mean_1 = np.mean(X_train[y_train==1],axis = 0)

X_mean_0 = np.mean(X_train[y_train==0],axis = 0)


### Checking the implementation from the github repo MLfromscratch

In [10]:
import numpy as np


class NaiveBayes:
    """Gaussian Naive Bayes classifier implemented with NumPy.

    Each feature is modelled as an independent Gaussian given the class.
    ``fit`` estimates the per-class feature means, variances and class priors;
    ``predict`` returns the class with the largest log-posterior.

    Parameters
    ----------
    var_smoothing : float, default 1e-9
        Fraction of the largest overall feature variance that is added to
        every per-class variance. This keeps the model finite when a feature
        is constant within a class. Pass 0 to disable smoothing.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training features (NumPy array or DataFrame).
        y : array-like of shape (n_samples,)
            Class labels.

        Raises
        ------
        ValueError
            If X is not 2-D, is empty, or its row count differs from y.
        """
        # Convert up front so DataFrame / Series / list inputs all behave
        # like plain arrays below.
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y).ravel()
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit NaiveBayes on an empty training set")
        if y.shape[0] != n_samples:
            raise ValueError(
                "X and y have inconsistent lengths: %d vs %d" % (n_samples, y.shape[0])
            )

        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        # Additive variance floor, scaled by the largest overall feature
        # variance. Falls back to the raw var_smoothing value when every
        # feature is constant, so the floor is never accidentally zero.
        largest_var = X.var(axis=0).max()
        self._epsilon = self.var_smoothing * (largest_var if largest_var > 0 else 1.0)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0) + self._epsilon
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        """Return the predicted class label for each sample in X.

        X is normally 2-D, shape (n_samples, n_features). A 1-D X is treated
        as one sample per element, matching the earlier element-by-element
        behaviour.
        """
        scores = self._joint_log_likelihood(X)
        return self._classes[np.argmax(scores, axis=1)]

    def _joint_log_likelihood(self, X):
        # log P(class) + sum over features of log N(x | mean, var), computed
        # directly in log space so tiny densities cannot underflow to log(0).
        X = np.asarray(X, dtype=np.float64)
        if X.ndim == 1:
            X = X[:, np.newaxis]
        if X.ndim != 2:
            raise ValueError("X must be 1-D or 2-D, got %d dimensions" % X.ndim)

        n_classes = len(self._classes)
        scores = np.empty((X.shape[0], n_classes), dtype=np.float64)
        for idx in range(n_classes):
            var = self._var[idx]
            log_norm = -0.5 * np.sum(np.log(2.0 * np.pi * var))
            mahalanobis = np.sum((X - self._mean[idx]) ** 2 / var, axis=1)
            scores[:, idx] = np.log(self._priors[idx]) + log_norm - 0.5 * mahalanobis
        return scores

    def _predict(self, x):
        # Single-sample helper kept for backward compatibility.
        row = np.asarray(x, dtype=np.float64).reshape(1, -1)
        return self.predict(row)[0]

    def _pdf(self, class_idx, x):
        # Per-feature Gaussian density of x under the given class. Kept for
        # inspection; predict() uses the log-space computation instead.
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x - mean) ** 2) / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [13]:
# Testing
def accuracy(y_true, y_pred):
    """Return the fraction of positions where y_pred equals y_true.

    Both inputs are converted to NumPy arrays first, so plain lists, pandas
    Series and arrays are all compared element by element, by position.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (y_true.shape, y_pred.shape)
        )
    return np.mean(y_true == y_pred)

# NOTE: this rebinds `nb`, which earlier cells used for sklearn's GaussianNB.
nb = NaiveBayes()
nb.fit(X_train_scaled, y_train)
predictions = nb.predict(X_test_scaled)

print("Naive Bayes classification accuracy", accuracy(y_test, predictions))

Naive Bayes classification accuracy 0.7430167597765364


## Fixed the error
### Compute the class-conditional means and standard deviations on the scaled data, because that is what the predict function is given