In [35]:
import numpy as np
import pandas as pd

In [36]:
# Load the dataset
data = pd.read_csv("Sleep_health_and_lifestyle_dataset.csv")


In [37]:
# Function to make categorical columns into labelled columns
def makelabelled(column):
    second_limit = column.mean()
    first_limit = 0.5 * second_limit
    third_limit = 1.5 * second_limit
    for i in range(0,len(column)):
        if(column[i]<first_limit):
            column[i] = 0
        elif(column[i] < second_limit):
            column[i] = 1
        elif(column[i]<third_limit):
            column[i] = 2
        else:
            column[i] = 3
    return column

In [38]:
# Function to fit the Multinomial Naive Bayes model
def fit(X_train, Y_train):
    result = {}
    class_values = set(Y_train)
    for curr_value in class_values:
        result[curr_value] = {}
        result["total_data"] = len(Y_train)
        curr_class_rows = (Y_train == curr_value)
        X_train_curr = X_train[curr_class_rows]
        Y_train_curr = Y_train[curr_class_rows]
        num_features = X_train.shape[1]
        result[curr_value]["total_count"] = len(Y_train_curr)
        for j in range(1, num_features + 1):
            result[curr_value][j] = {}
            all_possible_values = set(X_train[:, j - 1])
            for this_value in all_possible_values:
                result[curr_value][j][this_value] = (X_train_curr[:, j - 1] == this_value).sum()
    return result

In [39]:
# Function to calculate probability for a single data point
def probability(dictionary, x, current_class):
    output = np.log(dictionary[current_class]["total_count"]) - np.log(dictionary["total_data"])
    num_features = len(dictionary[current_class].keys()) - 1
    for j in range(1, num_features + 1):
        xj = x[j - 1]
        count_current_class_with_value_xj = dictionary[current_class][j][xj] + 1
        count_current_class = dictionary[current_class]["total_count"] + len(dictionary[current_class][j].keys())
        current_xj_prob = np.log(count_current_class_with_value_xj) - np.log(count_current_class)
        output = output + current_xj_prob
    return output

In [40]:
# Function to predict the class for a single data point
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if current_class == "total_data":
            continue
        p_curr_class = probability(dictionary, x, current_class)
        if first_run or p_curr_class > best_p:
            best_p = p_curr_class
            best_class = current_class
        first_run = False
    return best_class



In [41]:
# Function to predict classes for multiple data points
def predict(dictionary, X_test):
    Y_pred = []
    for x in X_test:
        x_class = predictSinglePoint(dictionary, x)
        Y_pred.append(x_class)
    return Y_pred

In [42]:
# Convert categorical columns to numerical labels
from sklearn.preprocessing import LabelEncoder
label_encoders = {}
for column in ["Gender", "Occupation", "BMI Category"]:
    label_encoders[column] = LabelEncoder()
    data[column] = label_encoders[column].fit_transform(data[column])


In [43]:
# Convert blood pressure column to numerical labels
data['Blood Pressure'] = data['Blood Pressure'].apply(lambda x: x.split('/')[0] if isinstance(x, str) else x).astype(float)

# Now you can proceed with the label encoding
label_encoders['Blood Pressure'] = LabelEncoder()
data['Blood Pressure'] = label_encoders['Blood Pressure'].fit_transform(data['Blood Pressure'])


In [44]:
# Prepare X and y
X = data.drop(["Person ID", "Quality of Sleep"], axis=1).values
y = data["Quality of Sleep"].values

In [45]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [46]:
# Fit the model
dictionary = fit(X_train, Y_train)

In [51]:
# Function to predict the class for a single data point
def predictSinglePoint(dictionary, x):
    classes = dictionary.keys()
    best_p = -1000
    best_class = -1
    first_run = True
    for current_class in classes:
        if current_class == "total_data":
            continue
        p_curr_class = probability(dictionary, x, current_class)
        print("Current class:", current_class)
        print("Dictionary keys:", dictionary[current_class].keys())
        print("x:", x)
        if first_run or p_curr_class > best_p:
            best_p = p_curr_class
            best_class = current_class
        first_run = False
    return best_class


In [52]:
# Predictions
Y_pred = predict(dictionary, X_test)


Current class: 4
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 5
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 6
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 7
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 8
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 9
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 37 0 7.2 60 4 0 0 68 7000 nan]
Current class: 4
Dictionary keys: dict_keys(['total_count', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
x: [0 50 5 6.0 90 8 3 16 75 10000 'Sleep Apnea']
Current class: 5
Dictionary keys: dict_keys(['total_count

KeyError: 6

### Implememtation of Multinomial Naive Bayes from Scratch

In [48]:

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Y_test, Y_pred))
print(confusion_matrix(Y_test, Y_pred))

NameError: name 'Y_pred' is not defined