In [1]:
# Cell 1

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df = pd.read_csv("cirrhosis.csv")
df.head()


Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.6,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.1,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0


In [3]:
# Cell 2

target = "Stage"

# Drop rows with missing values
df = df.dropna()

# Encode ALL object columns (including Stage)
for col in df.columns:
    if df[col].dtype == 'object':
        df[col], _ = pd.factorize(df[col])

# Split features and target
X = df.drop(columns=[target]).values
y = df[target].values


In [4]:
# Cell 3

scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [5]:
# Cell 4

class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.mean = {}
        self.var = {}
        self.prior = {}

        for c in self.classes:
            X_c = X[y == c]
            self.mean[c] = np.mean(X_c, axis=0)
            self.var[c] = np.var(X_c, axis=0) + 1e-6
            self.prior[c] = len(X_c) / len(y)

    def gaussian_pdf(self, x, mean, var):
        return (1 / np.sqrt(2 * np.pi * var)) * np.exp(-((x - mean)**2) / (2 * var))

    def predict(self, X):
        preds = []
        for x in X:
            posteriors = []
            for c in self.classes:
                prior = np.log(self.prior[c])
                likelihood = np.sum(np.log(self.gaussian_pdf(x, self.mean[c], self.var[c])))
                posteriors.append(prior + likelihood)
            preds.append(self.classes[np.argmax(posteriors)])
        return np.array(preds)


In [6]:
# Cell 5

nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)


  likelihood = np.sum(np.log(self.gaussian_pdf(x, self.mean[c], self.var[c])))


In [7]:
# Cell 6

def macro_f1(y_true, y_pred):
    classes = np.unique(y_true)
    f1_scores = []

    for c in classes:
        tp = np.sum((y_pred == c) & (y_true == c))
        fp = np.sum((y_pred == c) & (y_true != c))
        fn = np.sum((y_pred != c) & (y_true == c))

        precision = tp / (tp + fp + 1e-10)
        recall = tp / (tp + fn + 1e-10)
        f1 = 2 * precision * recall / (precision + recall + 1e-10)

        f1_scores.append(f1)

    return np.mean(f1_scores)

print("Macro F1 Score:", macro_f1(y_test, y_pred))


Macro F1 Score: 0.2541269840895891


In [8]:
# Cell 7

def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

print("Accuracy:", accuracy(y_test, y_pred))


Accuracy: 0.2857142857142857
