<a href="https://colab.research.google.com/github/Iammufarooq/Iammufarooq/blob/main/OxML_NaiveBayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
"""Full documentation at https://archive.ics.uci.edu/dataset/45/heart+disease"""

!pip install ucimlrepo



# Load and prepare the data

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

from ucimlrepo import fetch_ucirepo

In [None]:
# Download the dataset with id 45 through the ucimlrepo client
heart_disease = fetch_ucirepo(id=45)

# Display the per-variable metadata table (name, role, type, missing_values, ...)
heart_disease.variables

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,age,Feature,Integer,Age,,years,no
1,sex,Feature,Categorical,Sex,,,no
2,cp,Feature,Categorical,,,,no
3,trestbps,Feature,Integer,,resting blood pressure (on admission to the ho...,mm Hg,no
4,chol,Feature,Integer,,serum cholestoral,mg/dl,no
5,fbs,Feature,Categorical,,fasting blood sugar > 120 mg/dl,,no
6,restecg,Feature,Categorical,,,,no
7,thalach,Feature,Integer,,maximum heart rate achieved,,no
8,exang,Feature,Categorical,,exercise induced angina,,no
9,oldpeak,Feature,Integer,,ST depression induced by exercise relative to ...,,no


In [None]:
# @title Use only categorical features for demonstration

# Names of the variables that are categorical and have no missing values
category_features = heart_disease.variables.loc[
    lambda v: (v.type == "Categorical") & (v.missing_values == "no"),
    "name",
]

# Feature matrix restricted to those columns, and the target renamed to 'class'
X = heart_disease.data.features[category_features]
y = pd.Series(heart_disease.data.targets["num"], name='class')

# How often each value occurs, per feature column and for the target
feature_counts = {column: X[column].value_counts() for column in X.columns}
feature_counts['target'] = y.value_counts()
pd.DataFrame(feature_counts)

Unnamed: 0,sex,cp,fbs,restecg,exang,slope,target
0,97.0,,258.0,151.0,204.0,,164
1,206.0,23.0,45.0,4.0,99.0,142.0,55
2,,50.0,,148.0,,140.0,36
3,,86.0,,,,21.0,35
4,,144.0,,,,,13


In [None]:
# @title Partition the data
# Stratified 80/20 split. The fixed random_state makes the partition identical
# on every Restart & Run All, so results computed from it are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, stratify=y, random_state=0
)

print((len(X_train), len(X_test)))
y.value_counts(),  y_test.value_counts()

(242, 61)


(class
 0    164
 1     55
 2     36
 3     35
 4     13
 Name: count, dtype: int64,
 class
 0    33
 1    11
 3     7
 2     7
 4     3
 Name: count, dtype: int64)

# Implementation


In [None]:
import numpy as np
from typing import Dict, Optional

In [None]:
def compute_prior(target: pd.Series, log: bool = False) -> pd.Series:
  """Estimate the class prior probabilities p(y) from the observed labels.

  Args:
    target: the class label of every sample.
    log: when True, print the class counts and the resulting priors.

  Returns:
    A Series indexed by class label (in sorted order) holding the relative
    frequency of each class among the counted samples.
  """
  class_counts = target.value_counts().sort_index()

  # Normalise by the number of counted samples, not by the sum of the label
  # values: this makes the priors sum to 1 and also works for non-numeric labels.
  prior = class_counts / class_counts.sum()

  if log:
    print("Class counts:", class_counts, "Prior:", prior, sep='\n')

  return prior

# Priors over the full target column, printing the class counts and priors
prior = compute_prior(y, log=True)

Class counts:
class
0    164
1     55
2     36
3     35
4     13
Name: count, dtype: int64
Prior:
class
0    0.577465
1    0.193662
2    0.126761
3    0.123239
4    0.045775
Name: count, dtype: float64


In [None]:
# Count how often each value of the `cp` feature occurs within each target class
pd.crosstab(index=X['cp'], columns=y)

class,0,1,2,3,4
cp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,16,5,1,0,1
2,41,6,1,2,0
3,68,9,4,4,1
4,39,35,30,29,11


In [None]:
def compute_likelihoods(data: pd.DataFrame, target: pd.Series, smoothing: int=1,
                        log: bool=False) -> Dict[str, pd.DataFrame]:
  """Estimate the likelihood tables p(x|y), one table per feature column.

  For every feature the (feature value x target class) contingency table is
  built, `smoothing` is added to every cell, and each class column is divided
  by its own total.

  Args:
    data: one column per feature.
    target: the class label of every sample.
    smoothing: constant added to every cell of the contingency table.
    log: when True, print the likelihood table of every feature.

  Returns:
    A dict mapping feature name to its table, with feature values as rows and
    target classes as columns.
  """

  def likelihood_table(feature) -> pd.DataFrame:
    # Smoothed co-occurrence counts, normalised per class column
    counts = pd.crosstab(data[feature], target) + smoothing
    return counts / counts.sum()

  likelihoods = {feature: likelihood_table(feature) for feature in data.columns}

  if log:
    for feature, table in likelihoods.items():
      print(f'Likelihood for feature {feature}:', table, sep='\n')

  return likelihoods

# Likelihood tables on the full dataset with add-one smoothing, printed per feature
likelihoods = compute_likelihoods(X, y, smoothing=1, log=True)

Likelihood for feature sex:
class         0         1         2         3    4
sex                                               
0      0.439759  0.175439  0.210526  0.216216  0.2
1      0.560241  0.824561  0.789474  0.783784  0.8
Likelihood for feature cp:
class         0         1      2         3         4
cp                                                  
1      0.101190  0.101695  0.050  0.025641  0.117647
2      0.250000  0.118644  0.050  0.076923  0.058824
3      0.410714  0.169492  0.125  0.128205  0.117647
4      0.238095  0.610169  0.775  0.769231  0.705882
Likelihood for feature fbs:
class         0         1         2         3         4
fbs                                                    
0      0.855422  0.912281  0.736842  0.756757  0.866667
1      0.144578  0.087719  0.263158  0.243243  0.133333
Likelihood for feature restecg:
class           0         1         2         3       4
restecg                                                
0        0.574850  0.413793

In [None]:
# @title likelihoods without smoothing

# With smoothing=0 a value never seen for a class gets likelihood exactly 0
compute_likelihoods(X, y, smoothing=0)['cp']

class,0,1,2,3,4
cp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.097561,0.090909,0.027778,0.0,0.076923
2,0.25,0.109091,0.027778,0.057143,0.0
3,0.414634,0.163636,0.111111,0.114286,0.076923
4,0.237805,0.636364,0.833333,0.828571,0.846154


In [None]:
# @title likelihoods with strong smoothing

# Adding 20 to every count pulls the per-class distributions towards uniform
compute_likelihoods(X, y, smoothing=20)['cp']

class,0,1,2,3,4
cp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.147541,0.185185,0.181034,0.173913,0.225806
2,0.25,0.192593,0.181034,0.191304,0.215054
3,0.360656,0.214815,0.206897,0.208696,0.225806
4,0.241803,0.407407,0.431034,0.426087,0.333333


In [None]:
def compute_log_posterior(data: pd.DataFrame, priors: pd.Series,
                          likelihoods: Dict[str, pd.Series], log: bool=False):
  """Compute the unnormalised log posterior of every sample for every class.

  For each sample and class the result is
  log p(class) + sum over features of log p(feature value | class).

  Args:
    data: samples to score, one column per feature.
    priors: prior probability of each class, indexed by class label.
    likelihoods: per-feature tables, looked up as
      likelihoods[feature][class].loc[feature_value].
    log: when True, print the resulting table.

  Returns:
    A DataFrame with one row per sample (index taken from `data`) and one
    column per class.
  """
  class_labels = priors.index
  feature_names = data.columns

  # One row per sample, each initialised to the log prior of every class
  log_posteriors = np.log(pd.DataFrame([priors] * len(data),
                                      index=data.index, columns=class_labels))

  for row_label, sample in data.iterrows():
    # For each class, accumulate the log likelihood of this sample's feature
    # values (summed feature by feature, starting from 0)
    class_log_likelihoods = [
        sum(np.log(likelihoods[name][label].loc[sample[name]])
            for name in feature_names)
        for label in class_labels
    ]

    # Combine with the log prior already stored in this sample's row
    log_posteriors.loc[row_label] += np.array(class_log_likelihoods)

  if log:
    print(log_posteriors)

  return log_posteriors

# Score the first three training samples and print the resulting table
log_posteriors = compute_log_posterior(X_train.iloc[:3], prior, likelihoods, log=True)

class         0         1         2         3          4
31    -7.430259 -7.354469 -6.645779 -7.210483 -10.185936
45    -4.420533 -5.469011 -6.814276 -6.636856  -6.968331
88    -4.447094 -6.065882 -7.527878 -7.405917  -8.267614


In [None]:
class NaiveBayesClassifier:
  """Naive Bayes classifier for categorical features.

  Thin wrapper around compute_prior, compute_likelihoods and
  compute_log_posterior that exposes a fit / predict interface.
  """

  def __init__(self, smoothing: int=1):
    """Create an unfitted classifier.

    Args:
      smoothing: constant forwarded to compute_likelihoods when fitting.
    """
    # Per-feature likelihood tables, filled in by fit()
    self.likelihood: Dict[str, pd.DataFrame] = {}
    # Class priors; None until fit() has been called
    self.prior: Optional[pd.Series] = None
    self.smoothing: int = smoothing

  def fit(self, data: pd.DataFrame, target: pd.Series):
    """Fits the model by computing the prior and likelihood probabilities.

    Args:
      data: training samples, one column per feature.
      target: the class label of every training sample.

    Returns:
      The fitted classifier itself, so calls can be chained.
    """
    self.features = data.columns
    # sort_index() returns a new Series, so its result has to be kept;
    # otherwise the classes stay in value_counts() order.
    self.classes = target.value_counts().sort_index()

    self.prior = compute_prior(target)
    self.likelihood = compute_likelihoods(data, target, self.smoothing)

    return self

  def predict(self, data, legal=None):
    """Classify the new data.

    Args:
      data: samples to classify, with the same feature columns used in fit().
      legal: accepted for interface compatibility; currently unused.

    Returns:
      For every sample, the class label with the highest log posterior.

    Raises:
      AssertionError: if called before fit().
    """
    assert self.prior is not None, 'Must fit the classifier first'

    log_posteriors = compute_log_posterior(data, self.prior, self.likelihood)

    # The most probable class is the column holding each row's maximum
    predictions = log_posteriors.idxmax(axis='columns')

    return predictions

# Fit

In [None]:
from imblearn.over_sampling import RandomOverSampler
from prettytable import PrettyTable
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import CategoricalNB

In [None]:
# @title compare performance to sklearn implementation

# One table column per classifier, with the metric names in the first column
table = PrettyTable()
table.add_column("Metric", ["Accuracy", "Confusion Matrix"])

for nbc in (NaiveBayesClassifier(), CategoricalNB()):
  # Train on the training split and predict the held-out split
  nbc.fit(X_train, y_train)
  y_pred = nbc.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  # Rows are the true classes, columns the predicted classes
  conf_matrix = confusion_matrix(y_test, y_pred)

  table.add_column(type(nbc).__name__, [format(accuracy, ".2%"), str(conf_matrix)])

print(table)

+------------------+----------------------+--------------------+
|      Metric      | NaiveBayesClassifier |   CategoricalNB    |
+------------------+----------------------+--------------------+
|     Accuracy     |        54.10%        |       54.10%       |
| Confusion Matrix |  [[29  2  1  1  0]   | [[29  2  1  1  0]  |
|                  |   [ 4  2  3  2  0]   |  [ 4  2  3  2  0]  |
|                  |   [ 2  3  0  2  0]   |  [ 2  3  0  2  0]  |
|                  |   [ 2  0  3  2  0]   |  [ 2  0  3  2  0]  |
|                  |   [ 1  1  0  1  0]]  |  [ 1  1  0  1  0]] |
+------------------+----------------------+--------------------+


In [None]:
# @title uniform prior

# Compare CategoricalNB with fit_prior=True against fit_prior=False
table = PrettyTable()
table.add_column("Metric", ["Accuracy", "Confusion Matrix"])

for fp in (True, False):
  nbc = CategoricalNB(fit_prior=fp)
  nbc.fit(X_train, y_train)
  y_pred = nbc.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  # Rows are the true classes, columns the predicted classes
  conf_matrix = confusion_matrix(y_test, y_pred)

  table.add_column(f"fit_prior={fp}", [format(accuracy, ".2%"), str(conf_matrix)])

print(table)

+------------------+--------------------+--------------------+
|      Metric      |   fit_prior=True   |  fit_prior=False   |
+------------------+--------------------+--------------------+
|     Accuracy     |       54.10%       |       40.98%       |
| Confusion Matrix | [[29  2  1  1  0]  | [[21  2  2  2  6]  |
|                  |  [ 4  2  3  2  0]  |  [ 3  1  3  2  2]  |
|                  |  [ 2  3  0  2  0]  |  [ 1  0  1  3  2]  |
|                  |  [ 2  0  3  2  0]  |  [ 0  1  1  0  5]  |
|                  |  [ 1  1  0  1  0]] |  [ 0  0  0  1  2]] |
+------------------+--------------------+--------------------+


In [None]:
# @title oversample the minority classes
# Target size per class: classes with fewer than 65 training samples are raised
# to 65, larger classes keep their current size
sampling_strategy = {
    label: max(n_samples, 65)
    for label, n_samples in y_train.value_counts().items()
}

ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=0)

X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)

# Class sizes before and after resampling
y_train.value_counts(), y_train_resampled.value_counts()

(class
 0    131
 1     44
 2     29
 3     28
 4     10
 Name: count, dtype: int64,
 class
 0    131
 2     65
 4     65
 1     65
 3     65
 Name: count, dtype: int64)

In [None]:
# Compare CategoricalNB trained on the original and on the oversampled training set
table = PrettyTable()
table.add_column("Metric", ["Accuracy", "Confusion Matrix"])

for resample in (False, True):
  nbc = CategoricalNB()
  # Fit on whichever training set this iteration calls for
  if resample:
    nbc.fit(X_train_resampled, y_train_resampled)
  else:
    nbc.fit(X_train, y_train)
  y_pred = nbc.predict(X_test)

  accuracy = accuracy_score(y_test, y_pred)
  # Rows are the true classes, columns the predicted classes
  conf_matrix = confusion_matrix(y_test, y_pred)

  table.add_column(f"resample={resample}", [format(accuracy, ".2%"), str(conf_matrix)])

print(table)

+------------------+--------------------+--------------------+
|      Metric      |   resample=False   |   resample=True    |
+------------------+--------------------+--------------------+
|     Accuracy     |       54.10%       |       49.18%       |
| Confusion Matrix | [[29  2  1  1  0]  | [[25  0  1  0  7]  |
|                  |  [ 4  2  3  2  0]  |  [ 4  0  5  0  2]  |
|                  |  [ 2  3  0  2  0]  |  [ 2  1  2  0  2]  |
|                  |  [ 2  0  3  2  0]  |  [ 1  0  1  2  3]  |
|                  |  [ 1  1  0  1  0]] |  [ 0  1  1  0  1]] |
+------------------+--------------------+--------------------+


# Further work

- Implement a Naive Bayes classifier on the iris dataset from the K-means workbook. The iris features are continuous rather than categorical, so first read about the available variants and select a suitable classifier from the scikit-learn toolkit.
https://scikit-learn.org/stable/modules/naive_bayes.html

- Take a look at the Titanic data science competition:
https://www.kaggle.com/competitions/titanic/overview
Then review a highly rated approach, which should introduce you to dataset inspection and wrangling. For example:
https://www.kaggle.com/code/startupsci/titanic-data-science-solutions
