# ML Exercise 2-1
## Question 1
By Gholamreza Dar

https://gholamrezadar.ir/

Dec 2021


Questions available at : [Github Link](https://github.com/Gholamrezadar/machine-learning-exercises/blob/main/ML-HW02.pdf)


## Loading the data

In [24]:
!wget -nc https://raw.githubusercontent.com/Gholamrezadar/machine-learning-exercises/main/heart.csv

File ‘heart.csv’ already there; not retrieving.



In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import multivariate_normal
from functools import lru_cache
from tqdm.notebook import tqdm
import seaborn as sns; sns.set()

In [26]:
df = pd.read_csv("/content/heart.csv")

In [27]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0




**Discrete features:**

sex, cp, fbs, restecg, exang, slope, ca, thal

**Continuous features:**

age, trestbps, chol, thalach, oldpeak

**Label:**

target

### Preparing the data


1.   Shuffle
2.   Seperate X,y
3.   Train, Test Split




#### Shuffle dataframe

In [28]:
# Shuffle dataframe
df = df.sample(frac=1.0).reset_index(drop=True)

#### Seperate X,y

In [29]:
# Seperate X,y
X = df.drop(columns=["target"])
y = df["target"]

#### Split to train and test

In [30]:
# Split to train and test
split = 0.8

X_train = X.iloc[ : int(len(X)*split),:].reset_index(drop=True)
X_test = X.iloc[int(len(X)*split) : ,:].reset_index(drop=True)

y_train = y.iloc[ : int(len(X)*split)].reset_index(drop=True)
y_test = y.iloc[int(len(X)*split) : ].reset_index(drop=True)

print(f"Train X size = {len(X_train)}")
print(f"Train y size = {len(y_train)}")
print(f"Test X size = {len(X_test)}")
print(f"Test y size = {len(y_test)}")

Train X size = 820
Train y size = 820
Test X size = 205
Test y size = 205


## Helper functions


In [31]:
def generate_frequency_tables(X_train, y_train):
  '''
  Generates frequency_tables for pos and neg labels seperatley.
  Useful for calculating probabilities later on.
  example : frequency_table_pos["sex"][1] -> how many sex=1 entries where found 
  with positive label in the data
  returns frequency_table_pos, frequency_table_neg
  '''
  discrete_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]

  X_train_pos = X_train.loc[y_train==1]
  X_train_neg = X_train.loc[y_train==0]

  # Generate a frequency table for positive instances
  frequency_table_pos = {}
  for col in discrete_cols:
    frequency_table_pos[col] = X_train_pos[col].value_counts().to_dict()

  # Generate a frequency table for negative instances
  frequency_table_neg = {}
  for col in discrete_cols:
    frequency_table_neg[col] = X_train_neg[col].value_counts().to_dict()
  
  return frequency_table_pos, frequency_table_neg

In [32]:
def p_discrete(X_train, y_train, variable_name, value, heart_disease):
  '''
  returns P(variable_name=value | heart_disease) where variable_name 
  is a discrete variable
  '''
  # frequency_table_pos, frequency_table_neg = generate_frequency_tables(X_train, y_train)

  if heart_disease:
    return (frequency_table_pos[variable_name][value]+1) / (P_POS+1)
  else:
    return (frequency_table_neg[variable_name][value]+1) / (P_NEG+1)

In [33]:
def p_continuous(X_train, y_train, variable_name, value, heart_disease):
  '''
  returns P(variable_name=value | heart_disease) where variable_name 
  is a normally distributed continuous variable
  '''
  if heart_disease:
    X_train_pos = X_train.loc[y_train==1]
    data = X_train_pos[variable_name].to_numpy().reshape(-1,1)
    mean = data.mean()
    std = data.std()
    p = norm.pdf(value, mean, std)
    return p
  else:
    X_train_neg = X_train.loc[y_train==0]
    data = X_train_neg[variable_name].to_numpy().reshape(-1,1)
    mean = data.mean()
    std = data.std()
    p = norm.pdf(value, mean, std)
    return p

In [34]:
def p_continuous_multivariable(X_train, y_train, var_val_pair_dict, heart_disease):
  '''
  returns P() where var_val_pair_dict is variable_value pairs 
  and variables follow a single multivaraite normal distribution
  '''
  if heart_disease:
    # select positive part of the dataset
    X_train_pos = X_train.loc[y_train==1]
    # This person's continuous features' values
    continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
    continuous_vals = [i[1] for i in var_val_pair_dict.items()]
    # Multivariate Normal Density
    cov = X_train_pos[continuous_cols].cov()
    mean = X_train_pos[continuous_cols].mean()
    p = multivariate_normal.pdf(continuous_vals, mean=mean, cov=cov)
    return p
  else:
    # select negativr part of the dataset
    X_train_neg = X_train.loc[y_train==1]
    # This person's continuous features' values
    continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
    continuous_vals = [i[1] for i in var_val_pair_dict.items()]
    # Multivariate Normal Density
    cov = X_train_neg[continuous_cols].cov()
    mean = X_train_neg[continuous_cols].mean()
    p = multivariate_normal.pdf(continuous_vals, mean=mean, cov=cov)
    return p

In [35]:
def p_pos(X_train):
  '''
  Calculates P(Heart_disease)
  '''
  X_train_pos = X_train.loc[y_train==1]
  p_pos = len(X_train_pos)/len(X_train)
  return p_pos

def p_neg(X_train):
  '''
  Calculates P(!Heart_disease)
  '''
  X_train_neg = X_train.loc[y_train==0]
  p_neg = len(X_train_neg)/len(X_train)
  return p_neg

In [36]:
a = {"a":1, "b":2, "c":3}
b = {}
for i in a:
  if i!="b":
    b[i]=a[i]
b

{'a': 1, 'c': 3}

In [37]:
def calculate_posterior_bayes(X_train, y_train, heart_disease, feature_dict):
  '''
  Calculates the posterior probability 
  P(heart_disease | X)
  where X is a person with this function's parameters as their features.
  using bayes

  Log version, to avoid float underflow
  '''
  discrete_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
  continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]

  prior = P_POS if heart_disease else P_NEG
  posterior = 0

  # Multiply by P(xi | heart_disease) for discrete variables
  for col in discrete_cols:
    posterior += np.log(
        p_discrete(X_train,
                   y_train,
                   col,
                   feature_dict[col],
                   heart_disease=heart_disease)
        )
  
  # we want something like {'age':20, 'trestbps':150, ...} 
  # but only for continuous variables
  # basically filtering 'feature_dict' to only contain continuous
  # variables
  var_val_pair_dict = {}
  for col in feature_dict:
    if col in continuous_cols:
      var_val_pair_dict[col] = feature_dict[col]


  # Multiply by P(x | heart_disease) for continuous variables
  posterior += np.log(
      p_continuous_multivariable(X_train,
                                  y_train,
                                  var_val_pair_dict,
                                  heart_disease=heart_disease)
      )
  
  posterior += np.log(prior)

  return posterior

In [38]:
def calculate_posterior(X_train, y_train, heart_disease, feature_dict, feature_to_delete = None):
  '''
  Calculates the posterior probability 
  P(heart_disease | X)
  where X is a person with this function's parameters as their features.
  using Naive bayes
  
  Log version, to avoid float underflow
  '''
  discrete_cols = ["sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal"]
  continuous_cols = ["age", "trestbps", "chol", "thalach", "oldpeak"]
  if feature_to_delete is not None:
    continuous_cols.remove(feature_to_delete)

  prior = P_POS if heart_disease else P_NEG
  posterior = 0

  # Multiply by P(xi | heart_disease) for discrete variables
  for col in discrete_cols:
    posterior += np.log(p_discrete(X_train,
                                   y_train,
                                   col,
                                   feature_dict[col],
                                   heart_disease=heart_disease))
  
  # Multiply by P(xi | heart_disease) for continuous variables
  for col in continuous_cols:
    posterior += np.log(p_continuous(X_train,
                                     y_train,
                                     col,
                                     feature_dict[col],
                                     heart_disease=heart_disease))
  
  posterior += np.log(prior)

  return posterior

In [39]:
def predict_class_bayes(X_train, y_train, X):
  '''
  select between heart_disease and no heart_disease 
  using Bayes classifier
  '''
  p_pos_x = calculate_posterior_bayes(X_train, y_train, True, X.to_dict())
  p_neg_x = calculate_posterior_bayes(X_train, y_train, False, X.to_dict())

  if p_pos_x>p_neg_x:
    return 1
  else:
    return 0

In [40]:
def predict_class(X_train, y_train, X, feature_to_delete = None):
  '''
  select between heart_disease and no heart_disease 
  using Naive bayes classifier
  '''
  p_pos_x = calculate_posterior(X_train, y_train, True, X.to_dict(), feature_to_delete)
  p_neg_x = calculate_posterior(X_train, y_train, False, X.to_dict(), feature_to_delete)

  if p_pos_x>p_neg_x:
    return 1
  else:
    return 0

In [41]:
# predict_class(X_train, y_train, X_train.iloc[0])

## Part 1) Bayes Classifier

Only discrete Features are independent.

Continuous features follow a multivariate(5 variable) Normal distribution

In [42]:
# update : calculate once for performance reasons
P_NEG = p_neg(X_train)
P_POS = p_pos(X_train)
frequency_table_pos, frequency_table_neg = generate_frequency_tables(X_train, y_train)

In [43]:
# predict outcome for every row in test set using bayes classifier
accuracy = 0
for i in tqdm(range(len(X_test))):
  person = X_test.iloc[i]
  pred = predict_class_bayes(X_train, y_train, person)
  label = y_test.iloc[i]

  if pred==label:
    accuracy += 1

accuracy /= len(X_test)
print(f" Accuracy = {accuracy}")

  0%|          | 0/205 [00:00<?, ?it/s]

 Accuracy = 0.8536585365853658


## Part 2) Naïve Bayes classifier

In [44]:
# predict outcome for every row in test set using naive bayes classifier
accuracy = 0
for i in tqdm(range(len(X_test))):
  person = X_test.iloc[i]
  pred = predict_class(X_train, y_train, person)
  label = y_test.iloc[i]

  if pred==label:
    accuracy += 1


accuracy /= len(X_test)
print(f" Accuracy = {accuracy}")

  0%|          | 0/205 [00:00<?, ?it/s]

 Accuracy = 0.8682926829268293


## Part 3) Remove chol or oldpeak

### Remove chol

In [45]:
# predict outcome for every row in test set using naive bayes classifier
accuracy = 0
for i in tqdm(range(len(X_test))):
  person = X_test.iloc[i]
  pred = predict_class(X_train, y_train, person, feature_to_delete="chol")
  label = y_test.iloc[i]

  if pred==label:
    accuracy += 1


accuracy /= len(X_test)
print(f" Accuracy = {accuracy}")

  0%|          | 0/205 [00:00<?, ?it/s]

 Accuracy = 0.8634146341463415


### Remove oldpeak

In [46]:
# predict outcome for every row in test set using naive bayes classifier
accuracy = 0
for i in tqdm(range(len(X_test))):
  person = X_test.iloc[i]
  pred = predict_class(X_train, y_train, person, feature_to_delete="oldpeak")
  label = y_test.iloc[i]

  if pred==label:
    accuracy += 1


accuracy /= len(X_test)
print(f" Accuracy = {accuracy}")

  0%|          | 0/205 [00:00<?, ?it/s]

 Accuracy = 0.8780487804878049
