In [None]:
'''
Functions and imports for Bayes Assignment #1
'''
import math
import random
import matplotlib.pyplot as plt
import numpy as np
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import pandas as pd

#Return the Gaussian probability density function for x given mean = m and sigma = s
def gauss_val(x, m, s):
    """Evaluate the normal (Gaussian) probability density at a point.

    Args:
        x: Point at which the density is evaluated.
        m: Mean of the distribution.
        s: Standard deviation of the distribution (must be non-zero,
            since it is used as a divisor).

    Returns:
        The density exp(-((x - m) / s)^2 / 2) / (s * sqrt(2 * pi)).
    """
    z_score = (x - m) / s
    normaliser = s * math.sqrt(math.pi * 2)
    exponent = -math.pow(z_score, 2) / 2.0
    return math.exp(exponent) / normaliser

#Generate Gaussian values for each class
#  Add uniform noise
def gen_gauss_list(stats, cl_names, num, noise_fact):
    """Generate labelled samples from a two-class Gaussian mixture with noise.

    For each sample a class is chosen at random, a value is drawn from that
    class's Gaussian, and uniform noise scaled by the class's sigma is added.

    Args:
        stats: Indexable sequence laid out as
            [mean_0, sigma_0, prob_of_class_0, mean_1, sigma_1].
        cl_names: Indexable pair of labels; cl_names[0] is used for class 0
            and cl_names[1] for class 1.
        num: Number of samples to generate.
        noise_fact: Noise half-width as a multiple of the class sigma; the
            noise is uniform on [-noise_fact * sigma, noise_fact * sigma].

    Returns:
        A list of ``num`` entries of the form ``[label, value]``. The list is
        empty when ``num`` is 0.
    """
    g_vals = []
    for _ in range(num):
        # Pick the class first: class 0 with probability stats[2], else class 1.
        if random.random() < stats[2]:
            label, mean, sigma = cl_names[0], stats[0], stats[1]
        else:
            label, mean, sigma = cl_names[1], stats[3], stats[4]
        new_val = random.gauss(mean, sigma)
        new_val += random.uniform(-noise_fact * sigma, noise_fact * sigma)
        g_vals.append([label, new_val])
    # The return must sit outside the loop; returning from inside the loop
    # body would hand back only the first sample.
    return g_vals

# Read the labelled samples. The CSV is read without a header row, so the
# column names are supplied here: class label first, then the single predictor.
data = pd.read_csv("BayesAssign1_01.csv", header=None, names=['class', 'predictor_1'])

# Shuffle every row; the fixed random_state makes the shuffle reproducible.
data = data.sample(frac=1, random_state=1)

# Split by position: the first 300 shuffled rows train the model and the
# remainder are held out for testing (a 3:7 split if the file has 1000 rows).
# Explicit copies are taken so that columns added to testingData later are
# written to an independent frame rather than to a slice of `data`.
print(len(data))
trainingData = data.iloc[:300].copy()
testingData = data.iloc[300:].copy()

# Class priors: the fraction of training rows carrying each label.
classCounts = trainingData["class"].value_counts()
probNeg = classCounts["NEG"] / len(trainingData)
probPos = classCounts["POS"] / len(trainingData)

# Predictor values of the training rows, separated by class.
negValues = trainingData.loc[trainingData["class"] == "NEG", "predictor_1"]
posValues = trainingData.loc[trainingData["class"] == "POS", "predictor_1"]

# Per-class mean and sample standard deviation (n - 1 in the denominator).
meanNeg = sum(negValues) / len(negValues)
sdNeg = (sum((negValues - meanNeg) ** 2) / (len(negValues) - 1)) ** 0.5
meanPos = sum(posValues) / len(posValues)
sdPos = (sum((posValues - meanPos) ** 2) / (len(posValues) - 1)) ** 0.5

# Classify each held-out sample with the Gaussian Bayes rule and count the hits.
numCorrect = 0
for idx, sample in testingData.iterrows():
    predictor = sample['predictor_1']
    # Score of each class = class prior * class-conditional Gaussian density.
    indicatorNeg = probNeg * gauss_val(predictor, meanNeg, sdNeg)
    indicatorPos = probPos * gauss_val(predictor, meanPos, sdPos)
    # NEG must score strictly higher to win; a tie is labelled POS.
    myPred = "NEG" if indicatorNeg > indicatorPos else "POS"
    # Store the prediction alongside the true label for the later metrics.
    testingData.at[idx, 'myPrediction'] = myPred
    if myPred == sample['class']:
        numCorrect += 1
print(numCorrect)

# Accuracy = share of test samples whose prediction matches the true class.
accuracyBayes = numCorrect / len(testingData)

# Confusion-matrix counts, with POS treated as the positive class.
# The four boolean masks are computed once and combined for each cell.
actualPos = testingData["class"] == "POS"
actualNeg = testingData["class"] == "NEG"
predictedPos = testingData["myPrediction"] == "POS"
predictedNeg = testingData["myPrediction"] == "NEG"
TP = len(testingData.loc[actualPos & predictedPos])
TN = len(testingData.loc[actualNeg & predictedNeg])
FP = len(testingData.loc[actualNeg & predictedPos])
FN = len(testingData.loc[actualPos & predictedNeg])
print("The true positive, true negative, false positive, false negative values are: %d, %d, %d, %d respectively" % (TP, TN, FP, FN))

# Summary metrics derived from TP, TN, FP and FN.
# Prevalence is the share of samples that are actually positive, so missed
# positives (FN) count as well as detected ones (TP).
# NOTE: each ratio raises ZeroDivisionError if its denominator is zero
# (for example, no sample predicted POS makes TP + FP zero).
prevalence = (TP + FN) / len(testingData)
accuracy = (TP + TN) / len(testingData)
sensitivity = TP / (TP + FN)
specificity = TN / (FP + TN)
precision = TP / (TP + FP)
print("The prevalence, accuracy, sensitivity, specificity, and precision are: %f, %f, %f, %f, %f respectively" % (prevalence, accuracy, sensitivity,specificity, precision))

1000
667
The true positive, true negative, false positive, false negative values are: 135, 532, 11, 22 respectively
The prevalence, accuracy, sensitivity, specificity, and precision are: 0.192857, 0.952857, 0.859873, 0.979742, 0.924658 respectively
