In [1]:
from sklearn import datasets
import math
import numpy
import statistics

# Hold-out Validation (Train/Test Split)

In [2]:
#Splits the data into train and test sets: within each consecutive run of 50 rows,
#the first 40 rows go to train and the remaining 10 go to test
id = datasets.load_iris().data
train = numpy.concatenate([id[start:start + 40] for start in (0, 50, 100)])
test = numpy.concatenate([id[start + 40:start + 50] for start in (0, 50, 100)])

# Likelihood

In [3]:
#Builds, for every category, the mean and sample standard deviation of each feature
means = []
sds = []
for category in range(3):
    #Each category occupies its own run of 40 consecutive rows in train
    rows = train[category * 40:category * 40 + 40]
    #One list of values per feature (4 features per sample)
    columns = [[row[f] for row in rows] for f in range(4)]
    means.append([numpy.mean(column) for column in columns])
    sds.append([statistics.stdev(column) for column in columns])
print(means)
print(sds)

[[5.0375, 3.4525000000000006, 1.46, 0.23500000000000001], [6.01, 2.78, 4.3175, 1.35], [6.6225, 2.96, 5.6075, 1.9899999999999998]]
[[0.36210743553784547, 0.3609016486523718, 0.17216569240422333, 0.09753368911725073], [0.5232051124975651, 0.3329741654720301, 0.4511452662746793, 0.20754980866510833], [0.6840630875089722, 0.3364977997413493, 0.5876452357306431, 0.2725002940483005]]


In [4]:
#Gaussian density of a feature value under a given category
def likelihood(x, c, f):
    """Return the normal density of x for feature f of category c.

    The distribution parameters are read from the module-level tables
    means[c][f] and sds[c][f].

    x: the feature value to evaluate
    c: index of the category
    f: index of the feature
    """
    mu, sigma = means[c][f], sds[c][f]
    #Exponent of the Gaussian: -(x - mu)^2 / (2 * sigma^2)
    exponent = (-((x - mu) ** 2)) / (2 * sigma ** 2)
    #Normalising constant: 1 / (sqrt(2 * pi) * sigma)
    normaliser = 1 / (math.sqrt(2 * math.pi) * sigma)
    return normaliser * math.exp(exponent)

# Prior

In [5]:
#Prior probability of each category: every category supplies 40 of the 120 training samples
priori = [40/120] * 3

# Posterior

In [6]:
#Log of the Gaussian density of a feature value under a given category
def _logLikelihood(x, c, f):
    """Return log N(x; means[c][f], sds[c][f]) without ever forming the density.

    Working with the logarithm directly keeps the value finite even when the
    density itself would underflow to 0.0 for values far from the mean.
    """
    mean = means[c][f]
    sd = sds[c][f]
    return -math.log(math.sqrt(2*math.pi) * sd) - ((x-mean)**2) / (2 * sd**2)

#Calculates the category the sample is most likely part of
def posterior(s):
    """Return the index of the category with the highest posterior probability.

    For every category the log prior and the log likelihoods of all features
    are summed, and the results are normalised into probabilities that add up
    to 1. The sums are done in log space because multiplying the raw densities
    underflows to 0 for samples far from every category mean, which made the
    normalisation divide by zero.

    s: indexable sequence holding one value per feature
    Returns the winning category index (the lowest index wins on a tie).
    Prints the sample, the predicted category and its probability.
    """

    #Gets the log of (prior * likelihood) for each category
    logProbs = []
    for c in range(len(priori)):
        #A category with a zero prior can never be selected
        logProb = math.log(priori[c]) if priori[c] > 0 else -math.inf
        for f in range(len(means[c])):
            logProb += _logLikelihood(s[f], c, f)
        logProbs.append(logProb)

    #Normalises with the log-sum-exp trick: shifting by the largest value makes
    #the biggest term exp(0) = 1, so the normaliser below is never zero
    largest = max(logProbs)
    scaled = [math.exp(lp - largest) for lp in logProbs]
    sampleProb = sum(scaled)
    probs = [p / sampleProb for p in scaled]

    #Gets the category with the highest probability
    i = max(range(len(probs)), key=probs.__getitem__)
    print(f"{s} is predicted to be in category {i} with the probability of {probs[i]}")
    return i

In [7]:
#Runs every test sample through posterior and records the predicted category
testPredictions = [posterior(sample) for sample in test]

[5.  3.5 1.3 0.3] is predicted to be in category 0 with the probability of 1.0
[4.5 2.3 1.3 0.3] is predicted to be in category 0 with the probability of 0.9999999999999999
[4.4 3.2 1.3 0.2] is predicted to be in category 0 with the probability of 1.0
[5.  3.5 1.6 0.6] is predicted to be in category 0 with the probability of 0.9999999999652412
[5.1 3.8 1.9 0.4] is predicted to be in category 0 with the probability of 0.9999999999995934
[4.8 3.  1.4 0.3] is predicted to be in category 0 with the probability of 1.0
[5.1 3.8 1.6 0.2] is predicted to be in category 0 with the probability of 1.0
[4.6 3.2 1.4 0.2] is predicted to be in category 0 with the probability of 1.0
[5.3 3.7 1.5 0.2] is predicted to be in category 0 with the probability of 1.0
[5.  3.3 1.4 0.2] is predicted to be in category 0 with the probability of 1.0
[5.5 2.6 4.4 1.2] is predicted to be in category 1 with the probability of 0.999760380828135
[6.1 3.  4.6 1.4] is predicted to be in category 1 with the probability 

# Evaluation

In [9]:
#Scores the test predictions one category at a time (one-vs-rest)
#The true category of a test sample is taken from its position: each third of
#the test set is treated as belonging to one category
third = len(test) / 3
for c in range(3):
    lower = c * third
    upper = (c + 1) * third
    TP = TN = FP = FN = 0
    for s in range(len(test)):
        isPredicted = testPredictions[s] == c
        isActual = s < upper and s >= lower
        if isPredicted and isActual:
            TP += 1
        elif isPredicted:
            FP += 1
        elif isActual:
            FN += 1
        else:
            TN += 1
    #Accuracy counts both correct hits and correct rejections for this category
    accuracy = (TP + TN) / (len(test))
    print(f"Category {c} has an accuracy of {accuracy}")

Category 0 has an accuracy of 1.0
Category 1 has an accuracy of 1.0
Category 2 has an accuracy of 1.0


# Challenging Task

In [10]:
#Draws 10 synthetic samples: the category is picked according to priori and each
#feature is drawn from that category's normal distribution
for trial in range(10):

    #Generates sample
    c = numpy.random.choice(3, p=priori)
    sample = [numpy.random.normal(means[c][f], sds[c][f]) for f in range(4)]
    print(f"Category {c} random sample is {sample}")

    #Classifies the generated sample (posterior prints its own prediction)
    p = posterior(sample)
    print()

Category 2 random sample is [5.857266395649137, 3.2073474950625926, 5.045753769220008, 2.176333949686904]
[5.857266395649137, 3.2073474950625926, 5.045753769220008, 2.176333949686904] is predicted to be in category 2 with the probability of 0.9996335061340504

Category 2 random sample is [6.397426732528828, 2.9074080170598022, 5.323566666714004, 1.5534268213574236]
[6.397426732528828, 2.9074080170598022, 5.323566666714004, 1.5534268213574236] is predicted to be in category 2 with the probability of 0.7464845489413212

Category 0 random sample is [4.323971958323481, 3.851102375605368, 1.2627441957137209, 0.25069447276141604]
[4.323971958323481, 3.851102375605368, 1.2627441957137209, 0.25069447276141604] is predicted to be in category 0 with the probability of 1.0

Category 2 random sample is [6.4298681780003015, 2.848725820667175, 6.007333045058802, 2.0707238361929226]
[6.4298681780003015, 2.848725820667175, 6.007333045058802, 2.0707238361929226] is predicted to be in category 2 with th