In [55]:
from sklearn import datasets
import math
import numpy

# Cross Validation

In [10]:
iris_data = datasets.load_iris().data

In [11]:
train_setosa = iris_data[0:40]
train_versicolor = iris_data[50:90]
train_virginica = iris_data[100:140]
train = [train_setosa, train_versicolor, train_virginica]

In [62]:
test_setosa = iris_data[40:50]
test_versicolor = iris_data[90:100]
test_virginica = iris_data[140:150]
test = numpy.concatenate((test_setosa, test_versicolor, test_virginica))

# Likelihood

In [19]:
means = []
sds = []
for category in range(0,3):
    categoryMeans = []
    categorySds = []
    for feature in range(0,4):
        meanTotal = 0
        for sample in train[category]:
            meanTotal += sample[feature]
        mean = meanTotal / len(train[category])
        categoryMeans.append(mean)
        sdTotal = 0
        for sample in train[category]:
            sdTotal += (sample[feature] - mean) ** 2
        sd = math.sqrt(sdTotal / (len(train[category]) -  1))
        categorySds.append(sd)
    means.append(categoryMeans)
    sds.append(categorySds)
print(means)
print(sds)

[[5.0375, 3.4525000000000006, 1.46, 0.23499999999999993], [6.01, 2.7800000000000007, 4.3175, 1.35], [6.6225, 2.9599999999999995, 5.6075, 1.9899999999999995]]
[[0.3621074355378454, 0.3609016486523718, 0.17216569240422339, 0.09753368911725072], [0.5232051124975651, 0.33297416547203007, 0.4511452662746793, 0.2075498086651083], [0.6840630875089724, 0.33649779974134925, 0.587645235730643, 0.2725002940483005]]


In [57]:
def likelihood(x, category, feature):
    mean = means[category][feature]
    sd = sds[category][feature]
    part1 = 1 / (math.sqrt(2 * math.pi) * sd)
    part2 = -1 * ( ((x[feature] - mean) ** 2) / (2 * (sd ** 2)) )
    return part1 * (math.e ** part2)

# Priori

In [87]:
length = 0
for i in range(0,len(train)):
    length += len(train[i])
prior_setosa = len(train_setosa) / length
prior_versicolor = len(train_versicolor) / length
prior_virginica = len(train_virginica) / length
prior = [prior_setosa, prior_versicolor, prior_virginica]

[0.3333333333333333, 0.3333333333333333, 0.3333333333333333]


# Posterior

In [59]:
def posteriors(x):
    topLines = []
    topTotal = 0
    for category in range(0,3):
        lh = 1
        for feature in range(0,4):
            lh = lh * likelihood(x, category, feature)
        topLine = lh * prior[category]
        topLines.append(topLine)
        topTotal += topLine
    probs = []
    for category in range(0,3):
        probs.append(topLines[category] / topTotal)
    return probs

In [66]:
def naives(x):
    probs = posteriors(x)
    highestCategory = -1
    highestValue = -1
    for category in range(0,3):
        if probs[category] > highestValue:
            highestCategory = category
            highestValue = probs[category]
    return (highestCategory, highestValue)

In [71]:
results = []
for i in range(0,len(test)):
    result = naives(test[i])
    results.append(result)
    print(f"Sample {i+1} is predicted to be in category {result[0]} with probability {result[1]}")

Sample 1 is predicted to be in category 0 with probability 1.0
Sample 2 is predicted to be in category 0 with probability 0.9999999999999997
Sample 3 is predicted to be in category 0 with probability 1.0
Sample 4 is predicted to be in category 0 with probability 0.999999999939639
Sample 5 is predicted to be in category 0 with probability 0.999999999999211
Sample 6 is predicted to be in category 0 with probability 1.0
Sample 7 is predicted to be in category 0 with probability 1.0
Sample 8 is predicted to be in category 0 with probability 1.0
Sample 9 is predicted to be in category 0 with probability 1.0
Sample 10 is predicted to be in category 0 with probability 1.0
Sample 11 is predicted to be in category 1 with probability 0.9997108265528957
Sample 12 is predicted to be in category 1 with probability 0.9886761171237923
Sample 13 is predicted to be in category 1 with probability 0.9999102746460449
Sample 14 is predicted to be in category 1 with probability 0.9999977236503249
Sample 15 

# Prediction Accuracy

In [78]:
length = len(results)
for category in range(0,3):
    tp = 0
    tn = 0
    fp = 0
    fn = 0
    p = 0
    n = 0
    top = (length / 3) * (category + 1)
    bottom = (length / 3) * category
    for i in range(0,length):    
        if results[i][0] == category:
            p += 1
            if (i < top and i >= bottom):
                tp += 1
            else:
                fp += 1
        else:
            n += 1
            if (i < top and i >= bottom):
                fn += 1
            else:
                tn += 1
    accuracy = (tp + tn) / (p + n)
    print(f"Accuracy for category {category} is {accuracy}")

Accuracy for category 0 is 1.0
Accuracy for category 1 is 1.0
Accuracy for category 2 is 1.0


# Challenging Task

In [93]:
sample = []
category = numpy.random.choice(3, p=prior)
for feature in range(0,4):
    sample.append(numpy.random.normal(means[category][feature], sds[category][feature]))
print(f"Category {category} random sample is {sample}")
result = naives(sample)
print(f"Sample is predicted to be in category {result[0]} with probability {result[1]}")

Category 0 random sample is [5.059115101862955, 3.4151604914435914, 1.6196477577705481, 0.12102076876594636]
Sample is predicted to be in category 0 with probability 1.0
