## Imports

In [83]:
import numpy as np
import pandas as pd

from scipy.stats import multivariate_normal
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

## Data

In [84]:
data = pd.read_csv("IRIS.csv")

In [85]:
# we encode the y labels to discrete values.

def encode_num(x):
    value = None 
    if x == "Iris-setosa":
        value = 0
    if x == "Iris-versicolor":
        value = 1
    if x == "Iris-virginica":
        value = 2

    return value


data['species'] = data['species'].apply(encode_num)

In [88]:
# shuffling the dataset
data = data.sample(frac=1).reset_index(drop=True)

# training set
train_size = int(0.8*len(data))
train_data = data.head(train_size)

# test set
test_size = int(0.2*len(data))
test_data = data.tail(test_size)

In [89]:
x_train = np.array(train_data.drop("species",axis=1))
y_train = train_data["species"]

### Model  

First, we calulate $P(y)$ ,the prior probability

In [102]:
priors = y_train.value_counts()/len(y_train)

Then, we need to find the parameter values needed to calculate the pdf. The parameters are the:

- mean
- covaraince
$$x|y = 0 \thicksim N(\mu_0,\Sigma)$$  

$$x|y = 1 \thicksim N(\mu_1,\Sigma)$$

In [103]:
# we calculate the mean values
mean = data.groupby('species').mean()

# we calculate the variance values
covariance = data.groupby('species').cov()


In [104]:
def predict(x_test, priors, mean , cov):
    
    labels = mean.shape[0]
    results = np.zeros((x_test.shape[0], labels))
    for label in range(labels):
        pdf = multivariate_normal(mean = mean.loc[label], cov = cov.loc[label])
        
        for i, data in enumerate(x_test):
            results[i, label] = np.log(priors[label]) + pdf.logpdf(data)
            
    predictions = np.argmax(results, axis=1)
    return predictions
                

## Evaluation

In [105]:
x_test = test_data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y_test = test_data['species']

In [106]:
x_test = np.array(x_test)

In [107]:
preds = predict(x_test,priors,mean,covariance)

In [108]:
print(accuracy_score(y_test,preds) * 100)

96.66666666666667
