In [0]:
# Standard includes
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
# Useful module for dealing with the Gaussian density
from scipy.stats import norm, multivariate_normal  # in case you use the built-in library
from sklearn import datasets

In [0]:
# Load data set.
iris = datasets.load_iris()
X = iris.data
Y = iris.target
# The columns of iris.data are ordered sepal length, sepal width, petal length,
# petal width (see iris.feature_names), so the names are listed in that order.
featurenames = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Shuffle the instances with a fixed seed, then split them into a training set
# (trainx, trainy) of size n_train and a test set (testx, testy) holding the rest.
# For the 150 iris instances this is a 105 / 45 split.
n_train = 105
np.random.seed(0)
perm = np.random.permutation(X.shape[0])
trainx = X[perm[:n_train], :]
trainy = Y[perm[:n_train]]
testx = X[perm[n_train:], :]
testy = Y[perm[n_train:]]

In [0]:
def fit_generative_model(x, y, k=3):
    """Fit one full-covariance Gaussian per class, plus class priors.

    Parameters
    ----------
    x : array-like, shape (n, d)
        Training points, one row per instance.
    y : array-like, shape (n,)
        Integer class labels taking values 0, 1, ..., k-1.
    k : int, optional
        Number of classes (default 3).

    Returns
    -------
    mu : ndarray, shape (k, d)
        Per-class feature means.
    sigma : ndarray, shape (k, d, d)
        Per-class maximum-likelihood (divide-by-n) covariance matrices.
    pi : ndarray, shape (k,)
        Fraction of training instances belonging to each class.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y)
    d = x.shape[1]  # number of features
    mu = np.zeros((k, d))
    sigma = np.zeros((k, d, d))
    pi = np.zeros(k)
    for label in range(k):
        indices = (y == label)
        sample = x[indices, :]
        mu[label] = sample.mean(axis=0)
        # Centre the data before forming second moments: the algebraically
        # equivalent E[xy] - E[x]E[y] loses precision through cancellation
        # when the means are large compared with the spread.
        centered = sample - mu[label]
        sigma[label] = np.dot(centered.T, centered) / sample.shape[0]
        pi[label] = np.sum(indices) / len(y)
    return mu, sigma, pi

In [0]:
# Fit a Gaussian generative model to the training data: mu holds the per-class
# means, sigma the per-class covariance matrices and pi the class frequencies.
mu, sigma, pi = fit_generative_model(trainx,trainy)

In [0]:
# Now test the performance of a predictor based on a subset of features
def test_model(mu, sigma, pi, features, tx, ty):
    nt = len(ty)
    k = 3
    d = len(features)

    features_bag = [0,1,2,3]
    features_bag = np.delete(features_bag, features, axis=0)
    mean = np.delete(mu, features_bag, axis=1)
    tx = np.delete(tx, features_bag, axis=1)
    cov = np.zeros((k,d,d))

    for label in range(0,k):
        for i in range(d):
            for j in range(d):
                cov[label,i,j] = sigma[label,i,j]

    score = np.zeros((nt,k))
    predictions = np.zeros(nt)
    for i in range(0,nt):
        for label in range(0,k):
            det = np.linalg.det(cov[label,:,])
            inv = np.linalg.inv(cov[label,:,])
            l = np.subtract(tx[i],mean[label]).transpose()
            r = np.subtract(tx[i],mean[label])
            pdf = np.exp((-1/2) * np.dot(np.dot(l,inv),r)) / np.sqrt(np.power(2 * np.pi,4) * det)  
            score[i,label] = np.log(pdf) * pi[label] 
    
    for j in range(nt):
        predictions[j] = np.argmax(score[j,])
        
    # Finally, tally up score
    errors = np.sum(predictions != ty)
    print (str(errors) + '/' + str(nt))

In [229]:
# Evaluate on the test split using only feature column 0; prints "<errors>/<test size>".
test_model(mu, sigma, pi, [0], testx, testy)

12/45


In [230]:
# Evaluate on the test split using feature columns 0 and 1; prints "<errors>/<test size>".
test_model(mu, sigma, pi, [0, 1], testx, testy)

9/45


In [231]:
# Evaluate on the test split using all four feature columns; prints "<errors>/<test size>".
test_model(mu, sigma, pi, [0, 1, 2, 3], testx, testy)

1/45
