In [2]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as dt

In [3]:
def multi_gaussian(dim, num_of_samples):
    """Draw samples from a randomly parameterised Gaussian in ``dim`` dimensions.

    A random symmetric positive-definite covariance matrix is generated first,
    then a random mean vector (each component uniform on [0, 1)), and finally
    the samples themselves. All three steps draw from NumPy's global random
    state, in that order.

    Parameters
    ----------
    dim : int
        Dimensionality of the distribution.
    num_of_samples : int
        Number of points to draw.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_of_samples, dim)``, one sample per row.
    """
    covariance = dt.make_spd_matrix(dim)  # random symmetric, positive-definite matrix
    centre = np.random.rand(dim)
    return np.random.multivariate_normal(centre, covariance, size=num_of_samples)
# multi_gaussian(5,10)

In [49]:
def discriminant_function(x, mean_vec, cov_matrix, dim, prior_prob):
    """Evaluate the Gaussian discriminant g(x) of one class at the point ``x``.

    For ``dim > 1`` (multivariate)::

        g(x) = -1/2 (x - mu)^T Sigma^{-1} (x - mu)
               - dim/2 * ln(2 pi) - 1/2 * ln|Sigma| + ln P(w)

    otherwise (univariate)::

        g(x) = -(x - mu)^2 / (2 sigma^2) - 1/2 * ln(2 pi sigma^2) + ln P(w)

    Parameters
    ----------
    x : array_like
        Sample to score. A 1-D vector of length ``dim`` or a single-row
        ``(1, dim)`` array; for the univariate case a scalar is accepted too.
    mean_vec : array_like
        Class mean (scalar in the univariate case).
    cov_matrix : array_like or float
        ``(dim, dim)`` class covariance when ``dim > 1``; the scalar class
        variance otherwise.
    dim : int
        Number of features; selects the multivariate or univariate formula.
    prior_prob : float
        Prior probability P(w) of the class.

    Returns
    -------
    float or numpy.ndarray
        The discriminant value. The shape follows ``x``: a scalar for 1-D or
        scalar input, a ``(1, 1)`` array for a single-row 2-D input.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``dim > 1`` and ``cov_matrix`` is singular or its determinant is
        not positive (previously this silently produced NaN/inf scores).
    """
    diff = np.asarray(x) - np.asarray(mean_vec)
    log_prior = np.log(prior_prob)

    if dim > 1:  # multivariate gaussian
        # slogdet returns ln|Sigma| directly; log(det(Sigma)) under/overflows
        # once the determinant leaves the float range (common for large dim).
        sign, log_det = np.linalg.slogdet(cov_matrix)
        if sign <= 0:
            raise np.linalg.LinAlgError(
                "cov_matrix must have a positive determinant (positive definite)"
            )
        # Squared Mahalanobis distance, obtained by solving Sigma * z = diff
        # instead of forming the explicit inverse; no sqrt-then-square round trip.
        sq_dist = np.dot(diff, np.linalg.solve(cov_matrix, diff.T))
        return (
            -0.5 * sq_dist
            - (dim / 2) * np.log(2 * np.pi)
            - 0.5 * log_det
            + log_prior
        )

    # univariate gaussian: cov_matrix is the scalar variance
    sq_dist = np.dot(diff, diff.T)
    return (
        -0.5 * sq_dist / cov_matrix
        - 0.5 * np.log(2 * np.pi * cov_matrix)
        + log_prior
    )

In [32]:
def euclidean_dis(x, mean_vector):
    """Return the Euclidean distance between ``x`` and ``mean_vector``.

    This equals the Mahalanobis distance with an identity inverse covariance,
    but is computed directly so no ``d x d`` identity matrix has to be built
    and multiplied.

    Parameters
    ----------
    x : array_like
        A scalar, a 1-D vector, or a single-row ``(1, d)`` array.
    mean_vector : array_like
        Point to measure the distance to; must broadcast against ``x``.

    Returns
    -------
    float or numpy.ndarray
        A scalar for scalar/1-D ``x``; a ``(1, 1)`` array for a single-row
        2-D ``x``.
    """
    # np.asarray lets 1-D and scalar inputs through; indexing x.shape[1]
    # used to raise IndexError for anything that was not 2-D.
    diff = np.asarray(x) - np.asarray(mean_vector)
    return np.dot(diff, diff.T) ** (1 / 2)

In [35]:
def mahalanobis_dist(x, mean_vector, cov_inverse):
    """Return the Mahalanobis distance of ``x`` from ``mean_vector``.

    Computes ``sqrt((x - mean_vector) @ cov_inverse @ (x - mean_vector).T)``.

    Parameters
    ----------
    x : numpy.ndarray
        Sample, either a 1-D vector or a single-row 2-D array.
    mean_vector : array_like
        Centre of the distribution; must broadcast against ``x``.
    cov_inverse : numpy.ndarray
        Inverse of the covariance matrix.

    Returns
    -------
    float or numpy.ndarray
        A scalar for 1-D ``x``; a ``(1, 1)`` array for a single-row 2-D ``x``.
    """
    centered = x - mean_vector
    weighted = np.dot(centered, cov_inverse)
    squared_distance = np.dot(weighted, centered.T)
    return squared_distance ** 0.5

# Load the dataset and classify with a single feature

In [9]:
# Read the comma-separated dataset, skipping the header row, then display its
# dimensions as a quick sanity check.
data = np.genfromtxt(
    'data_dhs_chap2.csv',
    delimiter=',',
    skip_header=1,
)
data.shape

(30, 4)

In [83]:
# One-feature Gaussian classifier: rows 0-9 are treated as class 1 and rows
# 10-19 as class 2, using only the first column of `data`.
feature_1 = data[:20, :1]  # first and second class, first feature only
dim = 1
class1_prior_prob = 0.5
class2_prior_prob = 0.5

# The class statistics do not depend on the sample being scored, so estimate
# them once instead of on every loop iteration.
# NOTE(review): np.var defaults to ddof=0 whereas np.cov (used in the
# two-feature cell) defaults to ddof=1 -- confirm which estimator is intended.
class1_mean, class1_var = np.mean(feature_1[:10]), np.var(feature_1[:10])
class2_mean, class2_var = np.mean(feature_1[10:20]), np.var(feature_1[10:20])

predicted_class = []
for instance in feature_1:
    instance = instance.reshape((1, 1))  # present the sample as a 1x1 (rows x features) array
    g1 = discriminant_function(instance, class1_mean, class1_var, dim, class1_prior_prob)
    g2 = discriminant_function(instance, class2_mean, class2_var, dim, class2_prior_prob)
    predicted_class.append(0 if g1 > g2 else 1)  # 0 -> class 1, 1 -> class 2

# find misclassification error
target_class = data[:20, 3:].astype(int)
target = [label.item() for label in target_class]

total_instances = len(predicted_class)
# Count mismatches explicitly rather than summing |target - predicted|, which
# only equals the error count while labels differ by at most one.
# Assumes column 3 encodes the two classes as 0 and 1 -- TODO confirm.
error = sum(int(t != p) for t, p in zip(target, predicted_class))
avg_error_percent = (100 / total_instances) * error
print("misclassification error:", avg_error_percent)

misclassification error: 35.0


## Classification error using the first two features

In [84]:
# Display the dataset dimensions (rows, columns) again before the next experiment.
data.shape

(30, 4)

In [89]:
# Two-feature Gaussian classifier: rows 0-9 are treated as class 1 and rows
# 10-19 as class 2, using the first two columns of `data`.
feature_1 = data[:, :2]

# The class statistics do not depend on the sample being scored, so estimate
# them once instead of on every loop iteration.
class1_samples = feature_1[:10]
class2_samples = feature_1[10:20]
class1_mean, class1_cov = np.mean(class1_samples, axis=0), np.cov(class1_samples.T)
class2_mean, class2_cov = np.mean(class2_samples, axis=0), np.cov(class2_samples.T)

predicted_class = []
for feature in feature_1[:20]:
    g1 = discriminant_function(feature, class1_mean, class1_cov, 2, 0.5)
    g2 = discriminant_function(feature, class2_mean, class2_cov, 2, 0.5)
    predicted_class.append(0 if g1 > g2 else 1)  # 0 -> class 1, 1 -> class 2

# find misclassification error
target_class = data[:20, 3:].astype(int)
target = [label.item() for label in target_class]

total_instances = len(predicted_class)
# Count mismatches explicitly rather than summing |target - predicted|, which
# only equals the error count while labels differ by at most one.
# Assumes column 3 encodes the two classes as 0 and 1 -- TODO confirm.
error = sum(int(t != p) for t, p in zip(target, predicted_class))
avg_error_percent = (100 / total_instances) * error
print("misclassification error:", avg_error_percent)

misclassification error: 45.0
