In [113]:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as dt

In [114]:
def multi_gaussian(dim, num_of_samples, random_state=None):
    """Draw samples from a randomly parameterised multivariate normal distribution.

    The covariance is a random symmetric positive-definite matrix produced by
    ``sklearn.datasets.make_spd_matrix`` and the mean vector is drawn with
    ``rand`` (one value per dimension).

    Parameters
    ----------
    dim : int
        Number of dimensions of the distribution.
    num_of_samples : int
        Number of samples to draw.
    random_state : None, int or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) keeps the previous
        behaviour of drawing from NumPy's global generator; an int seed or a
        ``RandomState`` instance makes the covariance, mean and samples
        reproducible.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(num_of_samples, dim)``.

    Example
    -------
    ``multi_gaussian(5, 10)``
    """
    if random_state is None:
        rng = np.random  # global generator, exactly as before
        spd_state = None
    elif isinstance(random_state, np.random.RandomState):
        rng = spd_state = random_state
    else:
        rng = spd_state = np.random.RandomState(random_state)

    # Draw order (covariance, mean, samples) is kept so the stream of random
    # numbers is consumed in the same sequence as before.
    cov_matrix = dt.make_spd_matrix(dim, random_state=spd_state)
    mean = rng.rand(dim)
    return rng.multivariate_normal(mean, cov_matrix, num_of_samples)

In [115]:
def discriminant_function(x, mean_vec, cov_matrix, dim, prior_prob):
    """Evaluate the Gaussian log-discriminant of ``x`` for one class.

    Computes ``-1/2 * d^2 - dim/2 * ln(2*pi) - 1/2 * ln|cov| + ln(prior)``,
    where ``d^2`` is the squared Mahalanobis distance of ``x`` from
    ``mean_vec``. For ``dim == 1`` the covariance is treated as a scalar
    variance.

    Parameters
    ----------
    x : array_like
        Sample to score. The multivariate branch is used with a vector of
        length ``dim``; the univariate branch accepts a scalar, a 1-D or a
        ``(1, 1)`` array.
    mean_vec : array_like or float
        Class mean.
    cov_matrix : array_like or float
        Class covariance matrix (``dim > 1``) or variance (``dim == 1``).
    dim : int
        Number of features; selects the multivariate or univariate formula.
    prior_prob : float
        Prior probability of the class.

    Returns
    -------
    numpy scalar or numpy.ndarray
        The discriminant value, with the shape produced by the quadratic
        form ``diff . cov^-1 . diff.T`` (a scalar for a 1-D ``x``).

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``dim > 1`` and ``cov_matrix`` is singular.
    """
    diff = np.asarray(x) - mean_vec

    if dim > 1:  # multivariate gaussian
        cov_inverse = np.linalg.inv(cov_matrix)
        # The squared distance is formed directly. Taking a square root and
        # squaring it again would turn a round-off-negative value into NaN.
        sq_dist = np.dot(np.dot(diff, cov_inverse), diff.T)
        # slogdet avoids the overflow/underflow that det() followed by log()
        # suffers for small or high-dimensional covariances.
        sign, log_det = np.linalg.slogdet(cov_matrix)
        if sign < 0:
            log_det = np.nan  # log of a negative determinant is undefined
        return (
            -0.5 * sq_dist
            - (dim / 2) * np.log(2 * np.pi)
            - 0.5 * log_det
            + np.log(prior_prob)
        )

    # univariate gaussian: cov_matrix is the variance
    sq_dist = np.dot(diff, diff.T)
    return (
        -0.5 * sq_dist / cov_matrix
        - 0.5 * np.log(2 * np.pi * cov_matrix)
        + np.log(prior_prob)
    )

In [116]:
def euclidean_dis(x, mean_vector):
    """Euclidean distance between the point ``x`` and ``mean_vector``.

    This is the Mahalanobis distance with an identity inverse covariance, so
    the quadratic form reduces to ``diff . diff.T`` and is computed directly.

    Parameters
    ----------
    x : array_like
        A scalar, a 1-D vector, or a 2-D array holding one row vector.
    mean_vector : array_like or float
        Point to measure the distance from; must broadcast against ``x``.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``sqrt(diff . diff.T)``: a scalar for scalar or 1-D ``x``, and a
        ``(1, 1)`` array for a ``(1, d)`` ``x``.
    """
    # Reading x.shape[1] to size an identity matrix fails on 1-D vectors, and
    # multiplying by that identity changes nothing, so neither is done here.
    diff = np.asarray(x) - mean_vector
    return np.dot(diff, diff.T) ** (1 / 2)

In [117]:
def mahalanobis_dist(x, mean_vector, cov_inverse):
    """Mahalanobis distance of ``x`` from ``mean_vector``.

    Evaluates ``sqrt(diff . cov_inverse . diff.T)`` with
    ``diff = x - mean_vector``; ``cov_inverse`` is the already inverted
    covariance matrix.
    """
    centered = x - mean_vector
    projected = np.dot(centered, cov_inverse)
    quadratic_form = np.dot(projected, centered.T)
    return quadratic_form ** 0.5

# Two-class Gaussian discriminant classifier on `data_dhs_chap2.csv`

In [118]:
# Load the comma-separated samples, skipping the single header row.
data = np.genfromtxt('data_dhs_chap2.csv', delimiter=',', skip_header=1)
# Display the array shape as the cell output.
data.shape

(30, 4)

In [119]:
def dichotomizer(X, Y, dim):
    """Classify every sample in ``X`` into one of two Gaussian classes.

    A mean and covariance are estimated per class from ``X`` itself, the
    Gaussian discriminant is evaluated for both classes with equal priors
    (0.5 each), and each sample is assigned to the class with the larger
    discriminant. Ties go to class 2.

    Parameters
    ----------
    X : array_like
        Training samples: 1-D of length ``n`` when ``dim == 1``, otherwise
        shape ``(n, dim)``.
    Y : array_like or None
        Class label of each sample. When it holds exactly two distinct labels
        (one per row of ``X``), the rows are partitioned by label and the
        smaller label becomes class 1. Otherwise the first five rows are
        taken as class 1 and rows 5-9 as class 2.
    dim : int
        Number of features per sample.

    Returns
    -------
    list of int
        One entry per row of ``X``: ``0`` for class 1, ``1`` for class 2.
    """
    X = np.asarray(X)

    # Positional split, kept as the fallback for unusable labels.
    class1_rows, class2_rows = X[:5], X[5:10]
    if Y is not None:
        labels = np.asarray(Y)
        distinct = np.unique(labels)
        # Use the labels rather than assuming the rows are ordered as five
        # samples of class 1 followed by five samples of class 2.
        if labels.shape == (X.shape[0],) and len(distinct) == 2:
            class1_rows = X[labels == distinct[0]]
            class2_rows = X[labels == distinct[1]]

    if dim == 1:  # only 1 independent feature
        class1_mean_vec = np.mean(class1_rows)
        class2_mean_vec = np.mean(class2_rows)
        class1_cov = np.cov(class1_rows)
        class2_cov = np.cov(class2_rows)
        shape = (1, 1)
    else:  # more than 1
        class1_mean_vec = np.mean(class1_rows, axis=0)
        class2_mean_vec = np.mean(class2_rows, axis=0)
        # np.cov expects variables in rows, hence the transpose.
        class1_cov = np.cov(class1_rows.T)
        class2_cov = np.cov(class2_rows.T)
        shape = (dim,)

    class1_prior_prob = 0.5
    class2_prior_prob = 0.5

    predicted_class = []
    for instance in X:
        instance = instance.reshape(shape)
        g1 = discriminant_function(instance, class1_mean_vec, class1_cov, dim, class1_prior_prob)
        g2 = discriminant_function(instance, class2_mean_vec, class2_cov, dim, class2_prior_prob)
        predicted_class.append(0 if g1 > g2 else 1)  # 0 -> class 1, 1 -> class 2
    return predicted_class

In [120]:
def empirical_training_error(target_class, predicted_class):
    """Percentage of samples whose predicted class differs from the target.

    Parameters
    ----------
    target_class : sequence
        True class label per sample; indexed for every position of
        ``predicted_class``.
    predicted_class : sequence
        Predicted class label per sample; its length defines the number of
        instances.

    Returns
    -------
    float
        Misclassification rate in percent (0-100).

    Raises
    ------
    ZeroDivisionError
        If ``predicted_class`` is empty.
    IndexError
        If ``target_class`` is shorter than ``predicted_class``.
    """
    total_instances = len(predicted_class)
    # Count mismatches rather than summing |target - predicted|: the absolute
    # difference overcounts whenever two labels differ by more than 1.
    mismatches = sum(
        int(target_class[idx] != predicted_class[idx])
        for idx in range(total_instances)
    )
    return (100 / total_instances) * mismatches

In [121]:
# One-feature run: rows 5-14 of `data`, column 0 as the feature and column 3 as the class label.
trg_data = data[5:15, [0, 3]]
X, Y = trg_data[:, 0], trg_data[:, 1].astype(int)
target_class = Y.tolist()  # labels as plain Python ints
predicted_class = dichotomizer(X, Y, 1)
print("misclassification error: ", empirical_training_error(target_class, predicted_class))

misclassification error:  50.0


In [122]:
# Two-feature run: rows 5-14 of `data`, columns 0-1 as features and column 3 as the class label.
trg_data = data[5:15, [0, 1, 3]]
X, Y = trg_data[:, :2], trg_data[:, -1].astype(int)
target_class = Y.tolist()  # labels as plain Python ints
predicted_class = dichotomizer(X, Y, 2)
print("misclassification error: ", empirical_training_error(target_class, predicted_class))

misclassification error:  40.0


In [123]:
# Three-feature run: rows 5-14 of `data`, columns 0-2 as features and column 3 as the class label.
trg_data = data[5:15, [0, 1, 2, 3]]
X, Y = trg_data[:, :3], trg_data[:, -1].astype(int)
target_class = Y.tolist()  # labels as plain Python ints
predicted_class = dichotomizer(X, Y, 3)
print("misclassification error: ", empirical_training_error(target_class, predicted_class))

misclassification error:  0.0
