## Importing relevent libraries and reading data files

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import scipy as sp

training_data_and_class = np.loadtxt("/content/drive/MyDrive/2학년 2학기/COSE362-MachineLearning/Hw1/train.txt")

FileNotFoundError: /content/drive/MyDrive/2학년 2학기/COSE362-MachineLearning/Hw1/train.txt not found.

## EM on GMM

**Self-defined functions**

In [None]:
def compute_responsibilities_vectorized(theta, data, K):
  component_dist, means, covariances = theta
  data_size, feat_dim = data.shape

  prob_matrix = np.zeros((data_size, K))
  for k in range(K):
      prob_matrix[:, k] = sp.stats.multivariate_normal.pdf(data, means[k], covariances[k], allow_singular=True) * component_dist[k]
  responsibility_matrix = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)
  return responsibility_matrix

In [None]:
def GMM_likelihood(theta_list, xt): #calculates likelihood of xt given GMM parameters
  responsibility, mean_list, cov_list = theta_list
  likelihood = 0
  for c in range(len(responsibility)):
    likelihood += sp.stats.multivariate_normal.pdf(xt, mean_list[c], cov_list[c], allow_singular=True) * responsibility[c]

  return likelihood

In [None]:
def data_binary_classifier(theta_0, theta_1, data):
  model_class = []
  for xt in data:
    if (GMM_likelihood(theta_0, xt) > GMM_likelihood(theta_1, xt)):
      model_class.append(0)
    else:
      model_class.append(1)
  return np.array(model_class)

In [None]:
#@title EM algorithm
def GMM_training(data, K):
  data_size, feat_dim = np.shape(data)

  #initialization
  component_dist = np.zeros((K,))
  mean_k = np.zeros((K, feat_dim))
  cov_k = np.zeros((K, feat_dim, feat_dim))
  component_dist[:] = 1/K #(K,)
  mean_k = data[np.random.choice(data.shape[0], size=K, replace=False)].reshape(K, feat_dim)
  cov_init = np.cov(data, rowvar=False)
  cov_k[:] = cov_init #(K, feat_dim, feat_dim)
  theta = [component_dist, mean_k, cov_k]
  convergence = 0

  #iteration until theta convergence
  while True:
    #Expectation
    responsibility = compute_responsibilities_vectorized(theta, data, K)
    #Mazimization
    new_component_dist = np.zeros(np.shape(component_dist))
    new_mean_k = np.zeros(np.shape(mean_k))
    new_cov_k = np.zeros(np.shape(cov_k))

    for k in range(K):
      new_mean_k_num = np.zeros((feat_dim,))
      new_cov_k_num = np.zeros((feat_dim, feat_dim))
      new_mean_k_denum = 0
      for t in range(data_size):
        new_component_dist[k] += responsibility[t][k]
        new_mean_k_num += responsibility[t][k] * data[t]
        new_cov_k_num += responsibility[t][k] * np.outer(data[t], data[t])
        new_mean_k_denum += responsibility[t][k]

      new_component_dist[k] /= data_size
      new_mean_k[k] = new_mean_k_num / new_mean_k_denum
      new_cov_k[k] = (new_cov_k_num / new_mean_k_denum) - np.outer(new_mean_k[k], new_mean_k[k])

    new_theta = [new_component_dist, new_mean_k, new_cov_k]

    old_component_dist, old_means, old_covariances = theta
    new_component_dist, new_means, new_covariances = new_theta

    # Check for changes in means, covariances, and component distributions
    mean_ratio = np.mean(abs(new_means / old_means))
    cov_ratio = np.mean([np.mean(abs(new_covariances[i] / old_covariances[i])) for i in range(len(old_covariances))])
    comp_dist_ratio = np.mean(abs(new_component_dist / old_component_dist))
    total_diff = abs((sum([mean_ratio, cov_ratio, comp_dist_ratio]) / 3) - 1)
    if (convergence > 3):
      break
    if (total_diff < 10 ** -2):
      convergence += 1
    else:
      convergence = 0
    theta = new_theta

  return new_theta

In [None]:
#@title Data preparation
true_class = training_data_and_class[:, -1] #(60290,)
data_0 = training_data_and_class[:, :][true_class == 0] #(60290, 14)
data_1 = training_data_and_class[:, :][true_class == 1]
#Preparing for K-fold cross validation(K=5)
use_size_0 = (np.shape(data_0)[0] // 5) * 5
use_size_1 = (np.shape(data_1)[0] // 5) * 5

data_0_split = np.reshape(data_0[:use_size_0], (5, -1, 14))
data_1_split = np.reshape(data_1[:use_size_1], (5, -1, 14))
train_data_0 = np.zeros((5, int(use_size_0 * (4/5)), 14))
train_data_1 = np.zeros((5, int(use_size_1 * (4/5)), 14))
train_data_0_nl = np.zeros((5, int(use_size_0 * (4/5)), 13))
train_data_1_nl = np.zeros((5, int(use_size_1 * (4/5)), 13))

test_data_size = np.shape(np.concatenate((data_0_split, data_1_split), axis=1))[1]

test_data_nl = np.zeros((5, test_data_size, 13))
test_true_class = np.zeros((5, test_data_size))

for K in range(5):
  train_data_0[K] = np.concatenate((*data_0_split[:K], *data_0_split[K+1:]))
  train_data_1[K] = np.concatenate((*data_1_split[:K], *data_1_split[K+1:]))
  train_data_0_nl[K] = train_data_0[K][:,:-1]
  train_data_1_nl[K] = train_data_1[K][:,:-1]

  test_data = np.concatenate((data_0_split[K], data_1_split[K]), axis=0)
  test_data_nl[K] = test_data[:, :-1]
  test_true_class[K] = test_data[:, -1]

In [None]:
# @title Component 2-13
error_per_component = [0.15673882391971472, 0.15294020071327857, 0.1510657709214564, 0.15144729202952642, 0.1501368499626773, 0.1497719167288712, 0.14847806253628598, 0.148279008045119, 0.1479970141826325, 0.14778137181720163, 0.148096541428216, 0.1477813718172016]
for c in range(14,17): #number of mixture components
  print()
  print("Assessing performance for number of mixture components:", c)
  MSE_per_component = 0
  for K in range(5):
    print("Performing K-cross validation, current test data section:", K + 1)
    GMM_0_theta = GMM_training(train_data_0_nl[K], c)
    GMM_1_theta = GMM_training(train_data_1_nl[K], c)
    #test model, create a estimate class
    test_model_class = data_binary_classifier(GMM_0_theta, GMM_1_theta, test_data_nl[K])
    #calculate MSE
    MSE_per_component_part = np.mean((test_true_class[K] - test_model_class) ** 2)
    MSE_per_component += MSE_per_component_part #this is the MSE for one data section of K division
    print("done. MSE value for test data section", K + 1, ":", MSE_per_component_part)
  print("K-cross validation complete, overall MSE for mixture component:", c)
  print(MSE_per_component/5)
  error_per_component.append(MSE_per_component / 5)


Assessing performance for number of mixture components: 14
Performing K-cross validation, current test data section: 1
done. MSE value for test data section 1 : 0.1431533548975699
Performing K-cross validation, current test data section: 2
done. MSE value for test data section 2 : 0.12498963257858506
Performing K-cross validation, current test data section: 3
done. MSE value for test data section 3 : 0.16322468275690472
Performing K-cross validation, current test data section: 4
done. MSE value for test data section 4 : 0.15094965580160902
Performing K-cross validation, current test data section: 5
done. MSE value for test data section 5 : 0.15642365430870034
K-cross validation complete, overall MSE for mixture component: 14
0.1477481960686738

Assessing performance for number of mixture components: 15
Performing K-cross validation, current test data section: 1
done. MSE value for test data section 1 : 0.14331923364020902
Performing K-cross validation, current test data section: 2
don

In [None]:
print(error_per_component)

[0.15673882391971472, 0.15294020071327857, 0.1510657709214564, 0.15144729202952642, 0.1501368499626773, 0.1497719167288712, 0.14847806253628598, 0.148279008045119, 0.1479970141826325, 0.14778137181720163, 0.148096541428216, 0.1477813718172016, 0.1477481960686738, 0.14783113543999338, 0.14914157750684248]


In [None]:
# @title Printing results
print("Evaluation complete for optimal mixture components. Result, starting from 2 mixture components, is:")
print_error_per_component = [[idx + 2, error_per_component[idx]] for idx in range(len(error_per_component))]
for result in print_error_per_component:
  print("MSE of", result[0], "number of mixture components is:", result[1])
optimal_c = np.argmin(np.array(error_per_component)) + 2
print("Optimal number of mixture components is:", optimal_c)

Evaluation complete for optimal mixture components. Result, starting from 2 mixture components, is:
MSE of 2 number of mixture components is: 0.15673882391971472
MSE of 3 number of mixture components is: 0.15294020071327857
MSE of 4 number of mixture components is: 0.1510657709214564
MSE of 5 number of mixture components is: 0.15144729202952642
MSE of 6 number of mixture components is: 0.1501368499626773
MSE of 7 number of mixture components is: 0.1497719167288712
MSE of 8 number of mixture components is: 0.14847806253628598
MSE of 9 number of mixture components is: 0.148279008045119
MSE of 10 number of mixture components is: 0.1479970141826325
MSE of 11 number of mixture components is: 0.14778137181720163
MSE of 12 number of mixture components is: 0.148096541428216
MSE of 13 number of mixture components is: 0.1477813718172016
MSE of 14 number of mixture components is: 0.1477481960686738
MSE of 15 number of mixture components is: 0.14783113543999338
MSE of 16 number of mixture componen

##Performing evaluation of final model with test data

In [None]:
#train final model
Final_MSE = 0
for i in range(3):
  print("Training optimal GMM_0 model...")
  GMM_final_0 = GMM_training(data_0[:, :-1], optimal_c)
  print("done")

  print("Training optimal GMM_1 model...")
  GMM_final_1 = GMM_training(data_1[:, :-1], optimal_c)
  print("done")

  #test final model
  test_data_and_class = np.loadtxt("/content/drive/MyDrive/2학년 2학기/COSE362-MachineLearning/Hw1/test.txt")
  test_true_class = test_data_and_class[:, -1] #(60290,)
  test_data = test_data_and_class[:, :-1]

  print("Finding MSE value...")
  test_model_class = data_binary_classifier(GMM_final_0, GMM_final_1, test_data)
  Final_MSE += np.mean((test_true_class - test_model_class) ** 2)
print("done. Test MSE value for optimal mixture component", optimal_c, ":", Final_MSE/3)

Training optimal GMM_0 model...
done
Training optimal GMM_1 model...
done
Finding MSE value...
Training optimal GMM_0 model...
done
Training optimal GMM_1 model...
done
Finding MSE value...
Training optimal GMM_0 model...
done
Training optimal GMM_1 model...
done
Finding MSE value...
done. Test MSE value for optimal mixture component 14 : 0.14716062736614385
