### Importing relevent libraries and reading data files

In [75]:
import numpy as np
import scipy as sp

training_data_and_class = np.loadtxt("/content/drive/MyDrive/2학년 2학기/COSE362-MachineLearning/Hw1/train.txt") #(60290, 14(13 feature + 1 class))
#training_data_and_class = np.loadtxt("/content/drive/MyDrive/2학년 2학기/COSE362-MachineLearning/Hw1/personal_test.txt")
true_class = training_data_and_class[:, -1] #(60290,)
train_data_0 = training_data_and_class[:, :-1][true_class == 0]
train_data_1 = training_data_and_class[:, :-1][true_class == 1]

### Self-defined functions

In [76]:
def compute_responsibilities_vectorized(theta, data, K):
    """Vectorized computation of responsibilities."""
    component_dist, means, covariances = theta
    data_size, feat_dim = data.shape

    # Precompute the multivariate normal densities for all components and all data points
    prob_matrix = np.zeros((data_size, K))
    for k in range(K):
        prob_matrix[:, k] = sp.stats.multivariate_normal.pdf(data, means[k], covariances[k], allow_singular=True) * component_dist[k]

    # Calculate responsibilities by normalizing over all components
    responsibility_matrix = prob_matrix / prob_matrix.sum(axis=1, keepdims=True)
    return responsibility_matrix

### GMM model

In [130]:
def GMM_training(data, K):
  it = 0
  data_size, feat_dim = np.shape(data)

  #initialization

  component_dist = np.zeros((K,))
  mean_k = np.zeros((K, feat_dim))
  cov_k = np.zeros((K, feat_dim, feat_dim))
  component_dist[:] = 1/K #(K,)
  mean_k = data[np.random.choice(data.shape[0], size=K, replace=False)]
  cov_init = np.cov(data, rowvar=False)
  cov_k[:] = cov_init #(K, feat_dim, feat_dim)
  theta = [component_dist, mean_k, cov_k]
  convergence = 0

  #iteration until theta convergence
  while True:
    it += 1
    #Expectation
    responsibility = compute_responsibilities_vectorized(theta, data, K)
    #Mazimization
    new_component_dist = np.zeros(np.shape(component_dist))
    new_mean_k = np.zeros(np.shape(mean_k))
    new_cov_k = np.zeros(np.shape(cov_k))

    for k in range(K):
      new_mean_k_num = np.zeros((feat_dim,))
      new_cov_k_num = np.zeros((feat_dim, feat_dim))
      new_mean_k_denum = 0
      for t in range(data_size):
        new_component_dist[k] += responsibility[t][k]
        new_mean_k_num += responsibility[t][k] * data[t]
        new_cov_k_num += responsibility[t][k] * np.outer(data[t], data[t])
        new_mean_k_denum += responsibility[t][k]

      new_component_dist[k] /= data_size
      new_mean_k[k] = new_mean_k_num / new_mean_k_denum
      new_cov_k[k] = (new_cov_k_num / new_mean_k_denum) - np.outer(new_mean_k[k], new_mean_k[k])


    new_theta = [new_component_dist, new_mean_k, new_cov_k]
    #check convergence
    """
    below method was discarded as it was too computationally expensive
    log_likelihood = 0
    log_likelihood_new = 0
    for t in range(data_size):
      likelihood_inner = 0
      likelihood_inner_new = 0
      for k in range(K):
        likelihood_inner += sp.stats.multivariate_normal.pdf(data[t], mean_k[k], cov_k[k]) * component_dist[k]
        likelihood_inner_new += sp.stats.multivariate_normal.pdf(data[t], new_mean_k[k], new_cov_k[k]) * new_component_dist[k]
      log_likelihood += np.log(likelihood_inner)
      log_likelihood_new += np.log(likelihood_inner_new)
    print("old, new likelihood calculated")

    if (abs(log_likelihood_new - log_likelihood) < 10 ** -4):
      break
    else:
      theta = new_theta
    """
    old_component_dist, old_means, old_covariances = theta
    new_component_dist, new_means, new_covariances = new_theta

    # Check for changes in means, covariances, and component distributions
    mean_ratio = np.mean(abs(new_means / old_means))
    cov_ratio = np.mean([np.mean(abs(new_covariances[i] / old_covariances[i])) for i in range(len(old_covariances))])
    comp_dist_ratio = np.mean(abs(new_component_dist / old_component_dist))
    total_diff = abs((sum([mean_ratio, cov_ratio, comp_dist_ratio]) / 3) - 1)
    print("total_diff:", total_diff)
    if (convergence > 3):
      break
    if (total_diff < 10 ** -2):
      convergence += 1
    else:
      convergence = 0
    theta = new_theta

  return new_theta, it

### Running

In [132]:


for K in range(1, 11):
  GMM_0_theta, iterations = GMM_training(train_data_0, K)
  print("K:", K, "iterations:", iterations - 4)


total_diff: 0.2117121024245785
total_diff: 0.0
total_diff: 0.0
total_diff: 0.0
total_diff: 0.0
total_diff: 0.0
K: 1 iterations: 2
total_diff: 0.7091523038679497
total_diff: 0.21064059968997872
total_diff: 0.03647987208744241
total_diff: 0.09582272621454524
total_diff: 0.021360363925228443
total_diff: 0.016235087722640706
total_diff: 0.004215144449933428
total_diff: 0.02856763572777865
total_diff: 0.011570168665048008
total_diff: 0.00664629445498921
total_diff: 0.0015624570348746758
total_diff: 0.0370304347101893
total_diff: 0.011251652010584312
total_diff: 0.002025925270128237
total_diff: 0.011484373022278005
total_diff: 0.01244671843240397
total_diff: 0.029615168917768386
total_diff: 0.012805134666196816
total_diff: 0.0031207531526196153
total_diff: 0.00031312106352643365
total_diff: 0.0012244921822679444
total_diff: 0.0007632051674495433
total_diff: 0.004190271451908312
K: 2 iterations: 19
total_diff: 1.001786196053668
total_diff: 0.06751763187815363
total_diff: 0.04726018933530818
t