# ME Assignment
1. Implement the EM algorithm for clustering with multivariate Gaussian models
discussed in the LearningGM section. Initialize the algorithm with the K-means result
(for example Matlab provides a kmeans function). Assume there are only two clusters.
Report the parameters $\pi_k$, $\mu_k$, $\Sigma_k$, $k = 1, 2$ for the following datasets containing
$500-600$ observations from $\mathbb{R}^2$:



#### Import dependencies

In [61]:
import numpy as np
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from os.path import join
from scipy.stats import multivariate_normal

#### Data loading functions

In [62]:
def get_xeasy():
    return load_data("xeasy.txt")

def get_x1():
    return load_data("x1.txt")

def get_x2():
    return load_data("x2.txt")

def load_data(filename):
    path = "data/EM/"
    data = np.loadtxt(join(path,filename), delimiter=',')
    return data


#### EM Algorithm

In [63]:
def EM(data):
    num_clusters = 2   #number of clusters

    #use kmeans to initialize cluster centers
    kmeans = KMeans(n_clusters=num_clusters)
    prediction_labels = np.asarray(kmeans.fit_predict(data))
    cluster_centroids = np.asarray(kmeans.cluster_centers_)
    cluster_variances = np.asarray([np.sum(np.square(data[prediction_labels==k]))/(prediction_labels.size-1)
                         for k in range(num_clusters)])
    mixture_proportions = np.asarray([np.sum(prediction_labels==k)/prediction_labels.size for k in range(num_clusters)])

    pdf = lambda centroid, variance, proportion: multivariate_normal.pdf(data, centroid, variance)*proportion

    converged = False
    counter = 0
    while not converged:
        counter += 1
        if counter == 10000:
            print(counter)
            break

        # expectation step
        denominator = np.sum(list(map(pdf, cluster_centroids, cluster_variances, mixture_proportions)),axis=0)
        prediction_labels=[pdf(centroid, variance, proportion)/denominator for centroid, variance, proportion in
                           zip(cluster_centroids, cluster_variances, mixture_proportions)]
        prediction_labels = np.asarray(prediction_labels).T

        # maximization step
        new_cluster_centroids = np.asarray([np.average(data, weights=prediction_labels[:,k], axis=0) for k in range(num_clusters)])
        new_mixture_proportions = np.asarray([np.mean(prediction_labels[:,k]) for k in range(num_clusters)])
        new_cluster_variances = np.asarray([np.average(np.sum(np.square(data), axis=1), weights=prediction_labels[:,k])
                         for k in range(num_clusters)])

        if(np.allclose(new_cluster_centroids, cluster_centroids, atol=1e-5) and
            np.allclose(new_cluster_variances, cluster_variances, atol=1e-5) and
            np.allclose(new_mixture_proportions, mixture_proportions, atol=1e-5)):
            converged = True
            print(counter)
        cluster_centroids = new_cluster_centroids
        cluster_variances = new_cluster_variances
        mixture_proportions = new_mixture_proportions







## 1a. xeasy


In [64]:
EM(get_xeasy())

18


In [65]:
EM(get_x1())

16


In [66]:
EM(get_x2())


37
