In [147]:
%matplotlib notebook

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt
sns.set()

from IPython import display
import time
from scipy.stats import multivariate_normal
from scipy.spatial import distance_matrix

# Data visualization

In [2]:
# Load the whitespace-separated train / test samples; the files carry no header row.
data_train = pd.read_csv('EMGaussian.data', sep=' ', header=None)
data_test = pd.read_csv('EMGaussian.test', sep=' ', header=None)

# Name the two coordinate columns identically in both frames.
for frame in (data_train, data_test):
    frame.columns = ['x', 'y']

In [3]:
# Scatter plot of the training points, drawn on the axes of a freshly created figure.
sns.scatterplot(data=data_train, x="x", y="y", ax=plt.figure().gca());

<IPython.core.display.Javascript object>

In [4]:
# Scatter plot of the test points, drawn on the axes of a fresh 5x5 inch figure.
sns.scatterplot(data=data_test, x="x", y="y", ax=plt.figure(figsize=(5, 5)).gca());

<IPython.core.display.Javascript object>

# K-means

In [5]:
def Jfunc(X, mu, z, k):
    """Return the k-means distortion of an assignment.

    For every cluster label ``0 .. k-1`` the squared Euclidean distances
    between the rows of ``X`` carrying that label (``z == label``) and the
    centroid ``mu[label]`` are summed; the per-cluster totals are then added up.

    Parameters
    ----------
    X : ndarray, one sample per row.
    mu : ndarray, one centroid per row.
    z : ndarray of cluster labels, one per row of ``X``.
    k : int, number of clusters.
    """
    def cluster_cost(label):
        # Distances of the members of this cluster to their own centroid.
        residuals = X[z == label] - mu[label, :]
        return np.sum(np.power(np.linalg.norm(residuals, axis=1), 2))

    return sum(cluster_cost(label) for label in range(k))

In [6]:
def kmeans(X, k, eps, display='all', timing=0.4):
    """Cluster the rows of ``X`` into ``k`` groups with Lloyd's k-means algorithm.

    The procedure alternates between assigning every point to its nearest
    centroid and moving every centroid to the mean of its points, until the
    distortion returned by ``Jfunc`` changes by less than ``eps`` between two
    consecutive iterations.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Points to cluster. The animation uses the first two columns only.
    k : int
        Number of clusters.
    eps : float
        Convergence threshold on the change of the distortion.
    display : str
        ``'all'`` animates every step in a matplotlib figure; any other value
        disables plotting.
    timing : float
        Pause, in seconds, after each redraw of the animation.

    Returns
    -------
    mu : ndarray of shape (k, n_features)
        Final centroids.
    z : ndarray of shape (n_samples,)
        Index of the centroid assigned to each point.
    """
    # Work on floats so that centroid means are not truncated for integer input.
    X = np.asarray(X, dtype=float)
    n_samples = X.shape[0]

    # Initialise on distinct data points whenever possible: sampling with
    # replacement can pick the same point twice, which leaves one cluster empty.
    init_idx = np.random.choice(n_samples, size=k, replace=k > n_samples)
    mu = X[init_idx, :]  # fancy indexing copies, so X itself is never modified
    show = display == 'all'

    if show:
        fig = plt.figure()
        ax = fig.add_subplot(111)
        base_colors = ['red', 'blue', 'green', 'purple']
        # One colour per label (cycling past four clusters), so any k can be drawn.
        palette = {label: base_colors[label % len(base_colors)] for label in range(k)}
        plt.ion()
        ax.clear()
        ax.scatter(x=X[:, 0], y=X[:, 1], c='red')
        ax.scatter(x=mu[:, 0], y=mu[:, 1], c='black', marker='X')
        fig.canvas.draw()
        time.sleep(timing)
        fig.show()

    j = np.inf

    while True:
        # Optimise over z: nearest centroid for every point.
        z = np.argmin(distance_matrix(X, mu), axis=1)

        if show:
            ax.clear()
            sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=z, palette=palette, ax=ax)
            fig.canvas.draw()
            time.sleep(timing)

        # Optimise over mu: mean of the members of each cluster.
        for i in range(k):
            members = X[z == i]
            # An empty cluster keeps its previous centroid instead of becoming NaN.
            if len(members) > 0:
                mu[i, :] = members.mean(axis=0)

        if show:
            ax.scatter(x=mu[:, 0], y=mu[:, 1], c='black', marker='X')
            fig.canvas.draw()
            time.sleep(timing)

        # Convergence evaluation (the distortion is computed once per iteration).
        new_j = Jfunc(X, mu, z, k)
        converged = np.abs(j - new_j) < eps
        j = new_j

        if show:
            # The canvas-level set_window_title was removed from matplotlib;
            # the figure manager carries the supported equivalent.
            fig.canvas.manager.set_window_title('J: {}'.format(j))

        if converged:
            break

    return mu, z

In [7]:
# Animated k-means run with four clusters on the training points.
mu, z = kmeans(X=data_train.values, k=4, eps=0.1, display='all', timing=0.2)

<IPython.core.display.Javascript object>

In [8]:
def gaussian(x, mu, sigma):
    """Evaluate a radially symmetric Gaussian bump centred on ``mu`` at ``x``.

    Returns ``exp(-||x - mu||^2 / (2 * sigma^2)) / sigma``. The exponent is
    capped at 30 before exponentiation, and the only normalisation applied is
    the division by ``sigma``.

    Parameters
    ----------
    x, mu : 1-D ndarrays of equal length (point and centre).
    sigma : scalar width of the bump.
    """
    offset = x - mu
    exponent = -1/2 * offset.T.dot(offset) / (sigma ** 2)
    # Upper cap on the exponent.
    exponent = 30 if exponent > 30 else exponent
    return np.exp(exponent) / sigma

In [9]:
# Colormap name associated with each plotting colour.
cmaps_dic = dict(zip(('blue', 'red', 'green', 'purple'),
                     ('Blues', 'Reds', 'Greens', 'Purples')))

In [10]:
def make_grid(data=None, xmin=-5, xmax=5, ymin=-5, ymax=5, step=20):
    """
    Create a grid in the form of a list of points stored in a 2D array

    Parameters
    ----------
    data : ndarray of shape (n_samples, >=2), optional
        When given, the grid bounds are taken from the min / max of its first
        two columns and ``xmin``, ``xmax``, ``ymin``, ``ymax`` are ignored.
    xmin, xmax, ymin, ymax : float
        Grid bounds used when ``data`` is None. The upper bounds are excluded.
    step : int or float
        Number of subdivisions per axis; the spacing along an axis is
        ``(max - min) / step``.

    Returns
    -------
    grid : ndarray of shape (n * n, 2)
        All grid points, one per row (``n = ceil(step)``).
    x, y : ndarrays of shape (n, n)
        The meshgrid coordinate matrices the points were built from.

    Raises
    ------
    ValueError
        If ``step`` is not strictly positive.
    """
    if step <= 0:
        raise ValueError("step must be positive, got {}".format(step))

    if data is not None:
        data = np.asarray(data)
        xmin, xmax = np.min(data[:, 0]), np.max(data[:, 0])
        ymin, ymax = np.min(data[:, 1]), np.max(data[:, 1])

    # Build each axis as start + i * spacing with an explicit point count.
    # np.arange with a float step has a numerically unstable length (it can
    # return step + 1 points) and fails when max == min makes the step 0.
    n_points = int(np.ceil(step))
    ticks = np.arange(n_points)
    xs = xmin + ticks * ((xmax - xmin) / float(step))
    ys = ymin + ticks * ((ymax - ymin) / float(step))

    x, y = np.meshgrid(xs, ys)
    grid = np.c_[x.ravel(), y.ravel()]

    return grid, x, y

In [217]:
class GaussianMixture():
    """Gaussian mixture model fitted with the EM algorithm.

    The responsibilities are initialised from a hard k-means assignment and a
    first M-step is run in the constructor. ``fit`` then alternates E- and
    M-steps until the expected complete-data log-likelihood (``jfunc``) changes
    by less than ``eps`` or ``n_itermax`` iterations have been performed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training points. Plotting uses the first two columns.
    k : int
        Number of mixture components.
    covariance_mode : str
        ``'isotrope'`` fits one variance per component (covariance
        ``sigma^2 * I``); any other value fits a full covariance matrix.
    eps : float
        Convergence threshold on the change of ``jfunc``.
    display : str
        ``'all'`` redraws the model after initialisation and after every EM
        iteration; any other value disables plotting.
    timing : float
        Pause, in seconds, after each redraw.
    n_itermax : int
        Maximum number of EM iterations performed by ``fit``.

    Attributes
    ----------
    pis : ndarray of shape (k,)
        Mixing proportions.
    mus : ndarray of shape (n_features, k)
        Component means, one per column.
    sigmas : ndarray of shape (k,) or (k, n_features, n_features)
        Per-component variance (isotropic mode) or covariance matrix; in both
        modes ``sigmas[j]`` is exactly what is passed as ``cov`` to
        ``scipy.stats.multivariate_normal``.
    z : ndarray of shape (n_samples, k)
        Responsibilities of each component for each point.
    j : float
        Last value of the objective computed by ``fit``.
    """

    def __init__(self, X, k, covariance_mode, eps=0.1, display='all', timing=0.1, n_itermax=100):
        self.k = k
        self.X = np.asarray(X, dtype=float)
        self.covariance_mode = covariance_mode
        self.eps = eps
        self.n_itermax = n_itermax
        self.timing = timing
        # Remembered so that fit() only plots when plotting was requested.
        self.display_mode = display
        self.j = np.inf

        _, labels = kmeans(self.X, k, 0.1, display=None)
        # One-hot responsibilities with exactly k columns, even if k-means left
        # a cluster empty (pd.get_dummies would drop that column).
        self.z = np.eye(k)[labels]

        dim = self.X.shape[1]
        self.mus = np.zeros((dim, k))
        # Valid default scales so that a component which receives no weight
        # still has a usable (non-singular) density.
        if covariance_mode == 'isotrope':
            self.sigmas = np.ones(k)
        else:
            self.sigmas = np.tile(np.eye(dim), (k, 1, 1))
        self.m_step()

        if self.display_mode == 'all':
            self.set_display()
            self.display()

    def set_display(self):
        """Create the figure, the colour palette and the evaluation grid used by ``display``."""
        self.fig = plt.figure()
        self.ax = self.fig.add_subplot(111)
        #self.ax.axis('equal') # allow to visualize isotropy
        self.palette = ['red', 'blue', 'green', 'purple']
        plt.ion()
        grid, x_grid, y_grid = make_grid(data=self.X, step=100)
        self.x_grid = x_grid
        self.y_grid = y_grid
        self.grid = grid
        self.ax.axis([np.min(x_grid), np.max(x_grid), np.min(y_grid), np.max(y_grid)])

    def display(self):
        """Redraw density contours, points coloured by most likely component, and the means."""
        if not hasattr(self, 'fig'):
            # Allow a manual call on a model built with plotting disabled.
            self.set_display()
        self.ax.clear()
        n_colors = len(self.palette)
        for j in range(self.k):
            density = multivariate_normal.pdf(self.grid, mean=self.mus[:, j], cov=self.sigmas[j])
            self.ax.contour(self.x_grid,
                            self.y_grid,
                            density.reshape(self.x_grid.shape),
                            levels=10,
                            alpha=0.4,
                            vmin=0,
                            linestyles='solid',
                            cmap=cmaps_dic[self.palette[j % n_colors]])
        # Label -> colour mapping keeps point colours aligned with the contour
        # colours even when a component currently owns no point.
        colors = {j: self.palette[j % n_colors] for j in range(self.k)}
        sns.scatterplot(x=self.X[:, 0], y=self.X[:, 1], hue=np.argmax(self.z, axis=1),
                        palette=colors, ax=self.ax)
        self.ax.scatter(x=self.mus[0, :], y=self.mus[1, :], c='black', marker='X')
        self.fig.canvas.draw()
        time.sleep(self.timing)

    def _log_weighted_densities(self):
        """Return the (n_samples, k) array of ``log(pi_j) + log N(x_i | mu_j, Sigma_j)``."""
        log_prob = np.empty((self.X.shape[0], self.k))
        with np.errstate(divide='ignore'):
            # log(0) = -inf is the intended value for a component with zero weight.
            log_pis = np.log(self.pis)
        for j in range(self.k):
            log_prob[:, j] = log_pis[j] + multivariate_normal.logpdf(
                self.X, mean=self.mus[:, j], cov=self.sigmas[j])
        return log_prob

    def e_step(self):
        """Update the responsibilities ``z`` from the current parameters."""
        log_prob = self._log_weighted_densities()
        # Normalise in log-space: subtracting the row maximum keeps at least one
        # term equal to 1, so far-away points never produce a 0 / 0.
        log_prob -= log_prob.max(axis=1, keepdims=True)
        resp = np.exp(log_prob)
        self.z = resp / resp.sum(axis=1, keepdims=True)

    def m_step(self):
        """Update ``pis``, ``mus`` and ``sigmas`` from the current responsibilities."""
        weights = self.z.sum(axis=0)
        self.pis = self.z.mean(axis=0)
        dim = self.X.shape[1]
        for j in range(self.k):
            if weights[j] <= 0:
                # Empty component: nothing to estimate, keep its previous parameters.
                continue
            resp = self.z[:, j]
            self.mus[:, j] = resp.dot(self.X) / weights[j]
            centered = self.X - self.mus[:, j]
            if self.covariance_mode == 'isotrope':
                # Maximum-likelihood variance of N(mu, sigma^2 I): the weighted
                # squared distances are averaged over points AND dimensions.
                self.sigmas[j] = (resp * (centered ** 2).sum(axis=1)).sum() / (dim * weights[j])
            else:
                # Weighted scatter matrix, without materialising an n x n diagonal.
                self.sigmas[j] = (centered.T * resp).dot(centered) / weights[j]

    def jfunc(self):
        """Return the expected complete-data log-likelihood ``sum_ij z_ij * (log pi_j + log N_ij)``."""
        log_prob = self._log_weighted_densities()
        contributions = np.zeros_like(log_prob)
        # Terms with zero responsibility contribute 0 (avoids 0 * -inf = NaN).
        active = self.z > 0
        contributions[active] = self.z[active] * log_prob[active]
        return contributions.sum()

    def fit(self):
        """Run EM until convergence or ``n_itermax`` iterations; returns ``self``."""
        self.j = np.inf
        for _ in range(self.n_itermax):
            self.e_step()
            self.m_step()
            if self.display_mode == 'all':
                self.display()
            objective = self.jfunc()
            converged = np.abs(self.j - objective) < self.eps
            self.j = objective
            if converged:
                break
        return self

In [218]:
# Build a 4-component mixture with full covariance matrices on the training points.
g = GaussianMixture(X=data_train.values, k=4, covariance_mode='non-isotrope')

<IPython.core.display.Javascript object>

In [39]:
# Scratch sample: 100 points drawn uniformly in the unit square.
x = np.random.random_sample((100, 2))

In [54]:
# Scratch check: density of a centred Gaussian with covariance 0.5 * I at the model's points.
y = multivariate_normal(mean=np.zeros(2), cov=0.5).pdf(g.X);

In [66]:
# Shape of a single component mean (column 1 of the means matrix).
np.shape(g.mus[:, 1])

(2,)