In [345]:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import numpy as np
import random as rd

First need to create k Gaussians for a mixture

In [346]:
# The book suggested first running k means a few times and then using that as those as the Gaussians

iris = load_iris()
data = iris.data
target = iris.target

XFull = data
XTrain, XTest, yTrain, yTest = train_test_split(XFull, target)

yTrain = yTrain.reshape(-1,1)
yTest = yTest.reshape(-1,1)
XTrain.shape, yTrain.shape

((112, 4), (112, 1))

In [347]:
k = 3
kmeans = KMeans(n_clusters = k, max_iter= 1)
kmeans.fit(XTrain, yTrain)

pointsInEachCluster = [XTrain[kmeans.labels_ == i] for i in range(k)]
print(len(pointsInEachCluster[0]))
print(len(pointsInEachCluster[1]))
print(len(pointsInEachCluster[2]))

47
39
26


  super()._check_params_vs_input(X, default_n_init=10)


In [348]:
centers = kmeans.cluster_centers_
means = []
covMats = []
for i in range(len(pointsInEachCluster)):
    points = pointsInEachCluster[i]
    means.append(centers[i])
    covMat = np.cov(points.T)
    covMats.append(covMat)

means[0], covMats[0]

(array([5.88510638, 2.72765957, 4.42765957, 1.45319149]),
 array([[0.18086031, 0.05172525, 0.12129047, 0.03015726],
        [0.05172525, 0.09421832, 0.05443571, 0.03240981],
        [0.12129047, 0.05443571, 0.27813136, 0.13045328],
        [0.03015726, 0.03240981, 0.13045328, 0.0960222 ]]))

In [349]:
prior = 1 / len(means)
priors = [prior for i in range(len(means))]

In [350]:
def multivariate_normal(x, mean, cov):
    d = x.shape[0]
    det = np.linalg.det(cov)
    inv = np.linalg.inv(cov)
    exponent = -0.5 * np.matmul(np.matmul((x - mean).T, inv), (x - mean))
    coefficient = 1 / ((2 * np.pi) ** (d / 2) * np.sqrt(det))
    return coefficient * np.exp(exponent)


In [351]:
multivariate_normal(np.array([1, 2]), np.array([1, 2]), np.array([[1, 0], [0, 1]]))

0.15915494309189535

In [352]:
print(covMats[0])
print(means[0])
# I need take the current values of means, covariance and priors and calculate the h_ti

# h_ti is the posterior probability of the ith data point belonging to the ith cluster for the t_th data point given the point
# and the previous values of the means, covariance and priors

# so just a normal distribution with the mean and covariance of the cluster


def getPosteriorForAllClusters(xt, currMeans, currCov, currPriors):
    posterior = []
    for i in range(len(currMeans)):
        mean = currMeans[i]
        cov = currCov[i]
        prior = currPriors[i]
        probab = multivariate_normal(xt, mean, cov)
        posterior.append(probab * prior)
    
    # normalize
    posterior = np.array(posterior)
    posterior = posterior / np.sum(posterior)

    return posterior


def getPosteriorForAllDataPoints(X, currMeans, currCov, currPriors):
    posterior = []
    for i in range(len(X)):
        xt = X[i]
        posterior.append(getPosteriorForAllClusters(xt, currMeans, currCov, currPriors))
    return np.array(posterior)

# Print which cluster each data point has most probability of belonging to
def printPredictions(X, currMeans, currCov, currPriors):
    posterior = getPosteriorForAllDataPoints(X, currMeans, currCov, currPriors)
    predictions = np.argmax(posterior, axis = 1)
    print(predictions)


def getNewMeans(X, currMeans, posteriors):
    newMeans = []
    # use the conscise numpy way of doing this
    for i in range(len(currMeans)):
        mean = np.sum(posteriors[:, i].reshape(-1, 1) * X, axis = 0) / np.sum(posteriors[:, i])
        newMeans.append(mean)
    
    # This is just for verifying whether the conscise way copilot gave is correct and damn is it nice
    # newMeans2 = []
    # for i in range(len(currMeans)):
    #     s = 0
    #     for j in range(len(X)):
    #         s += posterior[j, i] * X[j]
    #     newMeans2.append(s / np.sum(posterior[:, i]))
    
    # print(newMeans2)
    
    return newMeans

def getNewCovMats(X, currMeans, posteriors):
    newCovMats = []
    for i in range(len(currMeans)):
        mean = currMeans[i]
        covMat = np.zeros((len(mean), len(mean)))
        for j in range(len(X)):
            diff = X[j] - mean
            diff = diff.reshape(-1, 1)
            covMat += posteriors[j, i] * np.matmul(diff, diff.T)
        covMat /= np.sum(posteriors[:, i])
        newCovMats.append(covMat)
    return newCovMats

def getNewPriors(X, currMeans, posteriors):
    newPriors = []
    for i in range(len(currMeans)):
        newPriors.append(np.sum(posteriors[:, i]) / len(X))
    return newPriors

def getNewParams(X, currMeans, currCov, currPriors):
    posteriors = getPosteriorForAllDataPoints(X, currMeans, currCov, currPriors)
    means = getNewMeans(X, currMeans, posteriors)
    covMats = getNewCovMats(X, currMeans, posteriors)
    priors = getNewPriors(X, currMeans, posteriors)
    return means, covMats, priors

for i in range(100):
    means, covMats, priors = getNewParams(XTrain, means, covMats, priors)

print(covMats[0])

print(means[0])

# The outputs are similar so why does my EM give all wrong predictions when the kmeans always gives some right predictions




[[0.18086031 0.05172525 0.12129047 0.03015726]
 [0.05172525 0.09421832 0.05443571 0.03240981]
 [0.12129047 0.05443571 0.27813136 0.13045328]
 [0.03015726 0.03240981 0.13045328 0.0960222 ]]
[5.88510638 2.72765957 4.42765957 1.45319149]


[[0.23677937 0.07778165 0.19107901 0.06231944]
 [0.07778165 0.10606247 0.08153701 0.04018368]
 [0.19107901 0.08153701 0.2409081  0.07378545]
 [0.06231944 0.04018368 0.07378545 0.03516221]]
[5.94146184 2.74170488 4.25580933 1.31485304]
