In [None]:
# Importing numpy 
import numpy as np

""" 
1) Class: matrix
You will code a class called matrix, which will have an attribute called array_2d. 
This attribute is supposed to be a NumPy array containing numbers in two dimensions. The class matrix must have the following methods:
(in these, the parameters are in addition to self)

"""

class matrix:
    """
    Two-dimensional numeric matrix whose data lives in the attribute array_2d.

    array_2d is a NumPy array. A matrix can be built from a CSV file (each row
    of the file becomes one row of array_2d), from an existing array, or with
    no arguments, in which case array_2d starts as an empty (0 x 0) array.
    """

    def __init__(self, filename=None, arr=None):
        """
        Constructor function for matrix.

        filename: optional path of a CSV file, loaded through load_from_csv.
        arr: optional array-like used as array_2d. When both arguments are
             given the file is read first and arr then replaces its contents.
        """
        # Always define the attribute so that a bare matrix() is usable.
        self.array_2d = np.empty((0, 0))

        if filename is not None:
            self.load_from_csv(filename)

        if arr is not None:
            # np.asarray leaves an existing ndarray untouched (no copy).
            self.array_2d = np.asarray(arr)

    def load_from_csv(self, filename : str):
        """
        Load_from_csv:
        Read the CSV file `filename` (including, if necessary, its path and
        extension) and store its numbers in array_2d, one file row per array
        row. Blank lines are skipped. Returns the loaded array.

        Raises ValueError if a field cannot be converted to float.
        """
        rows = []
        # The with-block closes the file even when a value fails to parse.
        with open(filename, 'r') as f:
            for line in f:
                line = line.strip()
                if not line:
                    # e.g. an empty line at the end of the file
                    continue
                rows.append([float(value) for value in line.split(',')])

        self.array_2d = np.array(rows)
        return self.array_2d

    def standardise(self):
        """
        Standardise:
        Standardise array_2d column by column as
        (value - column mean) / (column max - column min).

        The result replaces array_2d and is also returned. A column whose
        values are all equal has a zero range; it is mapped to zeros instead
        of dividing by zero. An empty array is returned unchanged.
        """
        data = np.asarray(self.array_2d, dtype=float)
        if data.size == 0:
            self.array_2d = data
            return self.array_2d

        average = data.mean(axis=0)
        spread = data.max(axis=0) - data.min(axis=0)
        # For a constant column the numerator is already zero, so any
        # non-zero divisor yields the desired 0.
        spread = np.where(spread == 0, 1.0, spread)

        # Store the result: callers invoke standardise() for its effect on
        # the matrix and ignore the return value.
        self.array_2d = (data - average) / spread
        return self.array_2d

    def get_distance(self, other_matrix, weights, beta):
        """
        get_distance:
        Weighted distance between the single row of this matrix and every row
        of other_matrix.

        other_matrix: matrix with the same number of columns as this one.
        weights: one weight per column, given either as a NumPy array
                 (1-D or a single row) or as a matrix holding such a row.
        beta: exponent applied to every weight.

        For each row y of other_matrix the value is
        sum_j weights[j]**beta * (x[j] - y[j])**2, where x is the row of this
        matrix (no square root is taken). Returns a NumPy array with one row
        per row of other_matrix and one column. Returns None when this matrix
        does not have exactly one row.
        """
        if self.array_2d.shape[0] != 1:
            return None

        weight_values = np.asarray(getattr(weights, 'array_2d', weights), dtype=float)
        powerweights = np.power(weight_values, beta).reshape(1, -1)

        # Broadcasting subtracts this matrix's row from every row of other_matrix.
        sq_diff = np.square(other_matrix.array_2d - self.array_2d.reshape(1, -1))

        # Element-wise product keeps each weight paired with its own column.
        return np.sum(sq_diff * powerweights, axis=1).reshape(-1, 1)

    def get_count_frequency(self):
        """
        get_count_frequency:
        Count how often each element occurs in array_2d (intended for a
        matrix with a single column).

        Returns a dictionary mapping each distinct element to the number of
        times it appears; keys and counts are plain Python numbers.
        """
        unique, counts = np.unique(self.array_2d, return_counts=True)
        # tolist() converts NumPy scalars to built-in Python numbers.
        return dict(zip(unique.tolist(), counts.tolist()))

        


"""
get_initial_weights:
This function should have one parameter, an integer m. This function should return a matrix with 1
row and m columns containing random values, each between zero and one. The sum of these m
values should be equal to one.
"""


def get_initial_weights(m : int):
    """
    Draw m random values and rescale them so that they add up to one.

    m: number of weights to generate.

    Returns a one-dimensional NumPy array of length m.
    """
    raw = np.random.random(m)
    total = raw.sum()
    # Dividing by the total normalises the weights to sum to one.
    return raw / total
    

"""
get_centroids:
This function should have three parameters: (i) a matrix containing the data, (ii) the matrix S, (iii) the value of K. This function should implement Step 9 of the algorithm described in the appendix. 
It should return a matrix containing K rows and the same number of columns as the matrix containing the data.

"""
def get_centroids(mat : matrix, S : np.ndarray, K : int):
    """
    Update the centroids: each centroid is the mean of the rows assigned to it.

    mat: matrix containing the data (one data point per row). The rows are
         used as they are; standardising the data is left to the caller.
    S: array with one row per data point whose first column holds the index
       of the cluster the point belongs to. A matrix wrapping such an array
       is accepted as well.
    K: number of clusters.

    Returns a matrix with K rows and as many columns as the data. A cluster
    that has no rows assigned keeps an all-zero centroid instead of causing a
    division by zero.
    """
    data = mat.array_2d
    labels = np.asarray(getattr(S, "array_2d", S))[:, 0]

    centroids = np.zeros((K, data.shape[1]))
    for k in range(K):
        # Boolean mask selects every row currently assigned to cluster k.
        members = data[labels == k]
        if len(members) > 0:
            centroids[k, :] = members.mean(axis=0)

    return matrix(arr=centroids)


"""
get_groups:
This function should have three parameters: a matrix containing the data, and the number of groups to be created (K), 
and a number beta (for the distance calculation). This function follows the algorithm described in the appendix. 
It should return a matrix S (defined in the appendix). 
This function should use the other functions you wrote as much as possible. Do not keep repeating code you already wrote.

"""
def get_groups(mat, K, beta):
    """
    Split the rows of the data matrix into K groups.

    mat: matrix containing the data (one data point per row).
    K: number of groups to create.
    beta: exponent passed on to the distance and weight calculations.

    Starting from K randomly chosen rows as centroids, every row is assigned
    to its nearest centroid; centroids and weights are then recomputed. This
    repeats until a full pass changes no assignment.

    Returns a matrix S with one row per data point and a single column
    holding the index of the group the point was assigned to.
    """
    # standardise the data before clustering
    mat.standardise()

    n_rows, n_cols = mat.array_2d.shape

    # random starting weights, one per column
    weights = get_initial_weights(n_cols)

    # S has n rows and 1 column, every entry starting at zero
    S = np.zeros((n_rows, 1))

    # K distinct rows picked at random become the first centroids
    seed_rows = np.random.choice(n_rows, K, replace=False)
    centroids = matrix(arr=mat.array_2d[seed_rows, :].copy())

    while True:
        # number of points that switched group during this pass
        moved = 0

        for idx in range(n_rows):
            point = matrix(arr=mat.array_2d[idx, :].reshape(1, -1))
            nearest = np.argmin(point.get_distance(centroids, weights, beta))

            # record the new group only when it differs from the stored one
            if S[idx, 0] != nearest:
                S[idx, 0] = nearest
                moved += 1

        if moved == 0:
            break

        # assignments changed, so refresh centroids and weights
        centroids = get_centroids(mat, S, K)
        weights = get_new_weights(mat, centroids, S, beta)

    return matrix(arr=S)




"""
get_new_weights:
This function takes three parameters: a matrix containing the data, a matrix containing the centroids, 
and a matrix S (see the algorithm in the Appendix). The implementation below also takes a fourth parameter, the number beta.
This function should return a new matrix weights with 1 row and as many columns as the matrix containing the data (and the matrix containing the centroids).

"""

def get_new_weights(mat, centroids, S, beta):
    """
    Updates the existing weights by calculating dispersion.

    mat: matrix containing the data (one data point per row).
    centroids: matrix with one centroid per row.
    S: array with one row per data point whose first column holds the index
       of the centroid the point is assigned to. A matrix wrapping such an
       array is accepted as well.
    beta: number used in the exponent 1 / (beta - 1); it must differ from 1.

    The dispersion D_j of column j is the sum, over all data points, of the
    squared difference between the point and its centroid in that column.
    A column with zero dispersion gets weight 0. Every other column gets
    1 / sum_t (D_j / D_t) ** (1 / (beta - 1)), where t runs over the columns
    with non-zero dispersion, so the non-zero weights add up to one.

    Returns a NumPy array with 1 row and as many columns as the data.
    """
    data = mat.array_2d
    cents = centroids.array_2d
    m = data.shape[1]
    K = cents.shape[0]
    labels = np.asarray(getattr(S, "array_2d", S))[:, 0]

    # dispersion calculation, accumulated cluster by cluster
    disp = np.zeros(m)
    for k in range(K):
        members = data[labels == k]
        disp += np.square(members - cents[k, :]).sum(axis=0)

    wts = np.zeros((1, m))
    nonzero = disp != 0
    if nonzero.any():
        exponent = 1 / (beta - 1)
        d = disp[nonzero]
        # ratios[j, t] = d[j] / d[t]; zero-dispersion columns are left out so
        # that no division by zero can occur.
        ratios = d[:, None] / d[None, :]
        wts[0, nonzero] = 1.0 / np.power(ratios, exponent).sum(axis=1)

    return wts

"""
run_test:
The aim of this function is just to run a series of tests. By consequence, 
here (and only here) you can use hard-coded values for the strings containing the filenames of data and values for K.

"""
def run_test():
    """
    Run the grouping on 'Data.csv' for several values of K and beta.

    K takes the values 2, 3 and 4; beta runs from 1.1 to 2.4 in steps of 0.1.
    For every combination one line is printed with K, ten times beta, and the
    number of data points in each group.
    """
    data = matrix('Data.csv')
    for k in (2, 3, 4):
        # beta is iterated in tenths so that the loop uses exact integers
        for tenths in range(11, 25):
            groups = get_groups(data, k, tenths / 10)
            print(f"{k}-{tenths}={groups.get_count_frequency()}")
            


# Run the test sweep defined above; its printed results follow.
run_test()

2-11={0.0: 56, 1.0: 122}
2-12={0.0: 55, 1.0: 123}
2-13={0.0: 55, 1.0: 123}
2-14={0.0: 123, 1.0: 55}
2-15={0.0: 122, 1.0: 56}
2-16={0.0: 122, 1.0: 56}
2-17={0.0: 55, 1.0: 123}
2-18={0.0: 122, 1.0: 56}
2-19={0.0: 122, 1.0: 56}
2-20={0.0: 55, 1.0: 123}
2-21={0.0: 122, 1.0: 56}
2-22={0.0: 56, 1.0: 122}
2-23={0.0: 122, 1.0: 56}
2-24={0.0: 56, 1.0: 122}
3-11={0.0: 27, 1.0: 102, 2.0: 49}
3-12={0.0: 49, 1.0: 102, 2.0: 27}
3-13={0.0: 62, 1.0: 69, 2.0: 47}
3-14={0.0: 47, 1.0: 69, 2.0: 62}
3-15={0.0: 47, 1.0: 62, 2.0: 69}
3-16={0.0: 47, 1.0: 69, 2.0: 62}
3-17={0.0: 47, 1.0: 69, 2.0: 62}
3-18={0.0: 69, 1.0: 62, 2.0: 47}
3-19={0.0: 69, 1.0: 47, 2.0: 62}
3-20={0.0: 69, 1.0: 47, 2.0: 62}
3-21={0.0: 69, 1.0: 62, 2.0: 47}
3-22={0.0: 47, 1.0: 62, 2.0: 69}
3-23={0.0: 69, 1.0: 62, 2.0: 47}
3-24={0.0: 62, 1.0: 69, 2.0: 47}
4-11={0.0: 59, 1.0: 25, 2.0: 57, 3.0: 37}
4-12={0.0: 37, 1.0: 59, 2.0: 57, 3.0: 25}
4-13={0.0: 59, 1.0: 37, 2.0: 57, 3.0: 25}
4-14={0.0: 37, 1.0: 59, 2.0: 57, 3.0: 25}
4-15={0.0: 57, 1.0