<a href="https://colab.research.google.com/github/MANICKAVIGNESH/Identifying-groups-of-similar-wines/blob/main/Copy_of_Anubavam_Interview_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd

class Matrix:
    """Thin wrapper around a 2-D NumPy array with weighted k-means helpers.

    The wrapped array lives in ``array_2d`` (rows are observations, columns
    are features). The static methods implement a feature-weighted k-means
    procedure: points are assigned to the centroid with the smallest weighted
    squared distance, centroids are recomputed as cluster means, and feature
    weights are updated from the ratio of between-cluster to within-cluster
    dispersion.
    """

    def __init__(self, filename=None):
        """Create an empty matrix, or load one from ``filename`` if given."""
        self.array_2d = None
        if filename:
            self.load_from_csv(filename)

    def load_from_csv(self, filename):
        """Read a CSV file with pandas and store its values in ``array_2d``."""
        data = pd.read_csv(filename)
        self.array_2d = data.values

    def standardize(self):
        """Rescale every column in place as ``(x - mean) / (max - min)``.

        A constant column has a zero range; it is mapped to zeros rather than
        being divided by zero (which would fill the column with NaN).
        """
        column_mean = self.array_2d.mean(axis=0)
        column_range = self.array_2d.max(axis=0) - self.array_2d.min(axis=0)
        # For a constant column the numerator is already 0, so dividing by 1
        # yields 0 instead of the 0/0 = NaN the raw formula would produce.
        safe_range = np.where(column_range == 0, 1, column_range)
        self.array_2d = (self.array_2d - column_mean) / safe_range

    def get_distance(self, other_matrix, row_i):
        """Return the Euclidean distance from row ``row_i`` to each row of ``other_matrix``.

        Returns:
            Matrix: a single-column matrix with one distance per row of
            ``other_matrix``.
        """
        distances = np.linalg.norm(self.array_2d[row_i] - other_matrix.array_2d, axis=1)
        return Matrix.from_array(distances.reshape(-1, 1))

    def get_weighted_distance(self, other_matrix, weights, row_i):
        """Return the weighted squared distance from row ``row_i`` to each row of ``other_matrix``.

        The value for each row is ``sum_j w_j * (a_j - b_j) ** 2``; no square
        root is taken.

        Returns:
            Matrix: a single-column matrix with one value per row of
            ``other_matrix``.
        """
        weights_array = weights.array_2d.flatten()
        squared_diff = (self.array_2d[row_i] - other_matrix.array_2d) ** 2
        weighted_distances = np.sum(squared_diff * weights_array, axis=1)
        return Matrix.from_array(weighted_distances.reshape(-1, 1))

    def get_count_frequency(self):
        """Count how often each value occurs in a single-column matrix.

        Returns:
            dict mapping each distinct value to its count, or ``0`` when the
            matrix does not have exactly one column.
        """
        if self.array_2d.shape[1] != 1:
            return 0
        unique, counts = np.unique(self.array_2d, return_counts=True)
        return dict(zip(unique, counts))

    @staticmethod
    def from_array(array):
        """Wrap an existing array in a new ``Matrix`` (the array is not copied)."""
        new_matrix = Matrix()
        new_matrix.array_2d = array
        return new_matrix

    @staticmethod
    def get_initial_weights(m):
        """Return a 1 x ``m`` matrix of random weights that sum to 1."""
        weights = np.random.rand(1, m)
        return Matrix.from_array(weights / np.sum(weights))

    @staticmethod
    def get_centroids(data_matrix, K):
        """Pick ``K`` distinct random rows of ``data_matrix`` as initial centroids.

        The rows are returned as floats: centroids are later overwritten with
        cluster means, which would be silently truncated in an integer array.
        """
        random_rows = np.random.choice(data_matrix.array_2d.shape[0], K, replace=False)
        centroids = data_matrix.array_2d[random_rows, :].astype(float)
        return Matrix.from_array(centroids)

    @staticmethod
    def get_separation_within(data_matrix, centroids, S, K):
        """Return the per-feature within-cluster dispersion as a 1 x m matrix.

        For each feature j this is the sum, over clusters k = 1..K and over
        the rows labelled k in ``S``, of ``(D_ij - c_kj) ** 2``.
        """
        data = data_matrix.array_2d
        labels = S.array_2d.flatten()
        separation_within = np.zeros((1, data.shape[1]))

        for k in range(1, K + 1):
            members = data[labels == k]
            # An empty cluster contributes an empty sum, i.e. zero.
            separation_within[0] += ((members - centroids.array_2d[k - 1]) ** 2).sum(axis=0)

        return Matrix.from_array(separation_within)

    @staticmethod
    def get_separation_between(data_matrix, centroids, S, K):
        """Return the per-feature between-cluster dispersion as a 1 x m matrix.

        For each feature j this is ``sum_k N_k * |c_kj - mean_j|`` where
        ``N_k`` is the number of rows labelled k in ``S`` and ``mean_j`` is
        the column mean of the data.
        """
        # NOTE(review): this uses the absolute difference while
        # get_separation_within uses the squared difference; confirm against
        # the algorithm specification which of the two is intended.
        data = data_matrix.array_2d
        labels = S.array_2d.flatten()
        overall_mean = data.mean(axis=0)
        separation_between = np.zeros((1, data.shape[1]))

        for k in range(1, K + 1):
            n_k = np.count_nonzero(labels == k)
            if n_k == 0:
                # Skip explicitly so a non-finite centroid of an empty cluster
                # cannot turn 0 * nan into nan.
                continue
            separation_between[0] += n_k * np.abs(centroids.array_2d[k - 1] - overall_mean)

        return Matrix.from_array(separation_between)

    @staticmethod
    def get_new_weights(data_matrix, centroids, old_weights, S, K):
        """Return updated feature weights ``w'_j = (w_j + r_j) / 2``.

        ``r_j`` is ``b_j / a_j`` normalised to sum to 1, with ``a_j`` the
        within-cluster and ``b_j`` the between-cluster dispersion.

        Degenerate cases that would otherwise produce inf/NaN weights:
          * features with ``a_j == 0`` and ``b_j > 0`` separate the clusters
            perfectly; they share the whole normalised mass equally (the limit
            of the ratio as ``a_j`` goes to 0);
          * features with ``a_j == 0`` and ``b_j == 0`` contribute a ratio of 0;
          * if every ratio is 0 there is no information to update from and
            the old weights are returned unchanged.
        """
        within = Matrix.get_separation_within(data_matrix, centroids, S, K).array_2d.flatten()
        between = Matrix.get_separation_between(data_matrix, centroids, S, K).array_2d.flatten()
        old_weights_array = old_weights.array_2d.flatten()

        perfect = (within == 0) & (between > 0)
        if perfect.any():
            normalized_ratio = perfect / perfect.sum()
        else:
            ratio = np.divide(between, within, out=np.zeros_like(between), where=within > 0)
            total = ratio.sum()
            if not total > 0:
                return Matrix.from_array(old_weights_array.reshape(1, -1))
            normalized_ratio = ratio / total

        new_weights = 0.5 * (old_weights_array + normalized_ratio)
        return Matrix.from_array(new_weights.reshape(1, -1))

    @staticmethod
    def _build_summary(data_matrix, centroids, S):
        """Build the per-row summary DataFrame printed by ``get_groups``."""
        data = data_matrix.array_2d
        labels = S.array_2d.flatten()
        # Compute each cluster mean once instead of once per row.
        cluster_means = {
            label: data[labels == label].mean(axis=0)
            for label in np.unique(labels)
            if label > 0
        }

        def cluster_column_mean(label, column):
            # Label 0 means "never assigned"; a missing column yields NaN
            # instead of an IndexError on narrow data.
            if label > 0 and column < data.shape[1]:
                return cluster_means[label][column]
            return np.nan

        # NOTE(review): 'Average_Quality' and 'Average_Alcohol' are simply the
        # cluster means of columns 0 and 1; verify these labels match the
        # actual column order of the input data.
        return pd.DataFrame({
            'Wine_ID': np.arange(1, data.shape[0] + 1),
            'Group_Assignment': labels,
            'Centroid': [centroids.array_2d[int(label) - 1] if label > 0 else np.nan for label in labels],
            'Average_Quality': [cluster_column_mean(label, 0) for label in labels],
            'Average_Alcohol': [cluster_column_mean(label, 1) for label in labels],
        })

    @staticmethod
    def get_groups(data_matrix, K, max_iter=10, verbose=True):
        """Cluster the rows of ``data_matrix`` into ``K`` groups.

        Args:
            data_matrix: Matrix whose rows are the observations.
            K: number of clusters.
            max_iter: number of assign/update/re-weight rounds (default 10).
            verbose: when true (default), print the intermediate state and a
                final summary table.

        Returns:
            Matrix: an n x 1 matrix of cluster labels in ``1..K`` (stored as
            floats).
        """
        data = data_matrix.array_2d
        n_rows, n_features = data.shape

        S = Matrix.from_array(np.zeros((n_rows, 1)))
        centroids = Matrix.get_centroids(data_matrix, K)
        if verbose:
            print(f"Initial Centroids: {centroids.array_2d}")
        weights = Matrix.get_initial_weights(n_features)

        for _ in range(max_iter):
            if verbose:
                print("Current Weights:", weights.array_2d)

            # Assignment step: nearest centroid by weighted squared distance.
            for i in range(n_rows):
                distances = data_matrix.get_weighted_distance(centroids, weights, i)
                S.array_2d[i, 0] = np.argmin(distances.array_2d) + 1
            if verbose:
                print(f"Updated Cluster Assignments: {S.array_2d.flatten()}")

            # Update step. An empty cluster keeps its previous centroid: the
            # mean of zero rows is NaN, and np.argmin returns the first NaN
            # index, so a NaN centroid would capture every point afterwards.
            labels = S.array_2d.flatten()
            for k in range(1, K + 1):
                members = data[labels == k]
                if len(members):
                    centroids.array_2d[k - 1] = members.mean(axis=0)
            if verbose:
                print(f"New Centroids: {centroids.array_2d}")

            weights = Matrix.get_new_weights(data_matrix, centroids, weights, S, K)

        if verbose:
            print(Matrix._build_summary(data_matrix, centroids, S))
        return S

def run_test(filename='/content/drive/MyDrive/Company Project/Anubavam/Data (2).csv',
             k_values=range(2, 11), repeats=20):
    """Cluster the data set repeatedly and print the group sizes of each run.

    Args:
        filename: CSV file to load; defaults to the original Colab Drive path.
        k_values: iterable of cluster counts to try (default 2..10).
        repeats: number of clustering runs per cluster count (default 20).

    For every run a line of the form ``<k>=<label frequency dict>`` is
    printed, in addition to whatever ``Matrix.get_groups`` prints itself.
    """
    m = Matrix(filename)
    for k in k_values:
        for _ in range(repeats):
            S = Matrix.get_groups(m, k)
            print(f"{k}={S.get_count_frequency()}")


# Script entry point: announce the start, then run the clustering test.
# The guard keeps this from running when the module is imported.
if __name__ == "__main__":
    print("Starting test...")
    run_test()


Streaming output truncated to the last 5000 lines.
  8.93333333e+01 1.83666667e+00 1.31666667e+00 4.00000000e-01
  1.22833333e+00 4.80833333e+00 8.64166667e-01 2.25583333e+00
  5.16000000e+02]
 [1.26810000e+01 3.06850000e+00 2.40350000e+00 2.16300000e+01
  9.40000000e+01 1.92950000e+00 1.26000000e+00 4.25500000e-01
  1.32550000e+00 5.20500000e+00 8.32500000e-01 2.24250000e+00
  5.85350000e+02]
 [1.39205000e+01 1.76900000e+00 2.49750000e+00 1.72000000e+01
  1.06650000e+02 2.90800000e+00 3.08150000e+00 2.95500000e-01
  1.90850000e+00 6.32250000e+00 1.11700000e+00 3.00850000e+00
  1.36085000e+03]
 [1.36988462e+01 1.97807692e+00 2.37115385e+00 1.69423077e+01
  1.03807692e+02 2.83846154e+00 2.96076923e+00 2.77692308e-01
  1.89730769e+00 5.22807692e+00 1.05000000e+00 3.16423077e+00
  1.07269231e+03]
 [1.31510000e+01 2.46700000e+00 2.44850000e+00 1.91750000e+01
  1.11950000e+02 2.35350000e+00 2.05350000e+00 3.35000000e-01
  1.70850000e+00 5.32450000e+00 9.14800000e-01 2.80800000