In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random as rand

# Clustering Exercise 3 

## Data Preprocessing 

In [7]:
# Load the marks table; the first CSV column is used as the row index.
marks_data = pd.read_csv("Math_English_marks.csv", index_col = 0)

In [8]:
# Every column of the loaded table is treated as a clustering feature.
features = (marks_data.columns[:])
features

Index(['Math', 'English'], dtype='object')

In [9]:
# Drop every row that contains a missing value. dropna() returns a new frame,
# so the raw marks_data stays available unchanged.
marks_data_clean = marks_data.dropna()

In [10]:
# Hard-coded starting centroids: one row per centroid, one column per feature.
centroids = {"Math":[4,8,3],"English":[9,4,5]}
centroids = pd.DataFrame(centroids)
centroids

Unnamed: 0,Math,English
0,4,9
1,8,4
2,3,5


In [11]:
# Display the cleaned marks table to inspect it before clustering.
marks_data_clean

Unnamed: 0_level_0,Math,English
No,Unnamed: 1_level_1,Unnamed: 2_level_1
A,4,9
B,4,5
C,8,4
D,5,8
E,7,5
F,3,5
G,10,5
H,1,7


## Random Centroid Generator

In [6]:
# random centroids
def random_centroids(data, k, features):
    """Build ``k`` random starting centroids by sampling each feature independently.

    For every centroid, one value per feature is drawn at random from that
    feature's column in ``data``. Because the features are sampled separately,
    a generated centroid is not necessarily an existing row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns include every entry of ``features``.
    k : int
        Number of centroids to generate.
    features : iterable
        Column labels of ``data`` to sample from.

    Returns
    -------
    pandas.DataFrame
        One row per centroid and one column per feature.

    Raises
    ------
    IndexError
        If a sampled column is empty (raised by ``random.choice``).
    """
    # rand.choice picks ``seq[i]`` for a random integer ``i``. Passing a Series
    # with a non-integer index makes that an integer-key lookup on a labelled
    # Series, which pandas has deprecated. Plain lists are always indexed by
    # position, so sample from those instead.
    candidates = {feature: data[feature].tolist() for feature in features}

    centroids = []
    for _ in range(k):
        centroids.append(
            {feature: rand.choice(values) for feature, values in candidates.items()}
        )
    print(f"Generated {k} centroids for {len(features)} features")
    return pd.DataFrame(centroids)

In [7]:
# Demo: draw 4 random centroids from the cleaned data (displayed only, not stored).
random_centroids(marks_data_clean, 4, features)

Generated 4 centroids for 2 features


Unnamed: 0,Math,English
0,4,9
1,7,5
2,1,4
3,3,5


=====================================================================================================================

## Manhattan Distance Function

In [12]:
def get_distance(data, centroids):
    """Assign every row of ``data`` to its nearest centroid by Manhattan distance.

    ``data`` and ``centroids`` are DataFrames sharing the same feature columns;
    each row of ``centroids`` is one centroid. The full distance table (rows of
    ``data`` down the side, centroid positions across the top) is printed, and
    a Series mapping each row label to the position of its closest centroid is
    returned.
    """
    row_labels = data.index
    centroid_count = centroids.shape[0]

    # row label -> list of distances, one entry per centroid position
    distances_by_row = {}
    for position, label in enumerate(row_labels):
        distances_by_row[label] = [
            # Manhattan distance: sum of absolute differences over every
            # feature column the centroids carry
            sum(
                np.abs(data[column].iloc[position] - centroids[column].iloc[centroid_pos])
                for column in centroids
            )
            for centroid_pos in range(centroid_count)
        ]

    # The dict is keyed by row label, so transpose to get one line per row
    distance_table = pd.DataFrame(distances_by_row).T
    print(distance_table)
    # Column holding the smallest distance == closest centroid
    return distance_table.idxmin(axis=1)
         
    

In [13]:
# Demo: print the distance table and show the closest centroid for each row.
get_distance(marks_data_clean, centroids)

    0   1  2
A   0   9  5
B   4   5  1
C   9   0  6
D   2   7  5
E   7   2  4
F   5   6  0
G  10   3  7
H   5  10  4


A    0
B    2
C    1
D    0
E    1
F    2
G    1
H    2
dtype: int64

In [18]:
def manhattan_distance_easy(data, centroids):
    """Vectorised nearest-centroid assignment using Manhattan distance.

    Prints the distance table (one row per row of ``data``, one column per
    centroid) and returns, for every row, the label of the centroid with the
    smallest distance.
    """
    def _distance_to(centroid):
        # ``centroid`` is one centroid as a Series indexed by feature name, so
        # the subtraction lines up with the columns of ``data``.
        return np.abs(data - centroid).sum(axis = 1)

    # Transposing puts each centroid in its own column, so apply() hands them
    # to the helper one at a time.
    distance_table = centroids.T.apply(_distance_to)
    print(distance_table)
    return distance_table.idxmin(axis = 1)

In [19]:
# Demo: the vectorised version, run on the same inputs as get_distance.
manhattan_distance_easy(marks_data_clean, centroids)

     0   1  2
No           
A    0   9  5
B    4   5  1
C    9   0  6
D    2   7  5
E    7   2  4
F    5   6  0
G   10   3  7
H    5  10  4


No
A    0
B    2
C    1
D    0
E    1
F    2
G    1
H    2
dtype: int64

In [56]:
# marks_data_clean.index
# marks_data_clean
# centroids

In [20]:
# Assign every row to its nearest centroid and keep the labels for the cells below.
labels = get_distance(marks_data_clean, centroids)

    0   1  2
A   0   9  5
B   4   5  1
C   9   0  6
D   2   7  5
E   7   2  4
F   5   6  0
G  10   3  7
H   5  10  4


In [21]:
# Number of rows assigned to each cluster.
labels.value_counts()

2    3
1    3
0    2
dtype: int64

=====================================================================================================================

# Update the Centroid for each Cluster

In [22]:
def easy_update_centroid(labels, data):
    """Recompute the centroids as the per-cluster geometric mean of each feature.

    The rows of ``data`` are grouped by ``labels``; within each group every
    feature is reduced with exp(mean(log(x))), i.e. its geometric mean. The
    transposed result (features as rows, clusters as columns) is printed, and
    the untransposed frame (one row per cluster) is returned.

    Note: np.log is applied to the raw values, so zero or negative entries
    produce -inf / NaN in the intermediate result.
    """
    def _geometric_mean(cluster_rows):
        # Mean in log space, then map back with exp
        return np.exp(np.log(cluster_rows).mean())

    result = data.groupby(labels).apply(_geometric_mean)
    print(result.T) # the untransposed result has one row per cluster
    return result

In [23]:
# Demo: geometric-mean centroids for the current labels.
easy_update_centroid(labels, marks_data_clean)

                0         1         2
Math     4.472136  8.242571  2.289428
English  8.485281  4.641589  5.593445


Unnamed: 0,Math,English
0,4.472136,8.485281
1,8.242571,4.641589
2,2.289428,5.593445


In [24]:
def update_centroid(labels, data):
    """Return the new centroids: the arithmetic mean of every feature per cluster.

    ``data`` is grouped by ``labels`` and each group is averaged column by
    column. The result has one row per label value that occurs in ``labels``
    and the same columns as ``data``.
    """
    # Take the mean of each group of rows sharing a label
    return data.groupby(labels).mean()

In [25]:
# Demo: arithmetic-mean centroids for the current labels.
update_centroid(labels, marks_data_clean)

Unnamed: 0,Math,English
0,4.5,8.5
1,8.333333,4.666667
2,2.666667,5.666667


=====================================================================================================================

# Visualize the Clustering Algorithm

In [98]:
# imports
from sklearn.decomposition import PCA # projects N-dimensional data onto a few components so it can be drawn in 2D
from IPython.display import clear_output # clears the cell output so the plot can be redrawn on every iteration

In [131]:
def plot_clusters(data, labels, centroid, iteration):
    """Draw the clustered rows and the current centroids in a 2D PCA projection.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to plot, one column per feature.
    labels : array-like
        Cluster label of every row of ``data``; used to colour the points.
    centroid : pandas.DataFrame
        Current centroids, one row per centroid, with the same feature columns
        as ``data``.
    iteration : int
        Iteration number, shown as the plot title.

    The previous cell output is cleared first, so calling this repeatedly
    inside a loop redraws the figure instead of stacking plots.
    """
    pca = PCA(n_components=2)  # reduce the features to two plottable columns
    data_2d = pca.fit_transform(data)
    # Project the centroids that were passed in (not a module-level global).
    # They are already laid out as rows x features, which is what transform()
    # expects, so no transpose is needed.
    centroids_2d = pca.transform(centroid)
    clear_output(wait=True)  # replace the previous figure rather than appending
    plt.title(f'{iteration}')
    # x is the first PCA component, y the second; ``[:1]`` would be a row slice
    plt.scatter(x=data_2d[:, 0], y=data_2d[:, 1], c=labels)
    plt.scatter(x=centroids_2d[:, 0], y=centroids_2d[:, 1])
    plt.show()

=====================================================================================================================

# K-Means Main Loop

In [26]:
# Rebuild all inputs from scratch so the loop below starts from a known state:
# reload the CSV, take every column as a feature, reset the starting centroids
# and drop rows with missing values.
marks_data = pd.read_csv("Math_English_marks.csv", index_col = 0)
features = (marks_data.columns[:])
centroids = {"Math":[4,8,3],"English":[9,4,5]}
centroids = pd.DataFrame(centroids)
marks_data_clean = marks_data.dropna()

In [27]:
max_iteration = 8
iteration = 1
old_centroid = pd.DataFrame()  # empty frame: differs from the starting centroids, so the first pass always runs
# Stop the loop once the iteration counter reaches max_iteration or the centroids have stabilised.
# NOTE: the counter starts at 1 and the test is `<`, so at most max_iteration - 1 passes are executed.

while iteration < max_iteration and not centroids.equals(old_centroid):
    print(f"Iteration Count: {iteration}")
    # Remember the centroids used for this pass so the loop condition can compare them with the updated ones
    old_centroid = centroids
    # Assignment step: closest centroid (Manhattan distance) for every row
    labels = get_distance(marks_data_clean, centroids)
    # Update step: each centroid becomes the mean of the rows assigned to it
    centroids = update_centroid(labels, marks_data_clean)
    print(labels)
    print(centroids)
    # plot_clusters(marks_data_clean, labels, centroids, iteration)
    iteration = iteration + 1
    print("\n")
print("\n")
# Final cluster label of every row
labels

Iteration Count: 1
    0   1  2
A   0   9  5
B   4   5  1
C   9   0  6
D   2   7  5
E   7   2  4
F   5   6  0
G  10   3  7
H   5  10  4
A    0
B    2
C    1
D    0
E    1
F    2
G    1
H    2
dtype: int64
       Math   English
0  4.500000  8.500000
1  8.333333  4.666667
2  2.666667  5.666667


Iteration Count: 2
     0         1         2
A  1.0  8.666667  4.666667
B  4.0  4.666667  2.000000
C  8.0  1.000000  7.000000
D  1.0  6.666667  4.666667
E  6.0  1.666667  5.000000
F  5.0  5.666667  1.000000
G  9.0  2.000000  8.000000
H  5.0  9.666667  3.000000
A    0
B    2
C    1
D    0
E    1
F    2
G    1
H    2
dtype: int64
       Math   English
0  4.500000  8.500000
1  8.333333  4.666667
2  2.666667  5.666667






A    0
B    2
C    1
D    0
E    1
F    2
G    1
H    2
dtype: int64

In [33]:
# List the row labels that ended up in each cluster (clusters appear in the
# order their label first occurs in `labels`)
for i in labels.unique():
    in_cluster = labels == i
    print(f"Cluster {i}")
    print(f"Cluster Member:")
    # The row index of the filtered frame holds the member names
    print(list(marks_data_clean[in_cluster].index))
    print("\n")

Cluster 0
Cluster Member:
['A', 'D']


Cluster 2
Cluster Member:
['B', 'F', 'H']


Cluster 1
Cluster Member:
['C', 'E', 'G']


