In [152]:
import pandas as pd
import numpy as np

#load the document-term matrix from disk
f = pd.read_csv('matrix.csv')

#attach each document's starting cluster label as the column at position 7
#(label 0 is what the next cell selects as "unassigned")
f.insert(loc=7, column="Cluster", value=[0, 1, 0, 2, 3, 2, 3])

#header, a blank line, then the full table
print('Original Assignments\n')
print(f)

Original Assignments

  Unnamed: 0  Term1  Term2  Term3  Term4  Term5  Term6  Cluster
0       Doc1      2      2      1      0      2      1        0
1       Doc2      1      3      0      2      7      0        1
2       Doc3      0      2      3      3      0      2        0
3       Doc4      2      1      0      3      8      4        2
4       Doc5      6      5      0      1      4      0        3
5       Doc6      2      1      6      8      0      4        2
6       Doc7      5      3      1      0      2      0        3


In [164]:
import math

#function for computing cosine similarity b/w two vectors
def cos_sim(x, y):
    '''Return the cosine similarity between two equal-length numeric vectors.

    x, y: any iterables of numbers (lists, tuples, pandas Series, ...). They are
    materialized as lists and paired by position, so the result never depends on
    how the inputs happen to be indexed.

    Returns dot(x, y) / (|x| * |y|). If either vector has zero magnitude the
    similarity is undefined, and 0.0 is returned instead of dividing by zero.

    Raises ValueError if the two vectors do not have the same number of elements.
    '''
    xs = list(x)
    ys = list(y)
    if len(xs) != len(ys):
        raise ValueError('cos_sim requires vectors of equal length')

    num = sum(a * b for a, b in zip(xs, ys))
    d1 = math.sqrt(sum(a * a for a in xs))
    d2 = math.sqrt(sum(b * b for b in ys))

    #a zero vector has no direction; report "no similarity" rather than crash
    if d1 == 0 or d2 == 0:
        return 0.0
    return num / (d1 * d2)

#function for finding cluster with highest cosine similarity for a particular vector
def new_cluster(x, y):
    '''Return the 1-based number of the cluster whose centroid is most similar to x.

    x: the vector to assign.
    y: list of centroid vectors, where y[0] is the centroid of cluster 1,
       y[1] of cluster 2, and so on. Any number of centroids is accepted.

    Similarity is measured with cos_sim. On a tie the lowest-numbered cluster
    wins, because list.index returns the first position of the maximum.

    Raises ValueError if y is empty (max() of an empty list).
    '''
    cos_sims = [cos_sim(x, centroid) for centroid in y]
    #cluster labels start at 1 (label 0 is reserved for "unassigned")
    return cos_sims.index(max(cos_sims)) + 1

#the below code shows the first iteration of k-means

#creates separate dataframes for each cluster (label 0 marks documents that are still unassigned)
unassigned = f.loc[f['Cluster'] == 0]
cluster1 = f.loc[f['Cluster'] == 1]
cluster2 = f.loc[f['Cluster'] == 2]
cluster3 = f.loc[f['Cluster'] == 3]

#computes the centroid of each cluster
#numeric_only=True skips the text column holding the document names; without it
#pandas >= 2.0 raises TypeError when averaging that column. The first six numeric
#columns are Term1..Term6 (the trailing numeric column is Cluster itself).
avg1, avg2, avg3 = (
    list(cluster.mean(numeric_only=True))[0:6]
    for cluster in (cluster1, cluster2, cluster3)
)
averages = [avg1, avg2, avg3]

#extracts values from unassigned vectors (positions 1..6 of a row are Term1..Term6)
doc3 = list(unassigned.loc[2])[1:7]
doc1 = list(unassigned.loc[0])[1:7]

#assigns new cluster to doc3 and doc1 on a deep copy so the original frame f stays untouched
clustersv2 = f.copy(deep=True)
clustersv2.at[2, 'Cluster'] = new_cluster(doc3, averages)
clustersv2.at[0, 'Cluster'] = new_cluster(doc1, averages)
print('Iteration 1')
print()
print(clustersv2)

Iteration 1

  Unnamed: 0  Term1  Term2  Term3  Term4  Term5  Term6  Cluster
0       Doc1      2      2      1      0      2      1        3
1       Doc2      1      3      0      2      7      0        1
2       Doc3      0      2      3      3      0      2        2
3       Doc4      2      1      0      3      8      4        2
4       Doc5      6      5      0      1      4      0        3
5       Doc6      2      1      6      8      0      4        2
6       Doc7      5      3      1      0      2      0        3


In [166]:
#the code in this cell executes iterations of k-means via a while loop until no vectors are re-assigned
#to a different cluster
#NOTE(review): there is no cap on the number of passes; if assignments ever oscillate this loop
#will not terminate - consider adding a maximum iteration count

#n is the stop flag: 0 = keep iterating, 1 = converged
n = 0
while n == 0:

    #creates separate dataframes for each cluster
    cluster1 = clustersv2.loc[clustersv2['Cluster'] == 1]
    cluster2 = clustersv2.loc[clustersv2['Cluster'] == 2]
    cluster3 = clustersv2.loc[clustersv2['Cluster'] == 3]

    #computes the centroid of each cluster
    #numeric_only=True skips the text column holding the document names; without it
    #pandas >= 2.0 raises TypeError. The first six numeric columns are Term1..Term6.
    avg1 = list(cluster1.mean(numeric_only=True))[0:6]
    avg2 = list(cluster2.mean(numeric_only=True))[0:6]
    avg3 = list(cluster3.mean(numeric_only=True))[0:6]
    averages = [avg1, avg2, avg3]

    #variable to count number of changes made during iteration
    m = 0

    #loops through rows, finds new closest cluster (if any)
    #centroids stay fixed for the whole pass; only the labels are updated here
    for i in clustersv2.index:
        #plain list of Term1..Term6 so the similarity code can index it by position
        #(integer indexing on a label-indexed Series is deprecated in recent pandas)
        a = list(clustersv2.loc[i])[1:7]
        b = new_cluster(a, averages)
        if clustersv2.at[i, 'Cluster'] != b:
            m = m + 1
            clustersv2.at[i, 'Cluster'] = b

    #ends loop if no new re-assignments are found
    if m == 0:
        n = 1

print('Final Output')
print()
print(clustersv2)

Final Output

  Unnamed: 0  Term1  Term2  Term3  Term4  Term5  Term6  Cluster
0       Doc1      2      2      1      0      2      1        3
1       Doc2      1      3      0      2      7      0        1
2       Doc3      0      2      3      3      0      2        2
3       Doc4      2      1      0      3      8      4        1
4       Doc5      6      5      0      1      4      0        3
5       Doc6      2      1      6      8      0      4        2
6       Doc7      5      3      1      0      2      0        3
