In [None]:
import numpy as np
from numpy.linalg import norm


class Kmeans:
    """K-means clustering (Lloyd's algorithm) with squared Euclidean distance.

    Parameters
    ----------
    n_clusters : int
        Number of clusters, i.e. number of centroids to fit.
    max_iter : int, default 100
        Upper bound on the number of assign/update iterations run by ``fit``.
    random_state : None, int or numpy.random.Generator, default None
        Source of randomness for the initial centroid draw. ``None`` draws
        from NumPy's global random state (so ``np.random.seed`` still
        applies); any other value is handed to ``numpy.random.default_rng``,
        which makes the initial draw reproducible for a fixed integer seed.

    Attributes
    ----------
    centroids : ndarray, shape (n_clusters, n_features)
        Cluster centres; set by ``fit``.
    labels : ndarray, shape (n_samples,)
        Index of the closest final centroid for every training row; set by
        ``fit``.
    error : float
        Sum of squared distances between every training row and its assigned
        centroid; set by ``fit``.
    """

    def __init__(self, n_clusters, max_iter=100, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.random_state = random_state

    def initialize_centroids(self, X):
        """Pick ``n_clusters`` distinct rows of ``X`` as the starting centroids.

        Raises
        ------
        ValueError
            If ``n_clusters`` is smaller than 1 or larger than the number of
            rows in ``X`` (there would not be enough rows to draw from).
        """
        n_samples = X.shape[0]
        if not 1 <= self.n_clusters <= n_samples:
            raise ValueError(
                "n_clusters must be between 1 and the number of samples "
                "(got n_clusters=%r, n_samples=%r)" % (self.n_clusters, n_samples)
            )
        # Shuffle the row indices and keep the first n_clusters of them, so
        # the chosen rows are distinct positions in X.
        if self.random_state is None:
            random_index = np.random.permutation(n_samples)
        else:
            rng = np.random.default_rng(self.random_state)
            random_index = rng.permutation(n_samples)
        return X[random_index[:self.n_clusters]]

    def compute_distance(self, X, centroids):
        """Return the squared Euclidean distance from every row to every centroid.

        The result has shape (n_samples, n_clusters); column ``k`` holds the
        squared distance of each row of ``X`` to ``centroids[k]``.
        """
        distance = np.zeros((X.shape[0], self.n_clusters))
        for k in range(self.n_clusters):
            row_norm = norm(X - centroids[k, :], axis=1)  # one 2-norm per row
            distance[:, k] = np.square(row_norm)
        return distance

    def assign_closest_cluster(self, distance):
        """Return, for every row of ``distance``, the column index of its minimum."""
        return np.argmin(distance, axis=1)

    def compute_centroids(self, X, labels, previous_centroids=None):
        """Return the mean of the rows assigned to each cluster.

        Parameters
        ----------
        X : ndarray, shape (n_samples, n_features)
        labels : ndarray, shape (n_samples,)
            Cluster index assigned to each row of ``X``.
        previous_centroids : ndarray, shape (n_clusters, n_features), optional
            Centroids from the previous iteration. A cluster that received no
            rows keeps its previous centroid; when this argument is omitted
            the centroid of an empty cluster is NaN.
        """
        centroids = np.full((self.n_clusters, X.shape[1]), np.nan)
        for k in range(self.n_clusters):
            members = X[labels == k]
            if members.shape[0] > 0:
                centroids[k, :] = members.mean(axis=0)
            elif previous_centroids is not None:
                # Averaging zero rows would give NaN, which then corrupts
                # every later distance/argmin; carry the old centre forward.
                centroids[k, :] = previous_centroids[k]
        return centroids

    def compute_sse(self, X, labels, centroids):
        """Return the sum of squared distances of each row to its assigned centroid."""
        # centroids[labels] lines up the assigned centroid with every row of X.
        residuals = X - centroids[labels]
        return np.sum(np.square(residuals))

    def fit(self, X):
        """Fit centroids to ``X`` and store ``centroids``, ``labels`` and ``error``.

        Iterates assignment and centroid update until the centroids stop
        changing or ``max_iter`` iterations have run.

        Raises
        ------
        ValueError
            Propagated from ``initialize_centroids`` when ``n_clusters`` does
            not fit the number of rows in ``X``.
        """
        X = np.asarray(X)
        self.centroids = self.initialize_centroids(X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            distance = self.compute_distance(X, old_centroids)
            self.labels = self.assign_closest_cluster(distance)
            self.centroids = self.compute_centroids(X, self.labels, old_centroids)
            if np.array_equal(old_centroids, self.centroids):
                # Centroids unchanged, so labels already match them: converged.
                break
        else:
            # Reached only when the loop never hit ``break`` (iteration budget
            # exhausted, or max_iter == 0). The centroids were updated after
            # the last assignment, so refresh the labels to match them.
            distance = self.compute_distance(X, self.centroids)
            self.labels = self.assign_closest_cluster(distance)
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, X):
        """Return the index of the closest fitted centroid for every row of ``X``.

        Reads ``self.centroids``, which only ``fit`` sets, so calling this
        before ``fit`` raises ``AttributeError``.
        """
        X = np.asarray(X)
        distance = self.compute_distance(X, self.centroids)
        return self.assign_closest_cluster(distance)

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, Normalizer

""" Mean must be 0 and variance 1"""

# Directory holding the CSV files (presumably Colab's sample-data folder --
# change this one constant to run the notebook elsewhere).
DATA_DIR = '/content/sample_data'
# Only these two columns are kept as clustering features.
FEATURE_COLUMNS = ['longitude', 'latitude']

# Training split: read the CSV, keep the feature columns, convert to an array.
df_train = pd.read_csv(DATA_DIR + '/california_housing_train.csv')
df_new = df_train[FEATURE_COLUMNS]
X_train = df_new.to_numpy()
print(X_train.shape)

# Fit the scaler on the training rows only, then standardise them
# (each feature column ends up with mean 0 and variance 1).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# Test split: same column selection, scaled with the statistics learned from
# the training data (transform, not fit_transform).
df_test = pd.read_csv(DATA_DIR + '/california_housing_test.csv')
df_new_test = df_test[FEATURE_COLUMNS]
X_test = df_new_test.to_numpy()
X_test = scaler.transform(X_test)





(17000, 2)


In [None]:

# Kmeans picks its starting centroids with NumPy's global random generator,
# so seed it first: the cluster labels printed below are then the same on
# every Restart-and-Run-All instead of changing from run to run.
SEED = 42
np.random.seed(SEED)

Kmeans_model = Kmeans(n_clusters=5)
Kmeans_model.fit(X_train)

# Assign every (scaled) test row to its closest fitted centroid.
results = Kmeans_model.predict(X_test)
print('results = ', results)
print('results shape = ', results.shape)


results =  [1 4 4 ... 0 2 0]
results shape =  (3000,)
