In [1]:
import pandas as pd 
import plotly.express as px
from sklearn.cluster import KMeans
from random import randint, gauss

In [2]:
#mock data maker
def make_df(total_size, number_of_clusters, gauss_dist = 20):
    df = pd.DataFrame()
    x = []
    y = []
    cluster_num = []
    for i in range(number_of_clusters):
        center_x = randint(1, 100)
        center_y = randint(1, 100)
        for j in range(total_size//number_of_clusters):
            x.append(center_x+gauss(0, gauss_dist))
            y.append(center_y+gauss(0, gauss_dist))
            cluster_num.append(str(i+1))
    df['X'] = x
    df['y'] = y
    df['Cluster'] = cluster_num
    return df

my_df = make_df(1000, 3)
fig = px.scatter(my_df, x='X', y = 'y', color='Cluster')
fig.show()

In [36]:
class AutoKMeanCluster:
    def __init__(self):
        self.df = None
        self.x_label = ''
        self.y_label = ''
        self.centroids = []
        self.last_centroids = None
    
    def assign_first_cluster(self, number_of_clusters):
        """Initialize random first clusters within range of data"""
        x_min = int(self.df[self.x_label].min())
        x_max = int(self.df[self.x_label].max())
        y_min = int(self.df[self.x_label].min())
        y_max = int(self.df[self.x_label].max())
        self.centroids = [[randint(x_min, x_max), randint(y_min, y_max)] for i in range(number_of_clusters)]
        return self.centroids

    def distance(self, point_1, point_2):
        return sum([(point_1[i]-point_2[i])**2 for i in range(len(point_1))])**.5

    def assign_to_centroid(self, x, y): 
        shortest_distance = -1
        cluster_number = None
        for centroid in self.centroids:
            d = self.distance([x, y], centroid)
            if shortest_distance< d or shortest_distance == -1:
                shortest_distance = d
                cluster_number = self.centroids.index(centroid)+1
        return cluster_number

    def new_centroids(self):
        #TODO it seems like they just keep switching spots
        new_centroids = []
        print('START')
        for centroid in self.centroids:
            print(centroid)
            label = self.centroids.index(centroid)+1
            temp = self.df[self.df['Cluster']==label]
            new_centroids.append([temp[self.x_label].mean(), temp[self.y_label].mean()])
        print('New centroids', new_centroids)
        print('END')
        self.last_centroids = self.centroids
        self.centroids = new_centroids

    def still_adjusting(self):
        if self.last_centroids == None:
            return True
        for i in range(len(self.centroids)):
            if self.distance(self.centroids[i], self.last_centroids[i])>.0001:
                #print(self.distance(self.centroids[i], self.last_centroids[i]))
                return True
        return False

    def RUN(self, df, x_label, y_label, max_clusters):
        self.df = df
        self.x_label = x_label
        self.y_label = y_label
        for i in range(max_clusters):
            num_of_clusters = i+1
            self.assign_first_cluster(num_of_clusters)
            while self.still_adjusting():
                self.df['Cluster'] = self.df.apply(lambda x: self.assign_to_centroid(x[self.x_label], x[self.y_label]), axis=1)
                self.new_centroids()
            print('Cluster set complete', i)
        return self.centroids


In [37]:
akmc = AutoKMeanCluster()
test_df = make_df(1000, 3)
centroids = akmc.RUN(test_df, 'X', 'y', 6)
print(centroids)

 x 3 columns]
[6.299934707042329, 43.673214615553285]
[76.1721394145074, 54.97551160914423]
New centroids [[76.1721394145074, 54.97551160914423], [6.299934707042329, 43.673214615553285]]
END
START
             X           y  Cluster
0    52.599968   49.645333        2
1     2.551455   55.320344        1
2     0.915052   59.505233        1
3    30.928074   55.818565        1
4    -0.739406   38.495345        1
..         ...         ...      ...
994  75.768264   80.014813        2
995  77.013334  109.449011        2
996  90.413963   91.357467        2
997  70.256993   71.557339        2
998  48.407402   57.872313        2

[999 rows x 3 columns]
[76.1721394145074, 54.97551160914423]
[6.299934707042329, 43.673214615553285]
New centroids [[6.299934707042329, 43.673214615553285], [76.1721394145074, 54.97551160914423]]
END
START
             X           y  Cluster
0    52.599968   49.645333        1
1     2.551455   55.320344        2
2     0.915052   59.505233        2
3    30.928074   55.

KeyboardInterrupt: 

# SciKit Learn KNN


In [None]:
df = make_df()