In [21]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import geopy.distance
import random
from sklearn.cluster import KMeans as skKMeans
import time

In [37]:
house_df = pd.read_csv('kc_house_data.csv', sep=',')
house_df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


# **K-MEANS**

**My realisation**

In [32]:
class KMeans:
    def __init__(self, k, repeat=10, loss='wcss'):
        self.k = k
        self.repeat=repeat
        self.loss = loss
        self.x_in_cluster_dict = {}
        self.centers = None
        self.clusters = None
        self.losses = []
        self.max_itiration = 100

    @staticmethod
    def _distance(arr1, arr2):
        assert arr1.shape[1] == arr2.shape[-1], 'arr1 colls nb != arr2 colls nb'
        return np.linalg.norm(arr1 - arr2, axis=1)

    def _wcss(self, clusters_centers):
        result = 0
        for cluster in self.x_in_cluster_dict:
            result += np.sum(self._distance(self.x_in_cluster_dict[cluster], clusters_centers[cluster]))
        return result

    def _get_new_centers(self, x):
        new_centers = np.zeros((self.k, x.shape[1]), dtype="float16")
        for i in range(self.k):
            x_in_cluster = x[self.clusters == i]
            self.x_in_cluster_dict[i] = x_in_cluster
            new_centers[i] = np.sum(x_in_cluster, axis=0) / x_in_cluster.shape[0] if x_in_cluster.shape[0] != 0 else self.centers[i]
        return new_centers

    def _get_clusters(self, x, clust_cent):
        dist_clust_array = np.zeros((x.shape[0], clust_cent.shape[0]), dtype="float16")
        for i in range(self.k):
            dist_clust_array[:, i] = self._distance(x, clust_cent[i])
        self.clusters = np.argmin(dist_clust_array, axis=1)

    def single_fit_predict(self, X):
        dimension = X.shape[1]
        self.centers = np.zeros((self.k, dimension), dtype="float16")
        new_centers = np.zeros((self.k, dimension), dtype="float16")
        for i in range(self.k):
            new_centers[i] = np.array([
                random.uniform(np.min(X[:, j]), np.max(X[:, j])) for j in range(dimension)
            ])
        i = 0
        while (new_centers != self.centers).any() and i <= self.max_itiration:
            self.centers = new_centers
            self._get_clusters(X, new_centers)
            new_centers = self._get_new_centers(X)
            if self.loss == 'wcss':
                self.losses.append(self._wcss(new_centers))
            else:
                pass
            i += 1

        else:
            return {
                'clusters': self.clusters,
                'centers': new_centers,
                'loss': self.losses
            }

    def fit_predict(self, X):
        quality = float('inf')
        ultimate_results = {
            'clusters': None,
            'centers': None,
            'loss': None
        }
        for i in range(self.repeat):
            single_results = self.single_fit_predict(X)
            if single_results['loss'][-1] < quality:
                ultimate_results = single_results
        return ultimate_results

**compare mine to sklearn**

**MINE**

In [35]:
start = time.time()
km = KMeans(15, repeat=100)
res = km.fit_predict(house_df[['lat', 'long']].to_numpy())
res['clusters']
print('time:', time.time() - start)

time: 3.1569607257843018


**SKLEARN**

In [36]:
start = time.time()
sk_km = skKMeans(n_clusters=15, n_init=100)
sk_results = sk_km.fit_predict(house_df[['lat', 'long']])
print('time:', time.time() - start)

time: 1.3909621238708496


In [14]:
sk_results

array([12,  5,  0, ...,  9,  8,  9])

In [16]:
house_df_clust = house_df.copy()
house_df_clust['sk_clust'] = sk_results.astype(str)
house_df_clust['mine_clust'] = res['clusters'].astype(str)

In [17]:
fig = px.scatter_mapbox(house_df_clust, lat="lat", lon="long", color="sk_clust", zoom=5, height=1000, size='price')
fig.update_layout(
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        },
      ])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.write_html('SK_Clust_Geolocation_Map_cluster.html')

In [18]:
fig = px.scatter_mapbox(house_df_clust, lat="lat", lon="long", color="mine_clust", zoom=5, height=1000, size='price')
fig.update_layout(
    mapbox_style="white-bg",
    mapbox_layers=[
        {
            "below": 'traces',
            "sourcetype": "raster",
            "sourceattribution": "United States Geological Survey",
            "source": [
                "https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"
            ]
        },
      ])
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.write_html('Mine_Clust_Geolocation_Map_cluster.html')

# **Hierarchical clustering**

# **DBSCAN**

In [38]:
class DBSCAN:
    def __init__(self, eps, min_poins=3, metric='euclidean'):
        self.eps = eps
        self.min_poins = min_poins
        self.visited = set()
        self.possibly_noise = set()
        self.clusterred_points = set()
        self.clusters = {0: set()}
        self.metric = metric
        self.current_cluster = 0
        self.cluster_vector = None

    @staticmethod
    def euclidean(x_1, x_2):
        return np.linalg.norm(x_1 - x_2, axis=0)

    def get_neighbors(self, point, x):
        if self.metric == 'euclidean':
            possibly_close = x
            for i in range(x.shape[1]):
                possibly_close = possibly_close[abs(possibly_close[:, i] - point[i]) <= self.eps]
            return np.array([obj for obj in possibly_close if self.euclidean(obj, point) <= self.eps])
        else:
            exit('any other metrics are not supported yet!')

    def expand_cluster(self, root, neighbors):
        self.clusterred_points.add(root)
        self.clusters[self.current_cluster].add(root)
        for each in neighbors:
            if root in self.visited and each not in self.possibly_noise:
                continue
            elif root in self.visited and each in self.possibly_noise:
                self.possibly_noise.remove(root)
            new_neighbors = self.get_neighbors(each, root)
            for new_neighbor in new_neighbors:
                self.expand_cluster(new_neighbor, new_neighbors)

    def fit_predict(self, x):
        for each in x:
            if each in self.visited:
                continue
            self.visited.add(each)
            neighbors = self.get_neighbors(each, x)
            if neighbors.shape[0] >= self.min_poins:
                self.current_cluster += 1
                self.expand_cluster(each, neighbors)
        self.clusters[-1] = self.possibly_noise
        self.cluster_vector = np.zeros(x.shape)
        for key in self.clusters:
            for obj in self.clusters[key]:
                self.cluster_vector[x == obj] = key

In [39]:
mine_db = DBSCAN(0.01)
mine_db.fit_predict(house_df[['lat', 'long']].to_numpy())

TypeError: unhashable type: 'numpy.ndarray'