In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
import sys

In [None]:
# Read the housing table from disk and keep a separate handle on the
# 'MedianHouseValue' column, which is plotted per cluster at the end.
housing_data = pd.read_csv("housing.csv")
house_value = housing_data["MedianHouseValue"]

# Bare last expression: rendered as this cell's output.
housing_data

In [4]:
# Keep only the columns that are fed to the clustering.
# The drop re-binds the name (instead of inplace=True) and ignores columns
# that are already gone, so re-running this cell does not raise a KeyError.
housing_data = housing_data.drop(
    columns=['HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'MedianHouseValue'],
    errors='ignore',
)

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old spelling. Try the current name first and fall back to
# the legacy one so the cell runs on both old and new installations.
try:
    dense_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    dense_encoder = OneHotEncoder(sparse=False)

# Categorical columns: one-hot encode, then impute and standardise the
# resulting indicator columns.
# NOTE(review): the imputer runs after the encoder, so it only ever sees the
# encoder's numeric output - confirm whether imputing before encoding was intended.
cat_pipeline = Pipeline([
    ('encoder', dense_encoder),
    ('median_imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Numeric columns: fill missing values with the column median, then standardise.
# NOTE(review): the step is named 'mean_imputer' but uses the median strategy;
# the name is left unchanged because pipeline steps are addressable by name.
num_pipeline = Pipeline([
    ('mean_imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Route columns to a pipeline by dtype.
housing_num = housing_data.select_dtypes(include=[np.number]).columns
housing_cat = housing_data.select_dtypes(include=['object']).columns

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, housing_num),
    ("cat", cat_pipeline, housing_cat),
])

housing_prepared = full_pipeline.fit_transform(housing_data)
# `rows` / `cols` stay module-level names because code further down the
# notebook reads the global `rows`.
rows, cols = housing_prepared.shape

In [5]:
def k_means(dataset, k):
    """Cluster `dataset` into `k` groups by repeated assign/recompute steps.

    Starting centroids come from `initialize_centroids`. Each pass assigns
    every row to a centroid (`calculate_dist_centroid_pairs`) and recomputes
    the centroids (`recalculate_centroids`); the loop ends on the first pass
    whose recomputed centroids equal the previous ones.

    Parameters:
        dataset: 2-D array-like exposing `.shape`; one row per point.
        k: number of clusters.

    Returns:
        (centroids, dist_centroid_pairs): the last recomputed centroids and
        the (row index, centroid index) pairs from the final assignment pass.
    """
    n_rows, _ = dataset.shape
    current = initialize_centroids(n_rows, k, dataset)

    while True:
        assignments = calculate_dist_centroid_pairs(current, dataset, k, n_rows)
        updated = recalculate_centroids(assignments, k, dataset)

        converged = no_changes_centroids(current, updated)
        current = updated
        if converged:
            return current, assignments

def initialize_centroids(rows, k, dataset):
    """Return `k` rows of `dataset`, chosen at random, as starting centroids.

    Parameters:
        rows: number of rows to sample positions from (0 .. rows-1).
        k: how many centroids to pick.
        dataset: indexable collection of points.

    Returns:
        A list with `k` entries, each `dataset[position]` for a sampled position.
    """
    # Sampling without replacement, so no row position is picked twice.
    chosen = np.random.choice(rows, size=k, replace=False)
    return [dataset[chosen[pos]] for pos in range(k)]

def calculate_dist_centroid_pairs(centroids, dataset, k, rows):
    """Assign each of the first `rows` points to a centroid.

    For every row the distance to each of the first `k` centroids is computed
    with `calculate_distance`; the centroid with the smallest distance wins,
    and on a tie the lowest centroid index is kept.

    Returns:
        A list of `(row index, centroid index)` tuples, one per row, in row
        order. The centroid index is -1 when no distance was below the
        `sys.maxsize` starting threshold.
    """
    assignments = []
    for row_idx in range(rows):
        best_dist, best_cluster = sys.maxsize, -1

        for cluster_idx in range(k):
            candidate = calculate_distance(centroids[cluster_idx], dataset[row_idx])
            # Strict comparison: an equal distance never replaces the current best.
            if candidate < best_dist:
                best_dist, best_cluster = candidate, cluster_idx

        assignments.append((row_idx, best_cluster))

    return assignments

def recalculate_centroids(pairs, k, dataset):
    """Recompute every centroid as the mean of the points assigned to it.

    Parameters:
        pairs: iterable of `(row index, cluster index)` tuples.
        k: number of clusters; cluster indices outside 0 .. k-1 are ignored.
        dataset: indexable collection of equally long points.

    Returns:
        A list of `k` centroids, each a list with one mean per coordinate.

    Raises:
        ZeroDivisionError: if a cluster has no points assigned to it, since
            its mean is undefined.
    """
    # Dimensionality comes from the data rather than a hard-coded constant.
    n = len(dataset[0])
    sums = [[0] * n for _ in range(k)]
    counts = [0] * k

    # One pass over the assignments instead of one full scan per cluster.
    for point_index, cluster in pairs:
        if not 0 <= cluster < k:
            continue
        # Count each point once (not once per coordinate), so that the
        # division below yields the mean.
        counts[cluster] += 1
        point = dataset[point_index]
        cluster_sums = sums[cluster]
        for c in range(n):
            cluster_sums[c] += point[c]

    centroids = []
    for cluster in range(k):
        members = counts[cluster]
        if members == 0:
            raise ZeroDivisionError(
                "cluster " + str(cluster) + " has no assigned points; "
                "its centroid cannot be computed"
            )
        centroids.append([total / members for total in sums[cluster]])

    return centroids

def calculate_distance(centroid, point):
    """Return the Euclidean distance between `centroid` and `point`.

    Only the first `len(point)` coordinates of `centroid` are used.
    """
    squared_total = 0.0
    for axis in range(len(point)):
        delta = centroid[axis] - point[axis]
        squared_total += delta ** 2

    return squared_total ** (0.5)

def no_changes_centroids(old_centroids, new_centroids):
    """Return True when both centroid collections have the same shape and equal values."""
    unchanged = np.array_equal(old_centroids, new_centroids)
    return unchanged

In [None]:
# Choose k by davies_bouldin_score: the k with the smallest score is kept.

min_score = float('inf')
davies_k = 2

for k in range(2, 7):
    # cluster the normalized data with the current k
    final_centroids, final_pairs = k_means(housing_prepared, k)

    # cluster label of every row, in row order
    cluster_indices = [assignment[1] for assignment in final_pairs]

    score = metrics.davies_bouldin_score(housing_prepared, cluster_indices)
    print(f"Davis-Bouldin Score for k = {k} is : {score!s}")

    if score < min_score:
        min_score, davies_k = score, k

print(f"\nThe Most appropriate k value according to Davis-Bouldin Score is {davies_k}")

In [None]:
# Choose k by silhouette_score as well: the k with the largest score is kept.

max_score = -1
sil_k = 2

for k in range(2, 7):
    # cluster the normalized data with the current k
    final_centroids, final_pairs = k_means(housing_prepared, k)

    # cluster label of every row, in row order
    cluster_indices = [assignment[1] for assignment in final_pairs]

    score = metrics.silhouette_score(housing_prepared, cluster_indices)
    print(f"Silhouette Score for k = {k} is : {score!s}")

    if score > max_score:
        max_score, sil_k = score, k

print(f"\nThe Most appropriate k value according to Silhouette Score is {sil_k}")

In [None]:
# Take the k selected by the Davies-Bouldin search and run k-means again with it.
k = davies_k
final_centroids, final_pairs = k_means(housing_prepared, k)

print(f"The final value of k we're taking is {k}")

In [None]:
# One box plot of house values per cluster.
for ind in range(k):
    # house values of the rows assigned to cluster `ind`
    h = [house_value.iloc[row] for row, cluster in final_pairs if cluster == ind]
    print(f"Box Plot for Cluster {ind}")
    plt.boxplot(h)
    plt.show()