In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Directory that holds kc_housingdata.csv. The original location stays the
# default; set the KC_HOUSING_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
data_dir=os.environ.get('KC_HOUSING_DATA_DIR','D:\\ML\\Clustering\\Data')

In [None]:
# Make the data directory the working directory so the CSV can be opened by
# its bare file name. This changes process-wide state for every later cell.
os.chdir(data_dir)

In [None]:
# Load the housing data; the relative file name is resolved against the
# current working directory.
data=pd.read_csv("kc_housingdata.csv")

In [None]:
# Preview the first rows to sanity-check the load.
data.head()

In [None]:
# Inspect the column dtypes before picking the numeric features.
data.dtypes

In [None]:
## Keep only the columns that are numeric and meaningful as quantities
numeric_columns=['price','bedrooms','bathrooms','sqft_living']
data_num=data[numeric_columns]

In [None]:
# Confirm the dtypes of the selected feature columns.
data_num.dtypes

In [None]:
## Scale the data, using pandas
def scale(x):
    """Standardise ``x`` to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    x : object supporting arithmetic, e.g. ``pandas.Series`` or ``numpy.ndarray``
        Values to standardise.

    Returns
    -------
    Same kind of object as ``x - np.mean(x)``
        ``(x - np.mean(x)) / np.std(x)``; ``np.std`` is called with its
        default ``ddof=0``. When the standard deviation is a scalar that is
        exactly zero (constant input) the centred values, i.e. zeros, are
        returned instead of the NaN produced by 0/0.
    """
    centered=x-np.mean(x)
    spread=np.std(x)
    # Zero variance: skip the division so a constant column becomes 0, not NaN.
    # The ndim check keeps non-scalar spreads on the original code path.
    if np.ndim(spread)==0 and spread==0:
        return centered
    return centered/spread
# Standardise column by column: axis=0 hands each column to `scale` in turn.
data_scaled=data_num.apply(scale,axis=0)

In [None]:
# Preview the standardised frame.
data_scaled.head()

In [None]:
## Standardise the same columns with scikit-learn instead of the hand-written helper
from sklearn import preprocessing
dat_scaled=preprocessing.scale(X=data_num,axis=0)

In [None]:
# Show the scaled values, then the container type and shape sklearn returned.
output_type=str(type(dat_scaled))
output_shape=str(dat_scaled.shape)
print(dat_scaled)
print("Type of output is "+output_type)
print("Shape of the object is "+output_shape)

In [None]:
## Create a cluster model
import sklearn.cluster as cluster

In [None]:
# Fit a 3-cluster k-means model on the standardised features.
# random_state is pinned (the same seed the later KMeans cells use) so the
# k-means++ initialisation, and therefore labels_ / cluster_centers_, are
# reproducible across kernel restarts.
kmeans=cluster.KMeans(n_clusters=3,init="k-means++",random_state=200)
kmeans=kmeans.fit(dat_scaled)

In [None]:
# Cluster index assigned to each row of dat_scaled by the fitted model.
kmeans.labels_

In [None]:
# Centroid coordinates; the model was fitted on dat_scaled, so these are in
# standardised units rather than the original feature units.
kmeans.cluster_centers_

In [None]:
from scipy.spatial.distance import cdist
# Euclidean distance from every observation (rows) to every centroid (columns)
point_to_centroid=cdist(dat_scaled,kmeans.cluster_centers_,'euclidean')
# Per observation, keep only the distance to its closest centroid
point_to_centroid.min(axis=1)

In [None]:
## Elbow method
# For each candidate k, record the average Euclidean distance from every
# observation to its nearest centroid. Despite its name, `wss` holds this
# average distance, not a within-cluster sum of squares.
from scipy.spatial.distance import cdist
K=range(1,20)
wss = []
for k in K:
    # A loop-local model name leaves the fitted global `kmeans` untouched, and
    # the fixed seed makes the curve repeatable from run to run.
    elbow_model = cluster.KMeans(n_clusters=k,init="k-means++",random_state=200)
    elbow_model.fit(dat_scaled)
    nearest = np.min(cdist(dat_scaled, elbow_model.cluster_centers_, 'euclidean'),
                     axis=1)
    # np.mean replaces the Python-level sum(...) / n over a NumPy array.
    wss.append(np.mean(nearest))


In [None]:
# Plot the average distance against k; the bend in the curve suggests a k.
fig,ax=plt.subplots()
ax.plot(K,wss,'bx')
ax.set_xlabel('k')
ax.set_ylabel('Average distortion')
ax.set_title('Selecting k with the Elbow Method')
plt.show()

In [None]:
from sklearn import metrics
# Fit a seeded 8-cluster model and keep only its per-row cluster labels.
kmeans_8=cluster.KMeans(n_clusters=8,random_state=200)
kmeans_8.fit(dat_scaled)
labels=kmeans_8.labels_

In [None]:
# Silhouette score for the current `labels`, estimated on a seeded random
# sample of 10000 rows rather than on every row.
# NOTE(review): the loop in the next cell samples 1000 rows, so its k=8 score
# is not directly comparable with this one — confirm which size is intended.
metrics.silhouette_score(dat_scaled,labels,metric="euclidean",sample_size=10000,random_state=200)

In [None]:
# Compare silhouette scores for k = 7..12. Each score is estimated on a seeded
# random sample of 1000 rows; `labels` is rebound on every iteration and ends
# up holding the k=12 labelling.
for i in range(7,13):
    labels=cluster.KMeans(n_clusters=i,random_state=200).fit(dat_scaled).labels_
    score=metrics.silhouette_score(dat_scaled,labels,metric="euclidean",
                                   sample_size=1000,random_state=200)
    print ("Silhouette score for k= "+str(i)+" is "+str(score))

In [None]:
## Let's try to find the cluster profiles
# Switch the working directory to the course "Codes" folder.
# NOTE(review): presumably this is where the cluster_profiles module lives —
# confirm. The original location stays the default; set the
# CLUSTER_PROFILES_DIR environment variable to use another folder without
# editing this cell.
import os
codes_dir=os.environ.get("CLUSTER_PROFILES_DIR",
                         "E:\\Work\\Machine Learning Course\\Python\\Module 6 Clustering\\Codes")
os.chdir(codes_dir)

In [None]:
# The profile cells call cluster_profiles.get_zprofiles / get_profiles, so the
# module has to be imported here; without it those cells raise NameError on a
# fresh kernel.
# NOTE(review): presumably cluster_profiles.py sits in the current working
# directory — confirm. If it is an installable package instead, run:
#!pip install cluster_profiles
import cluster_profiles

In [None]:
## Profiles are of interest for 8, 9 and 10 clusters; this cell fits the 8-cluster model
kmeans=cluster.KMeans(n_clusters=8,random_state=200)
kmeans=kmeans.fit(dat_scaled)

In [None]:
# Build profiles for the fitted `kmeans` model. A copy of data_num is passed,
# so the helper works on its own frame rather than on data_num itself.
# NOTE(review): presumably returns z-score based per-cluster profiles —
# confirm against the cluster_profiles source.
cluster_profiles.get_zprofiles(data=data_num.copy(),kmeans=kmeans)

In [None]:
# Same inputs as the z-profile call: a copy of data_num plus the fitted model.
# NOTE(review): presumably returns per-cluster profiles in the original
# feature units — confirm against the cluster_profiles source.
cluster_profiles.get_profiles(data=data_num.copy(),kmeans=kmeans)