In [15]:
import cluster
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.cluster import KMeans 
from sklearn import metrics 
from scipy.spatial.distance import cdist 

In [12]:
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler, RobustScaler, QuantileTransformer
from sklearn.model_selection import train_test_split

In [3]:
# Load the working frame plus a companion `lat_long` object from the
# project-local `cluster` module.
# NOTE(review): `lat_long` is not referenced again in the cells shown here.
df, lat_long = cluster.get_data()

In [4]:
# (rows, columns) of the freshly loaded frame.
df.shape

(73424, 61)

In [5]:
# Derive the modelling frame from the raw data via the project helper.
# The shape check in the following cell shows how many rows/columns survive.
df2 = cluster.make_structure_data(df)

In [6]:
# (rows, columns) after make_structure_data, for comparison with df.shape.
df2.shape

(65266, 11)

In [7]:
# Partition the modelling frame into train and test sets (the split logic
# lives in cluster.split_data).
# NOTE(review): the recorded shapes go from 11 columns (df2) to 9 (train), so
# split_data appears to drop columns as well as rows -- confirm in cluster.py.
train, test = cluster.split_data(df2)

In [8]:
# (rows, columns) of the training partition.
train.shape

(52212, 9)

In [None]:
# Preview the first rows of the training partition.
train.head()

In [18]:
# Scale both partitions with the project helper. The third return value
# (`blob`) is the object later handed to my_inv_transform as `scaler`, where
# its inverse_transform method is called.
std_train, std_test, blob = cluster.standardize_train_test(train,test)

In [None]:
# (rows, columns) of the scaled training partition.
std_train.shape

In [10]:
# Cluster the scaled training rows with the project helper. Later cells read
# the assignment from the `cluster_labels` column of the returned frame.
df_with_clusters = cluster.make_clusters(std_train)

In [None]:
# Column names, dtypes and non-null counts of the clustered frame.
df_with_clusters.info()

In [None]:
# Distinct cluster labels and how many rows carry each one.
np.unique(df_with_clusters.cluster_labels, return_counts=True)

In [None]:
# Visualise the clustered rows; rendering is done entirely by the project
# helper (presumably a geographic scatter, going by its name -- confirm).
cluster.show_clusters_on_map(df_with_clusters)

## EXPLORE THESE CLUSTERS

In [None]:
# Per-cluster mean of every column, to see how the clusters differ.
df_with_clusters.groupby('cluster_labels').mean()

In [None]:
# Rows assigned to cluster label 0.
# NOTE(review): the "rich" name is an interpretation (presumably read off the
# per-cluster means). KMeans label numbers are arbitrary and can change between
# fits unless make_clusters fixes a seed -- confirm before relying on it.
rich_houses = df_with_clusters[df_with_clusters.cluster_labels == 0]

In [None]:
# Rows assigned to cluster labels 1 and 2 respectively.
# NOTE(review): the "poor"/"std" names are interpretations of the cluster
# contents; the label-to-meaning mapping only holds if the clustering is
# reproducible -- confirm.
poor_houses = df_with_clusters[df_with_clusters.cluster_labels == 1]
std_houses = df_with_clusters[df_with_clusters.cluster_labels == 2]

In [None]:
# Size of the cluster-1 subset.
poor_houses.shape

In [None]:
# Size of the cluster-2 subset.
std_houses.shape

In [None]:
# Mean of the logerror column across all clustered rows.
df_with_clusters.logerror.mean()

In [None]:
# Distribution of logerror over all clustered rows, zoomed to [-2, 2].
# A title is set so this figure can be told apart from the per-cluster plots,
# and the trailing `;` keeps the return value out of the cell output.
# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot/displot replace
# it); it is kept because the installed seaborn version is not known here.
ax = sns.distplot(df_with_clusters.logerror)
ax.set(xlim=(-2, 2), title="logerror: all clustered rows");

In [None]:
# Distribution of logerror within the cluster-0 subset, zoomed to [-2, 2].
# A title is set so this figure can be told apart from the other logerror
# plots, and the trailing `;` keeps the return value out of the cell output.
# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot/displot replace
# it); it is kept because the installed seaborn version is not known here.
ax = sns.distplot(rich_houses.logerror)
ax.set(xlim=(-2, 2), title="logerror: rich_houses (cluster 0)");

In [None]:
# Distribution of logerror within the cluster-2 subset, zoomed to [-2, 2].
# A title is set so this figure can be told apart from the other logerror
# plots, and the trailing `;` keeps the return value out of the cell output.
# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot/displot replace
# it); it is kept because the installed seaborn version is not known here.
ax = sns.distplot(std_houses.logerror)
ax.set(xlim=(-2, 2), title="logerror: std_houses (cluster 2)");

In [None]:
# Distribution of logerror within the cluster-1 subset, zoomed to [-2, 2].
# A title is set so this figure can be told apart from the other logerror
# plots, and the trailing `;` keeps the return value out of the cell output.
# NOTE: sns.distplot is deprecated since seaborn 0.11 (histplot/displot replace
# it); it is kept because the installed seaborn version is not known here.
ax = sns.distplot(poor_houses.logerror)
ax.set(xlim=(-2, 2), title="logerror: poor_houses (cluster 1)");

In [None]:
# Elbow-method sweep: fit KMeans for k = 1..9 and record, for each k, the mean
# distance from every row to its nearest centroid (distortion) and the model's
# inertia_.
#
# `X` was never defined in this notebook, so the cell raised NameError on a
# fresh kernel.
# NOTE(review): std_train is assumed to be the intended feature matrix, since
# it is what the other clustering cells (make_clusters, list_inertia_scores)
# are given -- confirm.
X = std_train

distortions = []
inertias = []
mapping1 = {}  # k -> distortion
mapping2 = {}  # k -> inertia
K = range(1, 10)

for k in K:
    # Fit once (the original fitted every model twice) and pin the seed so the
    # curve is reproducible across runs.
    kmeanModel = KMeans(n_clusters=k, random_state=0).fit(X)

    # Distance from each row to its closest centroid, averaged over all rows.
    nearest = cdist(X, kmeanModel.cluster_centers_, 'euclidean').min(axis=1)
    distortion = nearest.sum() / X.shape[0]

    distortions.append(distortion)
    inertias.append(kmeanModel.inertia_)
    mapping1[k] = distortion
    mapping2[k] = kmeanModel.inertia_


In [11]:
# Print one score per candidate k (the recorded output covers k = 1..9) for the
# scaled training data, to pick the number of clusters.
# NOTE(review): the recorded values (about 2.5 down to 1.5) are small for a sum
# of squared distances over ~52k rows, so the helper may be reporting a mean
# distance rather than KMeans.inertia_ -- confirm in cluster.py.
cluster.list_inertia_scores(std_train)

1 : 2.544122942912377
2 : 2.067289033052436
3 : 1.840093958062726
4 : 1.7363143168632316
5 : 1.6913354840724377
6 : 1.6185027971969383
7 : 1.595944881881733
8 : 1.5247732626953
9 : 1.5183590948022798


In [13]:
def my_inv_transform(scaler, train_scaled, test_scaled):
    """Undo ``scaler`` on both partitions and return them as DataFrames.

    Each scaled frame is passed to ``scaler.inverse_transform`` and the result
    is wrapped in a new DataFrame that carries the input's column labels and
    index values.

    Parameters
    ----------
    scaler : object exposing ``inverse_transform``
    train_scaled, test_scaled : pandas.DataFrame
        Scaled partitions to map back.

    Returns
    -------
    tuple
        ``(scaler, train, test)`` -- the same scaler object followed by the
        two restored frames.
    """
    def _restore(scaled):
        # Re-label the bare inverse-transformed values with the input's
        # columns, then re-attach the input's index values.
        values = scaler.inverse_transform(scaled)
        frame = pd.DataFrame(values, columns=scaled.columns.values)
        return frame.set_index([scaled.index.values])

    restored_train = _restore(train_scaled)
    restored_test = _restore(test_scaled)
    return scaler, restored_train, restored_test

In [19]:
# Map the scaled partitions back through `blob` (the object returned by
# standardize_train_test) so rows can be read on the original feature scale.
scaler, not_scaled_train, not_scaled_test = my_inv_transform(blob, std_train, std_test)

In [20]:
# Preview the restored training rows to check the inverse transform.
not_scaled_train.head()

Unnamed: 0,bathroomcnt,bedroomcnt,calculatedfinishedsquarefeet,heatingorsystemtypeid,lotsizesquarefeet,yearbuilt,structuretaxvaluedollarcnt,taxvaluedollarcnt,logerror
72193,4.5,4.0,3458.0,2.0,4719.0,2013.0,620885.0,1290424.0,0.037904
12203,1.0,3.0,973.0,7.0,5717.0,1952.0,20474.0,39915.0,0.134868
1581,2.0,3.0,1292.0,2.0,6915.0,1986.0,66557.0,88742.0,0.033469
62782,2.0,4.0,2356.0,2.0,8674.0,1959.0,204174.0,654564.0,0.027875
14477,3.0,3.0,3210.0,2.0,87195.0,1999.0,514915.0,691754.0,-0.012186
