# Creating housing archetypes using K-means method
### Q4 2020-21

## Introduction
This workbook will ...

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from kneed import KneeLocator 
from sklearn.datasets import make_blobs 
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import ParameterGrid
from itertools import permutations
from itertools import combinations

## Data Source
The data represents a subset of ERS records used for initial analysis.

In [None]:
# Load the ERS sample records from the local CSV export.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
ers_sample_records = pd.read_csv(
    r"C:\Users\owner\Documents\NRCan\code\practice\InitialHousingData.csv"
)

# Number of observations (rows) before any cleaning is applied
og_obs = len(ers_sample_records.index)

In [None]:
# Show the raw records as the cell output
ers_sample_records

## Database Preparation and Variable Selection


### Select variables for clustering
Placeholder for this process. 

In [None]:
# Columns used for clustering; only Air50P for the 1-D clustering test
cl_variables = ['Air50P']
# Keep every row, restricted to the clustering columns (stays a DataFrame)
test_data = ers_sample_records.loc[:, cl_variables]


### Missing values

In [None]:
# Remove rows that have a blank (NaN) value in any selected variable
test_data_cleaned = test_data.dropna()
# Display how many rows were removed because of missing values
rows_removed = test_data.shape[0] - test_data_cleaned.shape[0]
print('amount of rows with missing values removed: %d' % rows_removed)

###  Inconsistent data removal 
Each variable has its own valid range that it should fall within (e.g. airtightness > 0?).


### Ensure all values are numerical
and positive? 

In [None]:
# If there is categorical information, encode it first (e.g. LabelEncoder or
# OneHotEncoder from scikit-learn).

# Ensure all values are floats. astype returns a new DataFrame rather than
# converting in place, so the result has to be assigned back to take effect.
test_data_cleaned = test_data_cleaned.astype('float')
# Show the converted frame as the cell output
test_data_cleaned

### Remove outliers

Visualize with box plot to confirm outlier removal.

In [None]:
# Box plot of the cleaned data before any outlier removal (baseline for comparison)
test_data_cleaned.boxplot() 

#### Local outlier factor
Local outlier factor (LOF) identifies outliers based on the density of each point's local neighbourhood, which can give better results than a global approach to finding outliers. A point is considered an outlier when its local density is substantially lower than that of its neighbours, for example a point lying a small distance away from an extremely dense cluster.

In [None]:
# Define the model (LocalOutlierFactor with its default settings)

lof = LocalOutlierFactor()
lof_pred = lof.fit_predict(test_data_cleaned)

# Outliers are the negative (-1) predictions; keep every row that is not flagged
mask = lof_pred != -1

# Remove the outlier rows. The mask holds one entry per row, so the selection
# is row-wise and does not depend on how many variables are selected.
lof_data = test_data_cleaned[mask]
print(lof_data)

# Print the number of points deleted. The baseline is the cleaned data, not
# og_obs, so rows dropped earlier for missing values are not counted as outliers.

outliers_rem = test_data_cleaned.shape[0] - lof_data.shape[0]
print ('amount of outliers removed: %d' %outliers_rem )

# Box plot after outlier removal

lof_data.boxplot()

#### Z-score 

In [None]:
# Absolute z-score of every observation, computed per column
z = np.abs(stats.zscore(test_data_cleaned))

# Only keep rows whose absolute z-score is below 3 in every column
z_data = test_data_cleaned[(z<3).all(axis=1)]

# Print the number of outliers removed. The baseline is the cleaned data, not
# og_obs, so rows dropped earlier for missing values are not counted as outliers.
outliers_rem = test_data_cleaned.shape[0] - z_data.shape[0]
print ('amount of outliers removed: %d' %outliers_rem )

In [None]:
# Box plot after z-score outlier removal
z_data.boxplot()

#### IQR

In [None]:
# Find Q1, Q3 and the interquartile range for each column
Q1 = test_data_cleaned.quantile(q=.25)
Q3 = test_data_cleaned.quantile(q=.75)
IQR = test_data_cleaned.apply(stats.iqr)

# A value is an outlier when it lies more than 1.5*IQR below Q1 or above Q3;
# drop every row that has an outlier in any column.
outside_fences = (test_data_cleaned < (Q1 - 1.5*IQR)) | (test_data_cleaned > (Q3 + 1.5*IQR))
iqr_data = test_data_cleaned[~outside_fences.any(axis=1)]

# Print the number of outliers removed. The baseline is the cleaned data, not
# og_obs, so rows dropped earlier for missing values are not counted as outliers.

outliers_rem = test_data_cleaned.shape[0] - iqr_data.shape[0]
print ('amount of outliers removed: %d' %outliers_rem )

In [None]:
# Box plot after IQR outlier removal
iqr_data.boxplot()


 Mahalanobis distance - to be calculated later, when clustering on multiple variables
 https://www.statology.org/mahalanobis-distance-python/

### Clustering
Find the best mix of parameters for clustering. Parameters include outlier removal, scaling, initialization, and k.

In [None]:
# Collect the outlier-filtered datasets that are ready for clustering.
# Despite the name, these frames are not scaled yet; scaling is applied inside
# the clustering loops. The order (IQR, LOF, z-score) has to match the row
# labels used when the inertia results are tabulated.
scaled_data_sets = [iqr_data,lof_data, z_data]


In [None]:
# Candidate scalers: index 0 is StandardScaler, index 1 is MinMaxScaler
standard, minimax = StandardScaler(), MinMaxScaler()
scalers = [standard, minimax]
# Candidate KMeans initialisation strategies
r, plus = 'random', 'k-means++'
initalizer = [r, plus]



In [None]:
# Datasets transformed with the standard scaler and clustered with random init.
# For every dataset: scale it, choose k with the elbow method, refit at that k
# and record the final inertia (same order as scaled_data_sets).
standard_rand=[]

# KMeans settings shared by the elbow search and the final fit, so the two
# cannot drift apart
kmeans_kwargs = {
    "init":"random",
    "n_init":10,
    "max_iter":300,
    "random_state":42,
}

for x, dataset in enumerate(scaled_data_sets):
    # Scale this dataset (the scaler is re-fitted for each dataset)
    scaled_features = scalers[0].fit_transform(dataset)

    # Elbow method: SSE (inertia) for k = 1..10 clusters
    sse=[]
    for k in range(1,11): #pre determined amount
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    k1 = KneeLocator(range(1,11),sse,curve="convex", direction="decreasing")
    clamount=k1.elbow
    if clamount is None:
        # Without an elbow there is no k to cluster with; fail with a clear message
        raise ValueError("KneeLocator found no elbow for dataset %d; cannot choose k" % x)

    # Cluster with the number of clusters found above
    kmeans = KMeans(n_clusters=clamount, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    standard_rand.append(kmeans.inertia_) # can use other validations too
    

In [None]:
# Datasets transformed with the MinMax scaler and clustered with random init.
# For every dataset: scale it, choose k with the elbow method, refit at that k
# and record the final inertia (same order as scaled_data_sets).
mini_rand=[]

# KMeans settings shared by the elbow search and the final fit, so the two
# cannot drift apart
kmeans_kwargs = {
    "init":"random",
    "n_init":10,
    "max_iter":300,
    "random_state":42,
}

for x, dataset in enumerate(scaled_data_sets):
    # Scale this dataset (the scaler is re-fitted for each dataset)
    scaled_features = scalers[1].fit_transform(dataset)

    # Elbow method: SSE (inertia) for k = 1..10 clusters
    sse=[]
    for k in range(1,11): #pre determined amount
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    k1 = KneeLocator(range(1,11),sse,curve="convex", direction="decreasing")
    clamount=k1.elbow
    if clamount is None:
        # Without an elbow there is no k to cluster with; fail with a clear message
        raise ValueError("KneeLocator found no elbow for dataset %d; cannot choose k" % x)

    # Cluster with the number of clusters found above
    kmeans = KMeans(n_clusters=clamount, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    mini_rand.append(kmeans.inertia_) # can use other validations too

In [None]:
# Datasets transformed with the MinMax scaler and clustered with k-means++ init.
# For every dataset: scale it, choose k with the elbow method, refit at that k
# and record the final inertia (same order as scaled_data_sets).
mini_plus=[]

# KMeans settings shared by the elbow search and the final fit, so the two
# cannot drift apart
kmeans_kwargs = {
    "init":"k-means++",
    "n_init":10,
    "max_iter":300,
    "random_state":42,
}

for x, dataset in enumerate(scaled_data_sets):
    # Scale this dataset (the scaler is re-fitted for each dataset)
    scaled_features = scalers[1].fit_transform(dataset)

    # Elbow method: SSE (inertia) for k = 1..10 clusters
    sse=[]
    for k in range(1,11): #pre determined amount
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    k1 = KneeLocator(range(1,11),sse,curve="convex", direction="decreasing")
    clamount=k1.elbow
    if clamount is None:
        # Without an elbow there is no k to cluster with; fail with a clear message
        raise ValueError("KneeLocator found no elbow for dataset %d; cannot choose k" % x)

    # Cluster with the number of clusters found above
    kmeans = KMeans(n_clusters=clamount, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    mini_plus.append(kmeans.inertia_) # can use other validations too

In [None]:
# Datasets transformed with the standard scaler and clustered with k-means++ init.
# For every dataset: scale it, choose k with the elbow method, refit at that k
# and record the final inertia (same order as scaled_data_sets).
standard_plus=[]

# KMeans settings shared by the elbow search and the final fit, so the two
# cannot drift apart
kmeans_kwargs = {
    "init":"k-means++",
    "n_init":10,
    "max_iter":300,
    "random_state":42,
}

for x, dataset in enumerate(scaled_data_sets):
    # Scale this dataset (the scaler is re-fitted for each dataset)
    scaled_features = scalers[0].fit_transform(dataset)

    # Elbow method: SSE (inertia) for k = 1..10 clusters
    sse=[]
    for k in range(1,11): #pre determined amount
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)
    k1 = KneeLocator(range(1,11),sse,curve="convex", direction="decreasing")
    clamount=k1.elbow
    if clamount is None:
        # Without an elbow there is no k to cluster with; fail with a clear message
        raise ValueError("KneeLocator found no elbow for dataset %d; cannot choose k" % x)

    # Cluster with the number of clusters found above
    kmeans = KMeans(n_clusters=clamount, **kmeans_kwargs)
    kmeans.fit(scaled_features)

    standard_plus.append(kmeans.inertia_) # can use other validations too

    
  
  

In [None]:

# Tabulate the final inertia of every scheme: one column per scaler/init
# combination, one row per outlier-removal method. The row order follows
# scaled_data_sets (IQR, LOF, z-score).
results = pd.DataFrame(
    {
        'st rand': standard_rand,
        'st plus': standard_plus,
        'min rand': mini_rand,
        'min plus': mini_plus,
    },
    index=['IQR', 'LOF', 'Z-score'],
)
results


In [None]:
# Choose the best scheme based on inertia:
# the lower the inertia, the better.
# Should use more validation parameters.


## Result visualization 
Visualizing the results of one method.

LOF outlier omission, MinMax scaling, elbow method k determination, k-means++ initialization. Independent of the clustering performed above.

In [None]:
# Data used for the visualised run: the LOF-filtered dataset
data = lof_data
# Show it as the cell output
data

In [None]:
# Join the cleaned rows back onto the original records, index on index, so the
# other columns are available for plotting relationships later.
lof_data_full = data.merge(ers_sample_records, left_index=True, right_index=True)
# Air50P exists in both frames and comes out suffixed; drop the _x copy so
# only Air50P_y remains.
lof_data_full = lof_data_full.drop(columns=['Air50P_x'])
# Reset to a fresh positional index so rows line up with the cluster labels
# that are concatenated later.
data_all = lof_data_full.reset_index(drop=True)
data_all


In [None]:
# Scale with a MinMax scaler and wrap the resulting array in a DataFrame.
# The fitted scaler is kept so centroids can be transformed back later.
minmax = MinMaxScaler()
scaled_features = pd.DataFrame(minmax.fit_transform(data))

In [None]:
# Compare summary statistics for scaled and unscaled data.
# This cell shows the scaled statistics.
scaled_features.describe()

In [None]:
# Show the summary statistics of the unscaled data
data.describe()

In [None]:
# Plot a 10-bin histogram of the scaled features
scaled_features.hist(bins=10)

In [None]:
# Plot a 10-bin histogram of the unscaled data for comparison
data.hist(bins=10)

In [None]:
# Clustering: choose k with the elbow method, then build the final model

# KMeans settings shared by the elbow search and the final model, so the two
# cannot drift apart
kmeans_kwargs = {
    "init":"k-means++",
    "n_init":10,
    "max_iter":300,
    "random_state":42,
}

sse=[] # SSE (inertia) for k = 1..10 clusters
for k in range(1,11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_features)
    sse.append(kmeans.inertia_)

k1 = KneeLocator(range(1,11),sse,curve="convex", direction="decreasing")
clamount=k1.elbow
if clamount is None:
    # Without an elbow there is no k to cluster with; fail with a clear message
    raise ValueError("KneeLocator found no elbow in the SSE curve; cannot choose k")

# Build (but do not yet fit) the model with the k determined above
kmeans = KMeans(n_clusters=clamount, **kmeans_kwargs)

In [None]:
# Fit the final model and build one frame holding the cluster label next to
# every variable of the corresponding record.
kmeans.fit(scaled_features)
labels = pd.DataFrame({'cluster label': kmeans.labels_})

# Column-wise concat; rows pair up by index
cluster1 = pd.concat((labels, data_all), axis="columns")

cluster1

Looking at clusters individually 

In [None]:
# Air50P_y values of the records assigned to cluster 0, shown as a box plot
cluster_0 = cluster1.loc[cluster1['cluster label'].eq(0), ['Air50P_y']]
cluster_0.boxplot()


In [None]:
# Summary statistics for cluster 0
cluster_0.describe()

In [None]:
# Air50P_y values of the records assigned to cluster 1, shown as a box plot
cluster_1 = cluster1.loc[cluster1['cluster label'].eq(1), ['Air50P_y']]
cluster_1.boxplot()

In [None]:
# Summary statistics for cluster 1
cluster_1.describe()

In [None]:
# Air50P_y values of the records assigned to cluster 2, shown as a box plot
cluster_2 = cluster1.loc[cluster1['cluster label'].eq(2), ['Air50P_y']]
cluster_2.boxplot()

In [None]:
# Summary statistics for cluster 2
cluster_2.describe()

In [None]:
# Cluster centroids

# The model was fitted on MinMax-scaled features, so the scaled centroids are
# transformed back to the original units with the same scaler.
centroids = minmax.inverse_transform(kmeans.cluster_centers_) # transform scaled centroids back

centroids

In [None]:
# Scatter plot of the clusters: Air50P_y against YearBuilt, coloured by cluster label.
# x and y are passed by keyword; seaborn deprecated positional x/y in 0.11 and
# no longer accepts them from 0.12.
sns.scatterplot(x='Air50P_y', y='YearBuilt', data=cluster1, hue='cluster label')

In [None]:
# Plot all variables against each other, coloured by cluster label, to find patterns in the clusters
sns.pairplot(cluster1, hue='cluster label')

In [None]:
# Silhouette coefficient visualizer
from yellowbrick.cluster import SilhouetteVisualizer

# Instantiate the visualizer around the existing clustering model

visualizer = SilhouetteVisualizer(kmeans, colors='yellowbrick')

visualizer.fit(scaled_features)        # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
# In SilhouetteVisualizer plots, clusters with higher scores have wider silhouettes, but clusters
# that are less cohesive will fall short of the average score across all clusters, which is plotted as a
# vertical dotted red line.

In [None]:
# Intercluster distance map
from yellowbrick.cluster import InterclusterDistance

# Wrap the existing clustering model in the visualizer
visualizer = InterclusterDistance(kmeans)

visualizer.fit(scaled_features)    # Fit the data to the visualizer
visualizer.show()        # Finalize and render the figure
# The closer the centers are in the visualization, the closer they are in the original feature space.