# Hierarchical Clustering
This workbook executes hierarchical clustering, rebalances the results, and visualizes the results

In [23]:
import pandas as pd
from sklearn.cluster import AgglomerativeClustering

In [24]:
# Read the time matrix from disk; the first CSV column becomes the row labels
timeMatrix = pd.read_csv(
    'C:/Users/Aidan/OneDrive - Simon Fraser University (1sfu)/Garbage Route Optimization/timeMatrix.csv',
    index_col=0,
)

In [25]:
# Read the zones file from disk; the first CSV column becomes the row labels
zones = pd.read_csv(
    'C:/Users/Aidan/OneDrive - Simon Fraser University (1sfu)/Garbage Route Optimization/poco-allzones.csv',
    index_col=0,
)

In [26]:
# Ensuring uniformity in the indices of the data sets: every zone label gets
# the city suffix appended. Labels that already end with the suffix are left
# untouched, so re-running this cell does not append it a second time.
citySuffix = ', Port Coquitlam, BC, Canada'
alreadySuffixed = zones.index.str.endswith(citySuffix, na=False)
zones.index = zones.index.where(alreadySuffixed, zones.index + citySuffix)

In [29]:
# `t` is a second name for the same time-matrix object (no copy is made here)
t = timeMatrix

In [30]:
# Normalizing the times to produce better clusters.
# The arguments below are normalize's defaults, spelled out: every row is
# scaled to unit L2 norm.
from sklearn.preprocessing import normalize
t = normalize(t, norm='l2', axis=1)

In [31]:
# Executing the clustering
from scipy.cluster.hierarchy import ClusterWarning
from warnings import simplefilter

# Silence SciPy's ClusterWarning from here on
simplefilter("ignore", ClusterWarning)

# The distance measure is left at its default (Euclidean, the value that used
# to be passed explicitly). The `affinity` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 in favour of `metric`; omitting it keeps
# this cell working on both older and current releases.
model = AgglomerativeClustering(n_clusters=5, linkage='average')

# fit_predict fits the model and returns the labels it stores in model.labels_
labels = model.fit_predict(t)

In [49]:
# Pair each row label of the time matrix (an address) with its cluster label
clusters = pd.DataFrame(
    data={'cluster': labels, 'address': timeMatrix.index},
)

In [None]:
# Creating function that adjusts the clusters
def _clusterSizes(clusters):
    """Return the unique-address count per cluster as a one-column frame, smallest first."""
    # 'address' is selected explicitly so the counts never depend on which
    # other columns `clusters` carries or on their order.
    return clusters.groupby('cluster')[['address']].nunique().sort_values(by='address')


def adjustClusters(matrix,clusters, deviation, numElementsPerCluster):
    """Rebalance cluster sizes by moving addresses from the largest to the smallest cluster.

    On every pass one address is taken out of the currently largest cluster
    and assigned to the currently smallest one. The address chosen is the
    member of the largest cluster whose column in ``matrix`` has the lowest
    mean over the rows belonging to the smallest cluster. Passes repeat until
    both the smallest and the largest cluster hold between
    ``numElementsPerCluster - deviation`` and
    ``numElementsPerCluster + deviation`` unique addresses (inclusive).
    The cluster sizes are printed after every move.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Square-style table whose row labels and column labels are addresses;
        entry (row, column) is the cost between the two (the caller in this
        notebook passes the time matrix).
    clusters : pandas.DataFrame
        Frame with an 'address' column and a 'cluster' column. It is not
        modified; a re-indexed copy is edited and returned.
    deviation : number
        Allowed distance of a cluster size from ``numElementsPerCluster``.
    numElementsPerCluster : number
        Target number of addresses per cluster.

    Returns
    -------
    pandas.DataFrame
        Copy of ``clusters`` indexed by address (the 'address' column is
        kept as well) with the rebalanced 'cluster' assignments.

    Raises
    ------
    ValueError
        If no assignment of the addresses to the existing clusters can put
        every cluster inside the requested size window (the loop could never
        finish), or if ``matrix`` has no rows/columns for the clusters being
        compared.
    """
    clusters = clusters.set_index('address')
    clusters['address'] = clusters.index

    lowerBound = numElementsPerCluster - deviation
    upperBound = numElementsPerCluster + deviation

    sizes = _clusterSizes(clusters)
    minCluster, maxCluster = sizes.index[0], sizes.index[-1]
    numElementsInMin = sizes['address'].iloc[0]
    numElementsInMax = sizes['address'].iloc[-1]

    # Feasibility check. Moving one address at a time from the largest to the
    # smallest cluster ends, at the latest, in the most even split possible.
    # If even that split falls outside the window, no split fits and the loop
    # below would never terminate.
    totalElements = int(sizes['address'].sum())
    evenSplitMin, remainder = divmod(totalElements, len(sizes))
    evenSplitMax = evenSplitMin + (1 if remainder else 0)
    if evenSplitMin < lowerBound or evenSplitMax > upperBound:
        raise ValueError(
            "cannot balance %d addresses over %d clusters so that every cluster "
            "holds between %s and %s addresses"
            % (totalElements, len(sizes), lowerBound, upperBound)
        )

    # The smallest size is never above the largest, so checking the smallest
    # against the lower bound and the largest against the upper bound covers
    # both ends of the window for both clusters.
    while not (lowerBound <= numElementsInMin and numElementsInMax <= upperBound):

        # Determining the elements in the smallest and largest cluster
        minClusterElements = clusters.loc[clusters['cluster'] == minCluster, 'address']
        maxClusterElements = clusters.loc[clusters['cluster'] == maxCluster, 'address']

        # Rows: members of the smallest cluster. Columns: members of the
        # largest cluster, i.e. the candidates that may be moved.
        filteredMatrix = matrix.loc[matrix.index.isin(minClusterElements.values),
                                    matrix.columns.isin(maxClusterElements.values)]
        if filteredMatrix.empty:
            raise ValueError(
                "matrix has no entries linking cluster %r to cluster %r; "
                "its row/column labels must match the addresses in clusters"
                % (minCluster, maxCluster)
            )

        # Candidate with the lowest mean cost from the smallest cluster's members
        closestElement = filteredMatrix.mean(axis=0).idxmin()

        clusters.loc[closestElement, 'cluster'] = minCluster

        # One groupby per pass feeds the sizes, the cluster picks and the report
        sizes = _clusterSizes(clusters)
        minCluster, maxCluster = sizes.index[0], sizes.index[-1]
        numElementsInMin = sizes['address'].iloc[0]
        numElementsInMax = sizes['address'].iloc[-1]

        print(sizes)
        print("***********************************")

    return clusters

# Rebalance the clusters towards 1981 addresses each, allowing 50 either way
newClusters = adjustClusters(
    timeMatrix,
    clusters,
    deviation=50,
    numElementsPerCluster=1981,
)

In [None]:
# Show the number of unique values per cluster after rebalancing, smallest 'address' count first
newClusters.groupby('cluster').nunique().sort_values(by='address')

In [None]:
# Load the locations file and rename its 'Unnamed: 0' column to 'address'
coordinates = pd.read_csv(
    'C:/Users/Aidan/OneDrive - Simon Fraser University (1sfu)/Garbage Route Optimization/locations.csv'
).rename(columns={'Unnamed: 0': 'address'})

In [None]:
# Swap the current index for a fresh 0..n-1 range; drop=True discards the old
# index values instead of inserting them as a column
newClusters = newClusters.reset_index(drop=True)

In [None]:
# Attach the coordinates to each clustered address.
# NOTE(review): merge() without `on=` joins on every column name the two frames
# share - presumably only 'address' here; confirm against locations.csv.
data = newClusters.merge(coordinates)