# Agglomerative Clustering

In [5]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.cluster import AgglomerativeClustering
import functools
from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.preprocessing import StandardScaler
from itertools import combinations
import matplotlib.pyplot as plt

In [2]:
def hh_mm_ss2seconds(hh_mm_ss):
    """Convert a colon-separated time string (e.g. ``'HH:MM:SS'``) to an int.

    The string is split on ``':'``; each field is parsed with ``int`` and the
    fields are folded left to right in base 60, so ``'01:02:03'`` yields
    ``1*3600 + 2*60 + 3``. Any number of fields is accepted (``'45'`` -> 45,
    ``'10:05'`` -> 605).

    Raises:
        ValueError: if a field cannot be parsed by ``int`` (including an
            empty string).
    """
    total = 0
    for field in hh_mm_ss.split(':'):
        # Shift what has been accumulated up one base-60 digit, then add the field
        total = total * 60 + int(field)
    return total

## Pre-Processing Data

In [3]:
# Paths to datasets
datasets = ['./Data/set1.csv', './Data/set2.csv', './Data/set3noVID.csv']

# Columns used as clustering features. Their order fixes the column order of
# every processed matrix (index 0 = SEQUENCE_DTTM ... index 4 = COURSE_OVER_GROUND).
features = ['SEQUENCE_DTTM', 'LAT', 'LON', 'SPEED_OVER_GROUND', 'COURSE_OVER_GROUND']

def process_dataset(file_path, feature_columns=tuple(features)):
    """Load one CSV and return its feature columns standardized.

    Args:
        file_path: path of the CSV file to read.
        feature_columns: names of the columns to keep, in output order.
            The default is captured from ``features`` when this function is
            defined, so the function keeps working even if the global name
            ``features`` is rebound by a later cell.

    Returns:
        The result of ``StandardScaler().fit_transform`` on the selected
        columns. A new scaler is fitted on every call, so each dataset is
        standardized with its own statistics.
    """
    # SEQUENCE_DTTM is converted to a number by hh_mm_ss2seconds while parsing
    df = pd.read_csv(file_path, converters={'SEQUENCE_DTTM' : hh_mm_ss2seconds})
    X = df[list(feature_columns)]
    scaler = StandardScaler()
    return scaler.fit_transform(X)

# Load and pre-process each dataset
processed_data = [process_dataset(file) for file in datasets]

## Feature Selection

Use set1 and set2 to determine which features to choose.

In [9]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
from itertools import combinations
import pandas as pd

# Features for set1 and set2
X1 = processed_data[0]
X2 = processed_data[1]

# Labels for set1 and set2
y1 = pd.read_csv('./Data/set1.csv')['VID']
y2 = pd.read_csv('./Data/set2.csv')['VID']

num_clusters = 20  # unique VID in set1 and set2

# Column positions in the processed matrices. Deliberately not named `features`:
# that name holds the column-name list read by process_dataset, and rebinding
# it here would break any later call to that function.
feature_indices = range(X1.shape[1])

# ARI can be negative, so start below every attainable score; starting at 0
# would leave best_combo as None if no combination scored above 0.
best_score = float('-inf')
best_combo = None

# Exhaustive search over every non-empty subset of feature columns
for i in range(1, len(feature_indices) + 1):
    for combo in combinations(feature_indices, i):
        # Select the features for this combination for both sets
        columns = list(combo)
        X1_subset = X1[:, columns]
        X2_subset = X2[:, columns]

        # Agglomerative clustering on Set 1
        agglo1 = AgglomerativeClustering(n_clusters=num_clusters)
        predictions1 = agglo1.fit_predict(X1_subset)
        score1 = adjusted_rand_score(y1, predictions1)

        # Agglomerative clustering on Set 2
        agglo2 = AgglomerativeClustering(n_clusters=num_clusters)
        predictions2 = agglo2.fit_predict(X2_subset)
        score2 = adjusted_rand_score(y2, predictions2)

        # Calculate the average ARI
        average_score = (score1 + score2) / 2
        print(f"Testing features {combo}: Average ARI = {average_score} | set1: {score1:.4f} | set2: {score2:.4f}")

        # Check if this combination gave a better average score
        if average_score > best_score:
            best_score = average_score
            best_combo = combo

print(f"Best Average ARI: {best_score} with features {best_combo}")


Testing features (0,): Average ARI = 0.052937951694586136 | set1: 0.0177 | set2: 0.0881
Testing features (1,): Average ARI = 0.09406632127986468 | set1: 0.1276 | set2: 0.0605
Testing features (2,): Average ARI = 0.10181886290730681 | set1: 0.1434 | set2: 0.0602
Testing features (3,): Average ARI = 0.23316536106074354 | set1: 0.1062 | set2: 0.3601
Testing features (4,): Average ARI = 0.225805940708544 | set1: 0.1252 | set2: 0.3265
Testing features (0, 1): Average ARI = 0.1474341179111534 | set1: 0.1165 | set2: 0.1784
Testing features (0, 2): Average ARI = 0.17661706609228706 | set1: 0.1210 | set2: 0.2322
Testing features (0, 3): Average ARI = 0.2006922846079403 | set1: 0.0949 | set2: 0.3065
Testing features (0, 4): Average ARI = 0.2137897841906293 | set1: 0.1315 | set2: 0.2961
Testing features (1, 2): Average ARI = 0.12910161654152197 | set1: 0.1467 | set2: 0.1115
Testing features (1, 3): Average ARI = 0.24857829045230118 | set1: 0.1373 | set2: 0.3598
Testing features (1, 4): Average AR

It seems to perform best with features 1, 2, 3 and 4 (LAT, LON, SPEED_OVER_GROUND and COURSE_OVER_GROUND).

Note that the maximum score was achieved using only features 3 and 4, but this is because that combination over-performed on set2.

## Grid Search

In [10]:
# Start again from the full standardized matrices for set1 and set2
X1 = processed_data[0]
X2 = processed_data[1]

# Keep feature columns 1-4 (LAT, LON, SPEED_OVER_GROUND, COURSE_OVER_GROUND);
# column 0 (SEQUENCE_DTTM) is dropped.
X1 = X1[:, 1:5]
X2 = X2[:, 1:5]


# Grid search parameters
n_clusters = [20, 25, 30]
linkages = ['ward', 'average', 'complete', 'single']
affinities = ['euclidean', 'manhattan', 'cosine']

# Initialize the best score and combination tracking variables.
# ARI can be negative, so start below every attainable score; starting at 0
# would leave best_params empty if no configuration scored above 0.
best_score = float('-inf')
best_params = {}

# Filter used by the grid search to skip linkage/affinity pairs it should not try
def valid_combination(linkage, affinity):
    """Return True unless ward linkage is paired with a non-euclidean affinity.

    Args:
        linkage: linkage name, e.g. 'ward', 'average', 'complete', 'single'.
        affinity: distance metric name, e.g. 'euclidean', 'manhattan', 'cosine'.

    Returns:
        False only for ``linkage == 'ward'`` with ``affinity != 'euclidean'``;
        True for every other pair.
    """
    return not (linkage == 'ward' and affinity != 'euclidean')

# Grid search loop: visit every (n_clusters, linkage, affinity) triple in the same
# order as three nested for-loops, skipping the pairs valid_combination rejects
search_space = (
    (k, link, dist)
    for k in n_clusters
    for link in linkages
    for dist in affinities
    if valid_combination(link, dist)
)

for n_cluster, linkage, affinity in search_space:
    # Settings of the current candidate, kept so they can be recorded if it wins
    candidate = {'n_clusters': n_cluster, 'linkage': linkage, 'affinity': affinity}

    # Agglomerative clustering on Set 1
    agglo1 = AgglomerativeClustering(n_clusters=n_cluster, linkage=linkage, metric=affinity)
    predictions1 = agglo1.fit_predict(X1)
    score1 = adjusted_rand_score(y1, predictions1)

    # Agglomerative clustering on Set 2 (fresh estimator, same settings)
    agglo2 = AgglomerativeClustering(n_clusters=n_cluster, linkage=linkage, metric=affinity)
    predictions2 = agglo2.fit_predict(X2)
    score2 = adjusted_rand_score(y2, predictions2)

    # Both sets count equally towards the score used for ranking
    average_score = (score1 + score2) / 2
    print(f"Testing {n_cluster} clusters, {linkage} linkage, {affinity} affinity: Average ARI = {average_score}")

    # Only a strictly better average replaces the current best
    if average_score > best_score:
        best_score = average_score
        best_params = candidate

# Output the best parameters and score
print(f"Best Average ARI: {best_score}")
print(f"Best Parameters: {best_params}")


Testing 20 clusters, ward linkage, euclidean affinity: Average ARI = 0.29718825497268964
Testing 20 clusters, average linkage, euclidean affinity: Average ARI = 0.3362404018215226
Testing 20 clusters, average linkage, manhattan affinity: Average ARI = 0.358682251635327
Testing 20 clusters, average linkage, cosine affinity: Average ARI = 0.3691034121942901
Testing 20 clusters, complete linkage, euclidean affinity: Average ARI = 0.303116835390497
Testing 20 clusters, complete linkage, manhattan affinity: Average ARI = 0.3549237718565946
Testing 20 clusters, complete linkage, cosine affinity: Average ARI = 0.4109809179965021
Testing 20 clusters, single linkage, euclidean affinity: Average ARI = 0.2090711591489875
Testing 20 clusters, single linkage, manhattan affinity: Average ARI = 0.2081610727208823
Testing 20 clusters, single linkage, cosine affinity: Average ARI = 0.04743385100631472
Testing 25 clusters, ward linkage, euclidean affinity: Average ARI = 0.28058672056595046
Testing 25 cl

With this combination of hyperparameters, agglomerative clustering seems to perform better than K-means.

Best Parameters: {'n_clusters': 25, 'linkage': 'complete', 'affinity': 'manhattan'}