# Imports

In [None]:
import pandas as pd
from collections import Counter
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from itertools import combinations

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans

import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [None]:
# Read the "key=value" settings (one per line) from options.txt into a dict.
with open("options.txt", 'r') as f:
    options = {}
    for line in f:
        # Split on the first "=" only, so a value that itself contains "="
        # (e.g. a path or URL with a query string) is kept intact.
        key, separator, value = line.partition("=")

        # Lines without "=" (such as a trailing blank line) carry no setting;
        # skip them instead of failing with an IndexError.
        if not separator:
            continue

        options[key] = value.strip()
print(options)

In [None]:
# Load the dataset from the pickle file whose path is configured under 'RUG_no_outliers'.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
RUG = pd.read_pickle(options['RUG_no_outliers'])

# Preparing and Transforming Data

In [None]:
# Fill missing values in place by linear interpolation; `limit=20` caps the
# number of consecutive NaNs that get filled.
RUG.interpolate(method='linear', inplace=True, limit=20)

In [None]:
def get_data(col_name):
    """Reshape one column of ``RUG`` into a table with one column per day.

    The column is split into calendar days, the days are placed side by side
    (labelled ``'%Y:%m:%d'``) and the rows are labelled with the time of day
    (``'%H:%M'``). Days containing any NaN are dropped and only every 10th
    row is returned.

    NOTE(review): the row labels assume the longest day holds exactly 1440
    values (1-minute sampling) - confirm against the data.
    """
    # work on a copy so the global frame is never touched
    series = RUG[col_name].copy()

    # one group per calendar day
    per_day = series.groupby(pd.Grouper(freq='D'))

    # labels of the days, used as column names below
    day_labels = list(per_day.first().index.strftime('%Y:%m:%d'))

    # collect every day as its own positionally indexed series
    daily_series = []
    for day_key in per_day.groups:
        daily_series.append(per_day.get_group(day_key).reset_index(drop=True))

    # put the days next to each other, one column per day
    wide = pd.concat(daily_series, axis=1, keys=day_labels)

    # label the rows with the minute of the day
    minute_labels = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')
    wide.index = minute_labels

    # keep only days without any missing value
    wide = wide.dropna(axis=1, how='any')

    # thin the data out to one value every 10 minutes
    return wide[::10]

In [None]:
def scale_data(data, train_percentage=0.8):
    """Split the columns of ``data`` into train/test sets and min-max scale them.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose columns are the samples to scale.
    train_percentage : float, default 0.8
        Fraction of the columns, counted from the left, that forms the
        training set; the remaining columns form the test set.

    Returns
    -------
    scaler : MinMaxScaler
        The scaler fitted on the training set.
    scaled_list_train : numpy.ndarray
        Scaled training set, one row per training column of ``data``.
    scaled_list_test : numpy.ndarray
        Scaled test set, one row per test column of ``data``.
    """
    # the frame is only sliced, never modified, so no defensive copy is needed
    train_size = int(len(data.columns) * train_percentage)

    train = data.iloc[:, :train_size]
    test = data.iloc[:, train_size:]

    # create scaler object
    scaler = MinMaxScaler(feature_range=(0, 1))

    # fit on the training columns only, so the test set does not influence
    # the scaling range; every column becomes one row (sample) of the input
    scaled_list_train = scaler.fit_transform([train[col] for col in train])

    # apply the already fitted scaling to the test columns
    scaled_list_test = scaler.transform([test[col] for col in test])

    return scaler, scaled_list_train, scaled_list_test

# Principal Component Analysis

In [None]:
def create_pca(data):
    """Reduce ``data`` with a PCA that keeps 85% of the variance.

    A fresh PCA (full SVD solver, ``n_components=0.85``) is fitted on a copy
    of ``data`` and the transformed features are returned; the fitted PCA
    object itself is discarded.
    """
    reducer = PCA(n_components=0.85, svd_solver='full')

    # fit on a copy so the caller's data is left untouched
    return reducer.fit_transform(data.copy())

In [None]:
def create_kmeans(pca_data, scaled_train, scaled_test, clusters=4, pca=None):
    """Fit a DTW k-means on ``pca_data`` and assign ``scaled_test`` to its clusters.

    Parameters
    ----------
    pca_data : array-like
        Data the k-means model is fitted on.
    scaled_train : array-like
        Not used by the clustering; kept so existing callers keep working.
    scaled_test : array-like
        Data to assign to the fitted clusters.
    clusters : int, default 4
        Number of clusters.
    pca : fitted transformer with a ``transform`` method, optional
        When given, ``scaled_test`` is projected with it before prediction so
        that the test data lives in the same space as ``pca_data``. Without it
        ``scaled_test`` is predicted as passed in.
        NOTE(review): the model is fitted on PCA features, so predicting on
        un-projected scaled data compares different spaces - pass the fitted
        PCA whenever the test labels are actually used.

    Returns
    -------
    tuple
        ``(train_labels, test_labels)``: the labels of the fitted data and the
        predicted labels of the test data.
    """
    # copy data to avoid changing the original data
    temp_pca_data = pca_data.copy()
    temp_scaled_test = scaled_test.copy()

    # bring the test data into the space the model is fitted in, if possible
    if pca is not None:
        temp_scaled_test = pca.transform(temp_scaled_test)

    # fit kmeans to pca data
    kmeans_pca = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=-1).fit(temp_pca_data)

    # extract and predict cluster labels
    train_pca_features = kmeans_pca.labels_
    test_pca_features = kmeans_pca.predict(temp_scaled_test)

    return train_pca_features, test_pca_features

# Number of clusters per column

The number of clusters for each column was chosen based on the elbow method and the silhouette score.

In [None]:
# number of clusters for each column of RUG; the list is paired
# position-by-position with RUG.columns in the clustering loop below
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

# number of repeated clustering runs per column; the stability score is
# averaged over all pairs of these runs
n_iterations = 3

In [None]:
# For every column, cluster the training data `n_iterations` times and keep
# the training labels of each run: complete_results[column][run] -> labels.
complete_results = []
for location, clust_n in zip(RUG.columns, clusters):
    print(location)

    # Reshaping, scaling and the PCA do not depend on the run, so they are
    # computed once per column; only the k-means fit is repeated.
    data = get_data(location)

    scaler, scaled_list_train, scaled_list_test = scale_data(data)

    pca_features = create_pca(scaled_list_train)

    indice_results = []
    # for n_iterations times, get all training data cluster labels
    for it in range(n_iterations):
        print(it)

        train_pca_features, test_pca_features = create_kmeans(pca_features, scaled_list_train, scaled_list_test, clust_n)

        indice_results.append(train_pca_features)
    complete_results.append(indice_results)


In [None]:
def _run_pair_stability(labels_base, labels_compare):
    """Return the mean best-match overlap (in percent) between two labelings.

    For every cluster of ``labels_base`` the overlap with each cluster of
    ``labels_compare`` is the number of sample positions they share, expressed
    as a percentage of each cluster's size and averaged over the two sizes.
    Only the best overlap per base cluster is kept, and the mean of these
    best overlaps is returned.
    """
    labels_base = np.asarray(labels_base)
    labels_compare = np.asarray(labels_compare)

    # cluster ids together with how many samples each cluster holds
    base_ids, base_sizes = np.unique(labels_base, return_counts=True)
    compare_ids, compare_sizes = np.unique(labels_compare, return_counts=True)

    best_matches = []
    for label_base, size_base in zip(base_ids, base_sizes):
        base_members = (labels_base == label_base).nonzero()[0]

        best = 0
        for label_compare, size_compare in zip(compare_ids, compare_sizes):
            compare_members = (labels_compare == label_compare).nonzero()[0]

            # number of samples that sit in both clusters
            shared = len(np.intersect1d(base_members, compare_members))

            # average the share relative to both cluster sizes, so neither
            # run dominates when the two clusters differ in size
            overlap = (shared / size_base * 100 + shared / size_compare * 100) / 2

            if overlap > best:
                best = overlap

        # record the best match once per base cluster; recording the running
        # maximum after every comparison would make the score depend on the
        # arbitrary order of the cluster ids
        best_matches.append(best)

    return np.mean(best_matches)


col_stabilty = {}
# for each location/column, compare the labels of every pair of runs
for column, name in zip(complete_results, RUG.columns):
    print(name)
    run_stability = []

    # every unordered pair of runs is compared exactly once
    for run_base, run_compare in combinations(range(n_iterations), 2):
        print(f"Run {run_base} -> Run {run_compare}")
        run_stability.append(_run_pair_stability(column[run_base], column[run_compare]))

    # average the stability of all combinations of runs
    col_stabilty[name] = np.mean(run_stability)


In [None]:
# show the stability scores as a table: one row per entry of col_stabilty
# (keyed by column name), with the score in column 0
pd.DataFrame(col_stabilty, index=[0]).T