# Imports

In [1]:
import pandas as pd
from collections import Counter
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 

from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans

import warnings
warnings.filterwarnings("ignore")

# Loading Data

In [2]:
# Parse the "key=value" lines of options.txt into a dict of settings.
options = {}
with open("options.txt", 'r') as f:
    for raw_line in f:
        # partition() splits on the first "=" only, so a value that itself
        # contains "=" is kept intact instead of being truncated
        key, separator, value = raw_line.partition("=")
        if not separator:
            # blank line or line without "=": nothing to parse
            continue
        options[key] = value.strip()
print(options)

{'hanoi_scenario_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\Hanoi_CMH\\Scenario-1', 'RUG_dir': 'C:\\Users\\mjnst\\Desktop\\Thesis\\RUG_data_5years', 'RUG_raw_csv': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_csv.csv', 'RUG_timeseries': 'C:\\Users\\mjnst\\Desktop\\Thesis\\rug_timeseries.pkl', 'RUG_obfuscated': 'C:\\Users\\mjnst\\Desktop\\Thesis\\obfuscated_data.pkl', 'RUG_no_outliers': 'C:\\Users\\mjnst\\Desktop\\Thesis\\obfuscated_data_rm_outlier.pkl'}


In [3]:
# Load the pickled dataset stored at the 'RUG_no_outliers' path from options.txt.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
RUG = pd.read_pickle(options['RUG_no_outliers'])

# Preparing and Transforming Data

In [4]:
# Fill missing values by linear interpolation, at most 20 consecutive entries per gap.
RUG = RUG.interpolate(method='linear', limit=20)

In [5]:
def get_data(col_name):
    """Reshape one column of the global ``RUG`` frame into a time-of-day x day table.

    The column is grouped per calendar day; every day becomes one column of the
    result (labelled ``'%Y:%m:%d'``) and the rows are relabelled with minute-of-day
    strings (``'%H:%M'``). Days containing any missing value are dropped and only
    every 10th row is returned.

    The row relabelling assigns 1440 labels (00:00-23:59 at 1-minute steps), so the
    concatenated frame must have exactly 1440 rows.
    # NOTE(review): presumably RUG is sampled once per minute on a DatetimeIndex - confirm.

    Parameters
    ----------
    col_name : label of the column in ``RUG`` to reshape.

    Returns
    -------
    pandas.DataFrame with one column per complete day and every 10th minute row.
    """
    series = RUG[col_name].copy()
    by_day = series.groupby(pd.Grouper(freq='D'))

    # calendar date of every daily group, used as column labels
    day_labels = by_day.first().index.strftime('%Y:%m:%d').tolist()

    # one positionally indexed series per day so the days line up row by row
    daily_profiles = []
    for day_key in by_day.groups:
        daily_profiles.append(by_day.get_group(day_key).reset_index(drop=True))

    wide = pd.concat(daily_profiles, axis=1, keys=day_labels)
    wide.index = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    # keep only the days (columns) without any missing value
    wide = wide.dropna(axis=1, how='any')
    return wide.iloc[::10]

In [6]:
def scale_data(data, train_percentage=0.8):
    """Split the columns of ``data`` into train/test parts and min-max scale them.

    Every column of ``data`` becomes one sample (row) of the scaled arrays. The
    first ``train_percentage`` share of the columns forms the training part, the
    remaining columns the test part. The scaler is fitted on the training part
    only and then applied unchanged to the test part, so scaled test values may
    fall outside the [0, 1] range.

    Parameters
    ----------
    data : pandas.DataFrame whose columns are the samples to scale.
    train_percentage : float, default 0.8
        Fraction of the columns (taken from the left) used for training.

    Returns
    -------
    (scaler, scaled_train, scaled_test) : the fitted MinMaxScaler and the two
    scaled arrays of shape (n_columns_in_split, n_rows_of_data).
    """
    train_size = int(len(data.columns) * train_percentage)

    # transpose so that each original column is one row (sample) for the scaler
    train_rows = data.iloc[:, :train_size].to_numpy().T
    test_rows = data.iloc[:, train_size:].to_numpy().T

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_list_train = scaler.fit_transform(train_rows)
    # reuse the training minima/maxima; never refit on the test part
    scaled_list_test = scaler.transform(test_rows)

    return scaler, scaled_list_train, scaled_list_test

# Principal Component Analysis and Clustering

In [7]:
def create_pca(data):
    """Reduce ``data`` with PCA and return the transformed samples.

    The PCA is configured with ``n_components=0.85`` and the full SVD solver, i.e.
    the number of components is chosen from the explained-variance fraction 0.85.
    The fitted PCA object itself is not returned. A copy of ``data`` is passed to
    the PCA so the caller's array is left untouched.

    Parameters
    ----------
    data : array-like with a ``copy()`` method, one sample per row.

    Returns
    -------
    The PCA-transformed samples as returned by ``fit_transform``.
    """
    reducer = PCA(n_components=0.85, svd_solver='full')
    return reducer.fit_transform(data.copy())

In [8]:
def create_kmeans(pca_data, scaled_train, scaled_test, clusters=4):
    """Cluster the PCA features with DTW k-means and label the train and test samples.

    The model is fitted on ``pca_data``. Previously the test labels were predicted
    directly from ``scaled_test``, i.e. from samples living in the un-reduced
    feature space, while the cluster centres live in PCA space. The test samples
    are now projected into the same PCA space before prediction.

    The projection is rebuilt by fitting a full-SVD PCA on ``scaled_train`` with as
    many components as ``pca_data`` has columns.
    # NOTE(review): this assumes pca_data was produced by a full-SVD PCA fitted on
    # scaled_train (as create_pca does in this notebook) - confirm for other callers.

    Parameters
    ----------
    pca_data : 2-D array of PCA-transformed training samples (one row per sample).
    scaled_train : 2-D array of the scaled training samples the PCA was fitted on.
    scaled_test : 2-D array of scaled test samples with the same columns as ``scaled_train``.
    clusters : int, default 4
        Number of clusters.

    Returns
    -------
    (train_labels, test_labels) : cluster label arrays for the training and test samples.
    """
    temp_pca_data = pca_data.copy()
    temp_scaled_train = scaled_train.copy()
    temp_scaled_test = scaled_test.copy()

    kmeans_pca = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=-1).fit(temp_pca_data)
    train_pca_features = kmeans_pca.labels_

    # bring the test samples into the feature space the model was fitted in
    n_components = temp_pca_data.shape[1]
    projector = PCA(n_components=n_components, svd_solver='full').fit(temp_scaled_train)
    test_pca_data = projector.transform(temp_scaled_test)

    test_pca_features = kmeans_pca.predict(test_pca_data)

    return train_pca_features, test_pca_features

# Number of clusters per column

The number of clusters for each column was chosen using the elbow method and the silhouette score.

In [9]:
# Cluster count for each column of RUG, in column order; paired with RUG.columns via zip
# in the run loop (zip stops at the shorter of the two sequences).
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

# Number of repeated clustering runs per column.
n_iterations = 3

In [None]:
# For every column: prepare the data once, then repeat the clustering
# n_iterations times and keep the training labels of every run.
# complete_results[i][j] holds the training labels of run j for column i.
complete_results = []
for location, clust_n in zip(RUG.columns, clusters):
    print(location)

    # Reshaping, scaling and PCA do not depend on the iteration index, so they
    # are computed once per column instead of once per run.
    data = get_data(location)
    scaler, scaled_list_train, scaled_list_test = scale_data(data)
    pca_features = create_pca(scaled_list_train)

    indice_results = []
    for it in range(n_iterations):
        print(it)
        # only the clustering itself is repeated
        train_pca_features, test_pca_features = create_kmeans(pca_features, scaled_list_train, scaled_list_test, clust_n)
        indice_results.append(train_pca_features)
    complete_results.append(indice_results)


In [172]:
from itertools import combinations

In [None]:
def _run_pair_stability(base_labels, compare_labels):
    """Score how well the clusters of one run are reproduced by another run.

    For every label of the base run the best matching label of the compare run is
    searched. A match is scored as the mean of two percentages: the share of the
    base cluster and the share of the compare cluster covered by their common
    samples. The best score of every base label is collected exactly once and the
    mean over all base labels is returned (0-100).

    Parameters
    ----------
    base_labels, compare_labels : 1-D label arrays of equal length; position i
        refers to the same sample in both runs.
    """
    base_labels = np.asarray(base_labels)
    compare_labels = np.asarray(compare_labels)
    base_values, base_counts = np.unique(base_labels, return_counts=True)
    compare_values, compare_counts = np.unique(compare_labels, return_counts=True)

    best_scores = []
    for label_base, n_base in zip(base_values, base_counts):
        in_base = base_labels == label_base
        best = 0
        for label_compare, n_compare in zip(compare_values, compare_counts):
            # number of samples that carry both labels
            overlap = np.count_nonzero(in_base & (compare_labels == label_compare))
            score = (overlap / n_base * 100 + overlap / n_compare * 100) / 2
            if score > best:
                best = score
        # one entry per base label, recorded after all compare labels were tried
        best_scores.append(best)
    return np.mean(best_scores)


col_stabilty = {}
# compare the labels of every pair of runs and average the scores per column
for column, name in zip(complete_results, RUG.columns):
    print(name)
    run_stability = []

    for run_base, run_compare in combinations(range(n_iterations), 2):
        print(f"Run {run_base} -> Run {run_compare}")
        run_stability.append(_run_pair_stability(column[run_base], column[run_compare]))

    col_stabilty[name] = np.mean(run_stability)


In [None]:
# One row per column name, with a single column (0) holding its mean stability score.
pd.Series(col_stabilty).to_frame()