# Imports

In [None]:
import pandas as pd
from pathlib import Path
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans

import warnings
# Suppress all warnings globally from this point on, including deprecation
# notices emitted by the libraries imported above.
warnings.filterwarnings("ignore")

# Loading Data

In [None]:
# Read "key=value" settings from options.txt into the `options` dict.
with open("options.txt", 'r') as f:
    options = {
        key: value.strip()
        # partition() splits on the first "=" only, so a value (e.g. a path)
        # may itself contain "=" without being truncated.
        for key, sep, value in (line.partition("=") for line in f)
        # Lines without "=" (typically blank lines) carry no setting; skip
        # them instead of failing with an IndexError.
        if sep
    }
print(options)

In [None]:
# Load the dataset from the pickle file whose path is stored under the
# 'RUG_no_outliers' key of `options`.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# come from a trusted source.
# Presumably a DataFrame with a DatetimeIndex (later cells read RUG.columns
# and group by day) - TODO confirm.
RUG = pd.read_pickle(options['RUG_no_outliers'])

# Preparing and Transforming Data

In [None]:
# Fill missing values in place by linear interpolation. `limit=20` caps the
# number of consecutive NaNs that get filled, so longer gaps keep NaNs.
RUG.interpolate(method='linear', inplace=True, limit=20)

In [None]:
def create_groups(data):
    """Reshape a time series into one column per calendar day.

    Parameters
    ----------
    data : pandas.Series or pandas.DataFrame
        Observations indexed by a ``DatetimeIndex``. The relabelling step
        below requires the longest day to contain exactly 1440 rows, i.e.
        one sample per minute.

    Returns
    -------
    pandas.DataFrame
        Rows labelled ``'HH:MM'`` in 10-minute steps and one column per day,
        labelled ``'YYYY:MM:DD'``. Days containing any missing value
        (including days with fewer rows than the longest day) are dropped.
    """
    # groupby does not modify its input, so no defensive copy is required
    daily = data.groupby(pd.Grouper(freq='D'))

    days = []
    frames = []
    for day, chunk in daily:
        # A daily grouper also produces bins for calendar days that have no
        # rows at all; looking those up with get_group() raises KeyError, so
        # they are skipped here (they could not survive the dropna anyway).
        if chunk.empty:
            continue
        # calendar date of the group, used as the column label
        days.append(day.strftime('%Y:%m:%d'))
        # positional index so that all days align row by row
        frames.append(chunk.reset_index(drop=True))

    # create a single dataframe with all days as columns
    temp = pd.concat(frames, axis=1, keys=days)

    # set index to hours and minutes
    temp.index = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    # drop every day that contains at least one nan value
    temp = temp.dropna(axis=1, how='any')

    # reduce data to every 10 minutes
    return temp[::10]

In [None]:
def scale_data(data, train_percentage=0.8):
    """Split the columns of *data* into train/test parts and min-max scale them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are the individual samples; the split is taken
        over columns in their existing order.
    train_percentage : float, optional
        Fraction of the columns (leading ones) used as the training set.
        Defaults to 0.8.

    Returns
    -------
    tuple
        ``(scaled_train, scaled_test)`` as returned by ``MinMaxScaler``, with
        one row per column of *data*. The scaler is fitted on the training
        part only and then applied unchanged to the test part.
    """
    # split the columns into a leading train part and a trailing test part;
    # slicing does not modify `data`, so no defensive copy is needed
    train_size = int(len(data.columns) * train_percentage)
    train = data.iloc[:, :train_size]
    test = data.iloc[:, train_size:]

    scaler = MinMaxScaler(feature_range=(0, 1))

    # each column of `data` becomes one row (sample) handed to the scaler
    scaled_list_train = scaler.fit_transform([train[col] for col in train])

    # reuse the statistics learned from the training part
    scaled_list_test = scaler.transform([test[col] for col in test])

    return scaled_list_train, scaled_list_test

In [None]:
def create_pca(data):
    """Return the PCA projection of *data*.

    ``n_components=0.85`` together with ``svd_solver='full'`` asks PCA to
    keep as many components as are needed to explain 85 % of the variance.
    The model is fitted on a copy of *data*.
    """
    reducer = PCA(n_components=0.85, svd_solver='full')
    return reducer.fit_transform(data.copy())

In [None]:
def create_kmeans(pca_data, scaled_train, scaled_test, clusters=4):
    """Cluster the PCA features with DTW k-means and label the test samples.

    Parameters
    ----------
    pca_data : array-like
        Features the clustering model is fitted on.
    scaled_train : array-like
        Currently unused; kept so existing callers keep working.
    scaled_test : array-like
        Samples passed to ``predict`` of the fitted model.
    clusters : int, optional
        Number of clusters. Defaults to 4.

    Returns
    -------
    tuple
        ``(train_labels, test_labels)``: the fitted model's ``labels_`` and
        its predictions for *scaled_test*.
    """
    # work on copies of the inputs that are actually used
    temp_pca_data = pca_data.copy()
    temp_scaled_test = scaled_test.copy()

    # fit kmeans to pca data
    model = TimeSeriesKMeans(n_clusters=clusters, metric="dtw", n_jobs=-1).fit(temp_pca_data)

    train_labels = model.labels_
    # NOTE(review): the model is fitted on `pca_data` but predicts on
    # `scaled_test` as passed in. If `pca_data` is a PCA projection, the test
    # samples are presumably expected in the same space - confirm with the
    # caller whether they should be projected first.
    test_labels = model.predict(temp_scaled_test)

    return train_labels, test_labels

In [None]:
def plot_scores(scaled_list_train, train_lab, column):
    """Plot, per cluster, all member series and their mean curve.

    Parameters
    ----------
    scaled_list_train : array-like
        2-D collection with one series per row.
    train_lab : array-like
        Cluster label of each row of *scaled_list_train*.
    column : str
        Text used as the figure title.

    One subplot is drawn per distinct label: the member series in gray and
    their point-wise average in red. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    series = np.asarray(scaled_list_train)
    labels = np.asarray(train_lab)

    # sorted distinct labels give a deterministic subplot order
    cluster_ids = np.unique(labels)

    # squeeze=False always yields a 2-D array of Axes, so indexing also works
    # when there is only a single cluster
    fig, axes = plt.subplots(len(cluster_ids), squeeze=False)
    fig.suptitle(column)

    for axis, label in zip(axes.ravel(), cluster_ids):
        # all rows that were assigned to this cluster
        members = series[labels == label]

        for member in members:
            axis.plot(member, c="gray", alpha=0.4)
        axis.plot(np.average(members, axis=0), c="red")

        # title carries the actual label rather than the subplot position
        axis.set_title("Cluster {}".format(label))

    fig.tight_layout()
    plt.show()

In [None]:
def average_cluster(column, n_cluster):
    '''Driver function to call all other functions in order.

    Takes the series ``RUG[column]``, reshapes it per day, scales it, reduces
    it with PCA, clusters it into ``n_cluster`` clusters and plots the
    training series per cluster. Returns nothing.
    '''

    grouped_data = create_groups(RUG[column])

    scaled_list_train, scaled_list_test = scale_data(grouped_data)

    pca_features = create_pca(scaled_list_train)

    # pass the test set in its own slot and the cluster count by keyword;
    # previously n_cluster landed in the scaled_test position
    train_lab, _test_lab = create_kmeans(
        pca_features, scaled_list_train, scaled_list_test, clusters=n_cluster
    )

    # plot_scores accepts exactly (series, labels, title)
    plot_scores(scaled_list_train, train_lab, column)

In [None]:
# Number of clusters to use for each column of RUG; entries are paired with
# RUG.columns by position.
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

In [None]:
# Call the driver function for each column in the dataframe, in combination
# with the appropriate number of clusters.
# zip() stops at the shorter input, so columns beyond len(clusters) are
# silently skipped.
for column, n_cluster in zip(RUG.columns, clusters):
    print(column)
    average_cluster(column, n_cluster)
    