# Imports

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tslearn.clustering import TimeSeriesKMeans, silhouette_score

# Loading Data

In [None]:
# Parse "key=value" lines from options.txt into the `options` dict.
options = {}
with open("options.txt", 'r') as f:
    for line_number, line in enumerate(f, start=1):
        # Blank lines (e.g. a trailing newline at the end of the file) carry
        # no setting, so skip them instead of failing on them.
        if not line.strip():
            continue
        # Split on the first "=" only, so a value may itself contain "=".
        key, separator, value = line.partition("=")
        if not separator:
            raise ValueError(
                f"options.txt line {line_number}: expected 'key=value', got {line.strip()!r}"
            )
        # Keys are kept exactly as written; only the value is stripped.
        options[key] = value.strip()
print(options)

In [None]:
# Load the pickled pandas object stored at the path given by the
# 'RUG_no_outliers' option; it is used as a DataFrame (RUG.columns, RUG[column])
# by the cells below.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
RUG = pd.read_pickle(options['RUG_no_outliers'])

# Preparing and Transforming Data

In [None]:
# Fill missing values in place using linear interpolation. `limit=20` caps the
# number of consecutive NaNs that get filled, so longer gaps are only partly
# filled and still contain NaNs afterwards.
RUG.interpolate(method='linear', inplace=True, limit=20)

In [None]:
def create_groups(data):
    """Reshape a minute-resolution time series into one column per complete day.

    The data is grouped by calendar day. Every day that has exactly one sample
    per minute (1440 rows) becomes a column labelled ``'%Y:%m:%d'``; the rows
    are labelled with the time of day (``'%H:%M'``). Days containing any NaN
    are dropped and the result is thinned to every 10th minute.

    Args:
        data: pandas object with a datetime-like index that can be grouped
            with ``pd.Grouper(freq='D')``. It is not modified.

    Returns:
        DataFrame with 144 rows ('00:00', '00:10', ..., '23:50') and one column
        per complete, NaN-free day.

    Raises:
        ValueError: if no day has a full set of 1440 samples.
    """
    # Time-of-day labels for a full day sampled once per minute.
    time_of_day = pd.date_range("00:00", "23:59", freq="1min").strftime('%H:%M')

    days = []
    profiles = []
    # Iterate the daily groups directly rather than looking each key up with
    # get_group(): a calendar day without any rows has no group to fetch.
    for day, group in data.groupby(pd.Grouper(freq='D')):
        # Only complete days can be aligned with the time-of-day index; shorter
        # days would end up NaN-padded (and dropped below) anyway.
        if len(group) != len(time_of_day):
            continue
        # calendar date of the group, used as the column label
        days.append(day.strftime('%Y:%m:%d'))
        # discard the timestamps so all days share the same positional index
        profiles.append(group.reset_index(drop=True))

    if not profiles:
        raise ValueError("no complete day (1440 one-minute samples) found in data")

    # create a single dataframe with all days as columns
    temp = pd.concat(profiles, axis=1, keys=days)

    # set index to hours and minutes
    temp.index = time_of_day

    # drop every day (column) that still contains NaN values
    temp = temp.dropna(axis=1, how='any')

    # reduce data to every 10 minutes
    return temp.iloc[::10]

In [None]:
def scale_data(data):
    """Split the columns of ``data`` 80/20 into train/test and min-max scale both.

    Each column of ``data`` is handed to the scaler as one sample (row). The
    scaler is fitted on the training columns only and then reused unchanged for
    the test columns.

    Args:
        data: DataFrame whose columns are the samples to scale.

    Returns:
        Tuple ``(scaled_train, scaled_test)`` as returned by the scaler's
        ``fit_transform`` and ``transform``.
    """
    frame = data.copy()

    # The first 80% of the columns form the training set, the rest the test set.
    split_at = int(frame.shape[1] * 0.8)
    train_part = frame.iloc[:, :split_at]
    test_part = frame.iloc[:, split_at:]

    def columns_as_rows(part):
        # One entry per column, so every column becomes a row for the scaler.
        return [part[label] for label in part]

    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_train = scaler.fit_transform(columns_as_rows(train_part))
    scaled_test = scaler.transform(columns_as_rows(test_part))

    return scaled_train, scaled_test

In [None]:
def create_pca(data):
    """Fit a PCA on a copy of ``data`` and return the transformed features.

    The PCA is configured with ``n_components=0.85`` and the 'full' SVD solver;
    with a fractional ``n_components`` scikit-learn keeps as many components as
    are needed to explain that share of the variance.

    Args:
        data: array-like of samples; it is copied before fitting.

    Returns:
        The PCA-transformed data.
    """
    reducer = PCA(n_components=0.85, svd_solver='full')
    return reducer.fit_transform(data.copy())

In [None]:
def kmeans_sillouette(data):
    """Fit DTW k-means for k = 1..9 and collect inertia and silhouette per k.

    Args:
        data: dataset accepted by ``TimeSeriesKMeans.fit``; it is copied first.

    Returns:
        Tuple ``(wcss, silhouette_scores)`` of two lists with one entry per
        cluster count, in the order k = 1, 2, ..., 9. ``wcss`` holds the fitted
        model's ``inertia_``; a silhouette score that cannot be computed is
        recorded as 0.
    """
    data_copy = data.copy()
    wcss = []
    silhouette_scores = []

    for n_clusters in range(1, 10):
        model = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", n_jobs=-1).fit(data_copy)
        wcss.append(model.inertia_)
        try:
            score = silhouette_score(data_copy, model.labels_, n_jobs=-1)
        except ValueError:
            # The silhouette score needs at least two distinct labels, so it is
            # undefined for k=1 (or a collapsed clustering). Catch only that
            # error so interrupts and real bugs are no longer swallowed.
            score = 0
        silhouette_scores.append(score)
    return wcss, silhouette_scores

In [None]:
def plot_scores(column, wcss, silhouette_scores, n_cluster):
    """Plot WCSS and silhouette score against the number of clusters (1..9).

    WCSS is drawn in red on the left y-axis and the silhouette score in blue on
    a twin y-axis on the right. A dashed red vertical line marks ``n_cluster``.

    Args:
        column: text used as the plot title.
        wcss: nine WCSS values, for 1 to 9 clusters.
        silhouette_scores: nine silhouette scores, for 1 to 9 clusters.
        n_cluster: x position of the dashed vertical marker line.
    """
    cluster_counts = range(1, 10)

    def draw_curve(axis, label, values, color):
        # Label, curve and tick labels share one colour so each y-axis can be
        # matched to its curve.
        axis.set_ylabel(label, color=color)
        axis.plot(cluster_counts, values, color=color)
        axis.tick_params(axis='y', labelcolor=color)

    fig, left_axis = plt.subplots()
    left_axis.set_xlabel('Number of clusters')
    draw_curve(left_axis, 'WCSS', wcss, 'tab:red')

    # Second y-axis sharing the x-axis of the first one.
    right_axis = left_axis.twinx()
    draw_curve(right_axis, 'Silhouette score', silhouette_scores, 'tab:blue')

    # Title and marker line go through pyplot, i.e. onto its current axes.
    plt.title(column)
    plt.axvline(x=n_cluster, color='r', label='axvline - full height', linestyle="dashed")

    fig.tight_layout()
    plt.show()

In [None]:
def elbow(column, n_cluster):
    """Run the clustering score analysis for one column of ``RUG`` and plot it.

    Pipeline: reshape the column into daily profiles, scale them, reduce the
    training part with PCA, compute WCSS and silhouette scores for 1..9
    clusters and plot both curves.

    Args:
        column: name of the ``RUG`` column to analyse; also the plot title.
        n_cluster: cluster count to highlight in the plot.

    Returns:
        Tuple ``(wcss, silhouette_scores)`` as produced by ``kmeans_sillouette``.
    """
    daily_profiles = create_groups(RUG[column])

    # Only the training split is analysed; the scaled test split is unused here.
    train_scaled, _test_scaled = scale_data(daily_profiles)

    reduced = create_pca(train_scaled)
    inertias, silhouettes = kmeans_sillouette(reduced)

    plot_scores(column, inertias, silhouettes, n_cluster)
    return inertias, silhouettes

In [None]:
# Accumulators for the per-column scores; the next cell re-initialises them
# again right before its loop fills them.
all_wcss = []
all_silhouette_scores = []

# One cluster count per column, paired positionally with RUG.columns via zip()
# in the loop below and drawn as the dashed vertical line in that column's plot.
clusters = [4, 4, 3, 3, 4, 4, 4, 3, 3, 4, 3, 4, 4]

In [None]:
# Start from empty result lists so re-running this cell does not append twice.
all_wcss, all_silhouette_scores = [], []

# Analyse every column together with its cluster count; zip() stops at the
# shorter of RUG.columns and clusters.
for column, n_cluster in zip(RUG.columns, clusters):
    print(column)
    scores = elbow(column, n_cluster)
    column_wcss, column_silhouette = scores[0], scores[1]
    all_wcss.append(column_wcss)
    all_silhouette_scores.append(column_silhouette)