# Clustering Chunks
Goals:
- Equal representation of patterns in the ML models
- Learn more about noisy chunks and find ways to discard them. However, there is no clear definition of what counts as noise.

In [1]:
%matplotlib inline

from sklearn import cluster
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

import scipy.cluster.hierarchy as hac
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
import calendar
import common

In [2]:
def distance_func(t1, t2, *args):
    """Return the approximate dynamic-time-warping distance between two series.

    Parameters
    ----------
    t1, t2 : array-like
        The two series to compare; both are handed to ``fastdtw`` unchanged.
    *args
        Accepted and ignored, so the function can be used where a caller
        supplies extra positional arguments to its metric.

    Returns
    -------
    The first element of the ``fastdtw`` result (the distance); the second
    element (the warp path) is discarded.
    """
    def _point_distance(a, b):
        # When the series are 1-D, the per-point metric receives individual
        # scalar samples. SciPy's `euclidean` is documented for 1-D vectors and
        # recent releases raise on 0-d input, so promote scalars first.
        # Inputs that are already 1-D pass through `atleast_1d` untouched.
        return euclidean(np.atleast_1d(a), np.atleast_1d(b))

    distance, _path = fastdtw(t1, t2, dist=_point_distance)
    return distance

In [3]:
# Column that every preprocessing step below operates on.
_cols = ["cwshdr"]

# Read the raw frame through the project-local `common` helper.
# NOTE(review): presumably this loads all CSV files under ../data/insead - confirm in `common`.
df = common.load_df("../data/insead", "*.csv")

# Cleaning helpers, applied in this exact order; each takes the frame and
# returns the frame used by the next step.
for _clean in (
    common.Process.replace_nulls,
    common.Process.replace_with_near,
    common.Process.smooth_data,
):
    df = _clean(df, cols=_cols)

# Normalisation is called separately because it takes the extra `scale` argument.
df = common.Process.get_normalized_df(df, scale=(0.1, 1), cols=_cols)

# Work on the rows labelled 2016-01 through 2016-03 only.
sample = df["2016-01":"2016-03"]

In [44]:
def create_chunks(df, field="cwshdr", size=20 * 60):
    """Cut one column into fixed-length, one-row-per-day chunks.

    Rows are grouped by the calendar day of the index (formatted as
    ``%Y-%m-%d``). A day contributes a chunk only if it holds at least
    ``size`` rows; the chunk is that day's first ``size`` values. Shorter
    days are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index supports ``strftime`` (e.g. a ``DatetimeIndex``).
    field : str
        Name of the column to chunk.
    size : int
        Number of points kept per day. Defaults to ``20 * 60``, the value
        this notebook has always used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_days_kept, size)``. When no day is long enough
        the result is an empty ``(0, size)`` array, so the second dimension
        is always ``size``.

    Raises
    ------
    ValueError
        If ``size`` is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive number of points")

    series = df[field]

    # One label per row; grouping on it visits each day once (in sorted label
    # order) instead of re-slicing the whole frame for every day.
    day_labels = np.asarray(df.index.strftime("%Y-%m-%d"))
    per_day = series.groupby(day_labels)
    print("Total days: ", per_day.ngroups)

    chunks = [day.values[:size] for _, day in per_day if len(day) >= size]
    if not chunks:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D.
        return np.empty((0, size), dtype=series.to_numpy().dtype)

    return np.array(chunks)

# Scratch input kept for reference (not executed): a tiny daily frame with NaNs
# that was used to try out create_chunks by hand.
# d = pd.DataFrame({"A": [np.nan, 1,2,3,np.nan, 4, np.nan, np.nan]})
# d.index = [dt.datetime(2017, 1, i+1) for i in range(d.shape[0])]
# create_chunks(d, "A")

# Build the per-day matrix for the sampled window, then display its shape
# as (days kept, points per day).
chunks = create_chunks(sample, field="cwshdr")
chunks.shape

Total days:  91


(91, 1200)

In [None]:
# Flat hierarchical clustering of the daily chunks, using the DTW-based
# `distance_func` as the pairwise metric. The linkage method and cut
# criterion are written out explicitly; they are the values
# `fclusterdata` uses when they are omitted.
# NOTE(review): this assignment rebinds the name `cluster`, which the import
# cell also uses for `from sklearn import cluster`; after this cell runs the
# sklearn module is no longer reachable under that name.
cluster = hac.fclusterdata(
    chunks,
    t=1.0,
    criterion="inconsistent",
    metric=distance_func,
    method="single",
)
cluster