In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import subprocess

%load_ext autoreload
%autoreload 2

from time import sleep, time
from threading import Thread
from scipy.cluster.hierarchy import fcluster
from scipy.stats import zscore

from testing import TestFactory, ClusteredInfo
from dtw import dtw as cur_dtw
from dtw_wrapper import DtwWrapper

from IPython.display import Markdown
from tqdm import tqdm, tqdm_notebook

In [None]:
# Path to the external executable that repeat_test launches via subprocess.
EXE = "./bin/MDTW_pairwise.exe"
# Number of clusters used for the initial clustering call in pipeline.
N_CLUST = 30
# Cluster counts at which pipeline evaluates the quality metrics.
CLUSTER_SIZES = [4, 12, 24, 36, 48]

def metric_Q(info):
    """Fraction of same-label ordered pairs that also share a cluster.

    Iterates over all ordered pairs ``(a, b)`` of the first ``info.count``
    items (including ``a == b``). Among pairs whose ``info.label`` values are
    equal, counts how many also have equal ``info.clusters_labels`` values and
    returns that count divided by the number of same-label pairs.

    Raises ``ZeroDivisionError`` when there are no same-label pairs
    (e.g. ``info.count == 0``).
    """
    same_label_pairs = 0
    same_label_and_cluster_pairs = 0
    n_items = info.count
    for a in range(n_items):
        for b in range(n_items):
            if info.label[a] == info.label[b]:
                same_label_pairs += 1
                if info.clusters_labels[a] == info.clusters_labels[b]:
                    same_label_and_cluster_pairs += 1

    return same_label_and_cluster_pairs / same_label_pairs


def metric_cluster(info):
    """Mean share of the dominant label over clusters ``1..info.cluster_num``.

    For every cluster id the labels of its members are counted; the cluster's
    score is the count of the most frequent label divided by the cluster size.
    The scores are averaged over ``info.cluster_num`` clusters.

    ``info.label`` and ``info.clusters_labels`` are indexed/compared the way
    numpy arrays are. A cluster id without members makes ``max()`` fail on an
    empty array.
    """
    total = 0
    n_clusters = info.cluster_num
    for cluster_id in range(1, n_clusters + 1):
        members = np.where(info.clusters_labels == cluster_id)[0]
        _, label_counts = np.unique(info.label[members], return_counts=True)
        total += label_counts.max() / label_counts.sum()

    return total / info.cluster_num

def metric_cluster_vlad(info):
    """Mean squared share of the dominant label over clusters ``1..info.cluster_num``.

    For each cluster the squared count of its most frequent label is divided
    by the number of cluster members and then by the sum of the label counts
    (which is the same number), and the per-cluster values are averaged over
    ``info.cluster_num``.
    """
    total = 0
    for cluster_id in range(1, info.cluster_num + 1):
        members = np.where(info.clusters_labels == cluster_id)[0]
        _, label_counts = np.unique(info.label[members], return_counts=True)
        dominant = label_counts.max()
        # Same left-to-right division order as a single chained expression.
        total += dominant ** 2 / members.size / label_counts.sum()

    return total / info.cluster_num

def norm_1(x, y):
    """Return ``np.linalg.norm`` of ``x - y`` with ``ord=1``."""
    difference = x - y
    return np.linalg.norm(difference, ord=1)
def norm_2(x, y):
    """Return ``np.linalg.norm`` of ``x - y`` with ``ord=2``."""
    difference = x - y
    return np.linalg.norm(difference, ord=2)

def cosine(x, y):
    """Return ``1 - |cos(x, y)|``.

    Because the absolute value of the cosine similarity is used, parallel and
    anti-parallel vectors both yield 0 and orthogonal vectors yield 1.
    """
    similarity = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
    return 1 - abs(similarity)

def pipeline(info, autoregression=False, show_results=True):
    """Score a clustering at every size in CLUSTER_SIZES and optionally visualise it.

    Parameters
    ----------
    info : object
        Clustering holder. This function calls ``info.cluster(n)``, reads
        ``info.stats``, ``info.label`` and ``info.clusters_labels``, and calls
        the plotting helpers ``clusters_compare_table``, ``comparing_at_one``
        and ``allignment_to_random``.
    autoregression : bool, default False
        When True the "with alignment" plots are skipped.
    show_results : bool, default True
        When False only the metrics are computed; nothing is printed,
        displayed or plotted.

    Returns
    -------
    tuple of (list, list)
        ``metric_cluster`` values and ``metric_cluster_vlad`` values, one entry
        per element of ``CLUSTER_SIZES`` (in that order).
    """
    purity_scores = []
    squared_purity_scores = []

    # NOTE(review): this clustering is immediately replaced by the loop below,
    # so the detailed views further down reflect the LAST entry of
    # CLUSTER_SIZES, not N_CLUST. The call is kept in case info.cluster has
    # side effects beyond assigning labels -- confirm and drop if it does not.
    # The metrics that used to be computed here were never used and are gone.
    info.cluster(N_CLUST)

    for cluster_size in CLUSTER_SIZES:
        info.cluster(cluster_size)
        Q1 = metric_cluster(info)
        Q2 = metric_cluster_vlad(info)
        purity_scores.append(Q1)
        squared_purity_scores.append(Q2)
        if show_results:
            print("{0:4}: Q1:{1:.4f} | Q2:{2:.4f}".format(cluster_size, Q1, Q2))

    if not show_results:
        return purity_scores, squared_purity_scores

    # Clusters to inspect in detail: the first rows of info.stats.
    index = info.stats.head(10).index.values

    # Per-cluster histogram of the labels of its members.
    classifier_stat = {
        cluster_id: pd.Series(
            info.label[np.where(info.clusters_labels == cluster_id)[0]]
        ).value_counts()
        for cluster_id in info.stats.index
    }
    display(pd.DataFrame(classifier_stat).fillna(0).iloc[:, :30])

    for i in index[:6]:
        info.clusters_compare_table(label=i, z_normalize=True)
    plt.show()

    # Heading text means "Before alignment".
    display(Markdown("#### До выравнивания"))
    for i in index[:6]:
        info.comparing_at_one(i, num_series=10, z_normalize=True)
    plt.show()

    if not autoregression:
        # Heading text means "With alignment".
        display(Markdown("#### С выравниванием"))
        for i in index[:6]:
            info.allignment_to_random(i, num_series=10, z_normalize=True)
        plt.show()

    return purity_scores, squared_purity_scores
        
def repeat_test(args, kwargs, n_repeat, sample_size, autoregression=False, external=True, norm=1):
    """Repeat the clustering experiment and print mean/std of the metrics.

    For each ``i`` in ``range(n_repeat)`` a ``TestFactory(random_state=i)`` is
    created, a sample of ``sample_size`` is drawn, the data is clustered and
    ``pipeline(..., show_results=False)`` is evaluated.

    Parameters
    ----------
    args : sequence or None
        Positional arguments forwarded to ``tests.test_dtw``. ``None`` is
        treated as "no arguments".
    kwargs : dict or None
        Keyword arguments forwarded to ``tests.test_dtw``. ``None`` is treated
        as an empty dict. The caller's dict is never modified.
    n_repeat : int
        Number of repetitions.
    sample_size : int
        Passed to ``tests.set_sample``.
    autoregression : bool, default False
        When True ``tests.ar_clustering()`` is used instead of ``test_dtw`` and
        the external distance computation is skipped (its output would not be
        read).
    external : bool, default True
        When True (and not ``autoregression``) the sample is written to disk,
        ``EXE`` is run on it and ``test_dtw`` is pointed at the results file.
    norm : int, default 1
        Passed to ``EXE`` as a command-line argument; presumably selects the
        ground distance -- confirm against the binary.

    Returns
    -------
    tuple
        ``(metrics, info)``: the list of per-repetition ``pipeline`` results
        and the ``info`` object of the last repetition (``None`` when
        ``n_repeat`` is 0).
    """
    # Private copies: callers may pass None, and the external branch adds keys
    # that must not leak into the dict owned by the caller.
    args = () if args is None else args
    kwargs = {} if kwargs is None else dict(kwargs)
    use_external = external and not autoregression

    metrics = []
    info = None
    for i in range(n_repeat):
        print(i)
        tests = TestFactory(random_state=i)
        x = tests.set_sample(sample_size)
        if use_external:
            path = "../data/clustering/to_compute/akselerometr_{0}.csv".format(i)
            np.savetxt(path, np.concatenate(x[0]))
            # NOTE(review): the meaning of "3", "200" and "20" is defined by
            # the external binary and is not visible here.
            command = [EXE, path, "3", str(norm), "200", "20"]
            try:
                subprocess.check_output(command)
            except subprocess.CalledProcessError as e:
                # Best effort: report the failure and continue. The results
                # file read below may then be missing or stale.
                print(e.output)

            kwargs["external"] = True
            kwargs["external_distances_path"] = path + "_results"
        info = tests.ar_clustering() if autoregression else tests.test_dtw(*args, **kwargs)

        metrics.append(pipeline(info, autoregression, show_results=False))

    if not metrics:
        # Nothing was run, so there is nothing to summarise.
        return metrics, info

    # Axes: repetition, metric (0 or 1), entry of CLUSTER_SIZES.
    metrics_arr = np.array(metrics)

    print("\n---- Metrics ----\n")
    for metric_idx in (0, 1):
        per_size = metrics_arr[:, metric_idx, :]
        print("  ".join(["{0:0.3f}+-{1:0.3f}".format(s, m) for s, m in zip(
            np.mean(per_size, 0),
            np.std(per_size, 0))]))

    for metric_idx in (0, 1):
        overall = metrics_arr[:, metric_idx, :]
        print("{0:0.4f} +- {1:0.4f}".format(np.mean(overall), np.std(overall)))

    return metrics, info

# DTW
Стоит играться с функциями расстояния между кластерами.  
Сейчас стоит `complete`: $$d(X, Y) = \max_{x \in X,\ y \in Y} \operatorname{dist}(x, y)$$

Более-менее работает с `weighted` и `average`.  
[Подробнее](https://docs.scipy.org/doc/scipy/reference/generated/scipy.cluster.hierarchy.linkage.html#scipy.cluster.hierarchy.linkage)

Практически при всех способах остаётся первый кластер, размер которого самый большой. В нём зачастую всё перемешано.

In [None]:
# Size of the experiments in the cells below, which call repeat_test with its
# default external=True.
SAMPLE_SIZE = 800
N_REPEATS = 10
tests = TestFactory(random_state=42)

## L_1

In [None]:
# L1 ground distance: `norm` goes to the external binary, `f_norm` to test_dtw.
norm = 1
f_norm = norm_1
for cluster_dist in ("complete", "weighted", "average"):
    display(Markdown("## {}".format(cluster_dist)))
    # A fresh kwargs dict is built for every linkage method.
    dtw_positional = [cur_dtw, f_norm]
    dtw_keywords = {
        "dtw_args": {"z_normalize": False, "l": 0.2},
        "cluster_dist": cluster_dist,
    }
    metrics, info = repeat_test(dtw_positional, dtw_keywords, N_REPEATS, SAMPLE_SIZE, norm=norm)

    pipeline(info)

## L_2

In [None]:
# L2 ground distance. This cell sits under the "L_2" heading but previously
# repeated the L1 settings (norm = 1, norm_1), so it just re-ran the L1 test.
# NOTE(review): assumes the external binary maps norm code 2 to L2 (1 is used
# for L1 and 3 for cosine elsewhere in this notebook) -- confirm.
norm = 2
f_norm = norm_2
for cluster_dist in ["complete", "weighted", "average"]:
    display(Markdown("## {}".format(cluster_dist)))
    metrics, info = repeat_test([cur_dtw, f_norm],
                          {"dtw_args": {"z_normalize": False, "l": 0.2}, "cluster_dist": cluster_dist},
                         N_REPEATS, SAMPLE_SIZE, norm=norm)

    pipeline(info)

## Cosine

In [None]:
# Cosine ground distance: `norm` goes to the external binary, `f_norm` to test_dtw.
norm = 3
f_norm = cosine
for cluster_dist in ("complete", "weighted", "average"):
    display(Markdown("## {}".format(cluster_dist)))
    # A fresh kwargs dict is built for every linkage method.
    dtw_positional = [cur_dtw, f_norm]
    dtw_keywords = {
        "dtw_args": {"z_normalize": False, "l": 0.2},
        "cluster_dist": cluster_dist,
    }
    metrics, info = repeat_test(dtw_positional, dtw_keywords, N_REPEATS, SAMPLE_SIZE, norm=norm)

    pipeline(info)

# Classical version

In [None]:
# Smaller experiment for the cells below, which pass external=False.
SAMPLE_SIZE = 100
N_REPEATS = 3
tests = TestFactory(random_state=42)

# Rebinds the module-level list that pipeline iterates over, so it affects
# every pipeline call made after this cell has run.
CLUSTER_SIZES = [4, 12, 24]

## L_1

In [None]:
# L1 ground distance, computed without the external binary (external=False).
norm = 1
f_norm = norm_1
for cluster_dist in ("complete", "weighted", "average"):
    display(Markdown("## {}".format(cluster_dist)))
    # A fresh kwargs dict is built for every linkage method.
    dtw_positional = [cur_dtw, f_norm]
    dtw_keywords = {
        "dtw_args": {"z_normalize": False, "l": 0.2},
        "cluster_dist": cluster_dist,
    }
    metrics, info = repeat_test(dtw_positional, dtw_keywords, N_REPEATS, SAMPLE_SIZE,
                                norm=norm, external=False)

    pipeline(info)

## L_2

In [None]:
# L2 ground distance, computed without the external binary (external=False).
# This cell sits under the "L_2" heading but previously repeated the L1
# settings (norm = 1, norm_1), so it just re-ran the L1 test.
norm = 2
f_norm = norm_2
for cluster_dist in ["complete", "weighted", "average"]:
    display(Markdown("## {}".format(cluster_dist)))
    metrics, info = repeat_test([cur_dtw, f_norm],
                          {"dtw_args": {"z_normalize": False, "l": 0.2}, "cluster_dist": cluster_dist},
                         N_REPEATS, SAMPLE_SIZE, norm=norm, external=False)

    pipeline(info)

## Cosine

In [None]:
# Cosine ground distance, computed without the external binary (external=False).
norm = 3
f_norm = cosine
for cluster_dist in ("complete", "weighted", "average"):
    display(Markdown("## {}".format(cluster_dist)))
    # A fresh kwargs dict is built for every linkage method.
    dtw_positional = [cur_dtw, f_norm]
    dtw_keywords = {
        "dtw_args": {"z_normalize": False, "l": 0.2},
        "cluster_dist": cluster_dist,
    }
    metrics, info = repeat_test(dtw_positional, dtw_keywords, N_REPEATS, SAMPLE_SIZE,
                                norm=norm, external=False)

    pipeline(info)

## Autoregression

In [None]:
# external=False: with the default external=True, repeat_test launches the
# external binary and stores keys in kwargs, which is None here. repeat_test
# returns (metrics, info), so unpack it as the other cells do instead of
# binding the whole tuple to `metrics`.
metrics, info = repeat_test(None, None, N_REPEATS, SAMPLE_SIZE,
                            autoregression=True, external=False)

# Detailed view on a separate, larger sample.
tests = TestFactory(random_state=1)
_ = tests.set_sample(600)
info = tests.ar_clustering()
pipeline(info, autoregression=True)