In [71]:
import os
import json
import numpy as np
from datetime import datetime
import tracemalloc
import hdbscan
from tslearn.clustering import KShape
from tslearn.metrics import dtw
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import AgglomerativeClustering, MiniBatchKMeans
from tslearn.clustering import KShape
from scipy.cluster.hierarchy import linkage as scipy_linkage, fcluster
from scipy.spatial.distance import pdist
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

In [72]:
def save_snippets_as_json(snippets, path):
    """Serialize snippets to a JSON file.

    Parameters:
    - snippets: iterable of (index, subsequence) pairs; each subsequence must
      expose ``.tolist()`` (e.g. a NumPy array).
    - path: destination file; it is created or overwritten.

    The file holds a list of objects with the keys 'index' (int) and
    'subsequence' (list of numbers), indented with 4 spaces.
    """
    records = []
    for position, values in snippets:
        # int(...) / .tolist() turn NumPy scalars and arrays into JSON-friendly types.
        records.append({
            'index': int(position),
            'subsequence': values.tolist(),
        })

    with open(path, 'w') as handle:
        json.dump(records, handle, indent=4)


In [73]:
def save_plot(fig, path):
    """Save a figure to `path` and close it.

    Parameters:
    - fig: figure object exposing ``savefig(path)``.
    - path: destination file for the rendered figure.

    Raises:
    - whatever ``fig.savefig`` raises; the figure is still closed in that case.
    """
    try:
        fig.savefig(path)
    finally:
        # Close even when saving fails, so a failed write does not leave the
        # figure open.
        plt.close(fig)

In [74]:
def save_results(output_dir, snippets, metrics):
    """Write the snippets and the run metrics of one experiment into `output_dir`.

    Parameters:
    - output_dir: directory to write into; created if it does not exist.
    - snippets: (index, subsequence) pairs, forwarded to save_snippets_as_json.
    - metrics: JSON-serializable object, written to 'metrics.json'.
    """
    os.makedirs(output_dir, exist_ok=True)

    snippets_path = os.path.join(output_dir, 'snippets.json')
    metrics_path = os.path.join(output_dir, 'metrics.json')

    save_snippets_as_json(snippets, snippets_path)

    # Save the metrics next to the snippets.
    with open(metrics_path, 'w') as handle:
        json.dump(metrics, handle, indent=4)


In [75]:
def parse_number_list(s: str):
    """Parse a text blob of numbers into a list of floats.

    Commas and any run of whitespace (spaces, tabs, newlines, carriage
    returns) are treated as separators; empty fields are ignored.

    Parameters:
    - s: text such as "1.0, 2.0\\n3.0" or "1.0 2.0\\t3.0".

    Returns:
    - list of float, in input order (empty list for blank input).

    Raises:
    - ValueError: if a field is not a valid float literal.
    """
    # Whitespace must separate values, not be deleted: stripping spaces would
    # glue "1 2" into "12" and turn "1.5 2.5" into the invalid "1.52.5".
    tokens = s.replace(',', ' ').split()
    return [float(token) for token in tokens]

In [76]:
# Load every series file under `base_path` into `dataset`, keyed by file name.
dataset = {}

# NOTE(review): machine-specific absolute path; adjust it before running elsewhere.
base_path = '/home/guilherme-sales/insight_samsung/snippets/mixed_bag_eval/MixedBag'

# os.listdir() returns entries in arbitrary order; sorting keeps the processing
# order (and therefore logs and outputs) reproducible across runs and machines.
for file in sorted(os.listdir(base_path)):
    file_path = os.path.join(base_path, file)
    # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which open() cannot read.
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r') as f:
        lines = f.read()
    dataset[file] = parse_number_list(lines)

In [77]:
def plot_silhouette_scores(k_values, scores):
    """Show a line chart of the silhouette score obtained for each number of clusters.

    Parameters:
    - k_values: cluster counts (x axis).
    - scores: silhouette score for each entry of `k_values` (y axis).
    """
    _, ax = plt.subplots(figsize=(8, 4))
    ax.plot(k_values, scores, marker='o')
    ax.set_title('k vs Silhouette Score')
    ax.set_xlabel('Número de Clusters (k)')
    ax.set_ylabel('Silhouette Score')
    ax.grid(True)
    plt.show()

In [78]:
def plot_dendrogram(Z, title='Dendrograma'):
    """Draw and show the dendrogram of a linkage matrix.

    Parameters:
    - Z: linkage matrix, passed straight to scipy's `dendrogram`.
    - title: figure title.
    """
    _, ax = plt.subplots(figsize=(10, 5))
    dendrogram(Z, ax=ax)
    ax.set_title(title)
    ax.set_xlabel('Samples')
    ax.set_ylabel('Distance')
    plt.show()

In [79]:
def plot_regime_bar(min_idx_per_segment, title='Regime Bar'):
    """Show a one-row colour bar with one cell per segment.

    Parameters:
    - min_idx_per_segment: sequence of integer ids, one per segment; each id is
      mapped to a colour of the 'tab10' colormap.
    - title: figure title.
    """
    _, ax = plt.subplots(figsize=(12, 1))
    # Wrapping in a list turns the 1-D sequence into a single-row image.
    ax.imshow([min_idx_per_segment], aspect='auto', cmap='tab10')
    ax.set_title(title)
    ax.set_xlabel('Segment Index')
    ax.set_yticks([])
    plt.show()

In [80]:
def _mean_centroids(segments_norm, labels, cluster_ids):
    """Return one row per id in `cluster_ids`: the mean of its members, or zeros if it has none."""
    centroids = np.zeros((len(cluster_ids), segments_norm.shape[1]))
    for row, cluster_id in enumerate(cluster_ids):
        members = segments_norm[labels == cluster_id]
        if len(members) > 0:
            centroids[row] = members.mean(axis=0)
    return centroids


def clustering_subsequences(segments_norm, num_clusters=None, method='kshape', linkage='ward', min_cluster_size=5, batch_size=100):
    """
    Cluster subsequences with KShape, Agglomerative, Hierarchical (scipy),
    HDBSCAN or MiniBatchKMeans.

    Parameters:
    - segments_norm: np.array (n_samples, subseq_length)
    - num_clusters: int, required for every method except 'hdbscan' (ignored there)
    - method: 'kshape', 'agglomerative', 'hierarchical', 'hdbscan', 'minibatchkmeans'
    - linkage: linkage criterion for 'agglomerative' and 'hierarchical'
    - min_cluster_size: used by HDBSCAN
    - batch_size: used by MiniBatchKMeans

    Returns:
    - labels: np.array with one cluster label per sample (-1 marks HDBSCAN noise)
    - centroids: 2-D np.array with one centroid per row. For 'hdbscan' the rows
      follow the sorted non-noise labels; for the other methods row i belongs
      to label i.

    Raises:
    - ValueError: unknown `method`, or `num_clusters` missing / < 1 for a
      method that needs it.
    """
    if method not in ('kshape', 'agglomerative', 'hierarchical', 'hdbscan', 'minibatchkmeans'):
        raise ValueError("Método inválido. Escolha 'kshape', 'agglomerative', 'hierarchical', 'hdbscan' ou 'minibatchkmeans'.")

    # Fail early with a clear message instead of an obscure error deep inside a library.
    if method != 'hdbscan' and (num_clusters is None or num_clusters < 1):
        raise ValueError(f"num_clusters must be a positive integer for method '{method}', got {num_clusters!r}.")

    segments_norm = np.asarray(segments_norm)

    if method == 'kshape':
        kshape = KShape(n_clusters=num_clusters, random_state=0)
        kshape.fit(segments_norm)
        labels = kshape.labels_
        centers = kshape.cluster_centers_
        # Flatten only the trailing axes: a bare squeeze() would also drop the
        # cluster axis when num_clusters == 1 and break row indexing.
        centroids = centers.reshape(centers.shape[0], -1)

    elif method == 'agglomerative':
        clustering = AgglomerativeClustering(n_clusters=num_clusters, linkage=linkage)
        labels = clustering.fit_predict(segments_norm)
        centroids = _mean_centroids(segments_norm, labels, range(num_clusters))

    elif method == 'hierarchical':
        distance_matrix = pdist(segments_norm, metric='euclidean')
        Z = scipy_linkage(distance_matrix, method=linkage)
        # fcluster labels start at 1; shift them so they start at 0 like the other methods.
        labels = fcluster(Z, t=num_clusters, criterion='maxclust') - 1
        centroids = _mean_centroids(segments_norm, labels, range(num_clusters))

    elif method == 'hdbscan':
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size, metric='euclidean')
        labels = clusterer.fit_predict(segments_norm)
        # Sorted ids (noise label -1 removed) give a deterministic row order.
        cluster_ids = [int(c) for c in np.unique(labels) if c != -1]
        centroids = _mean_centroids(segments_norm, labels, cluster_ids)

    else:  # 'minibatchkmeans'
        mbk = MiniBatchKMeans(n_clusters=num_clusters, batch_size=batch_size, random_state=0)
        mbk.fit(segments_norm)
        labels = mbk.labels_
        centroids = mbk.cluster_centers_

    return labels, centroids

In [81]:
def select_best_k(segments_norm, method='kshape', linkage='ward', k_range=(2, 10), plot=True):
    """Pick the number of clusters with the highest silhouette score.

    Parameters:
    - segments_norm: np.array (n_samples, subseq_length) to cluster.
    - method: clustering method name forwarded to clustering_subsequences.
    - linkage: linkage criterion forwarded to clustering_subsequences.
    - k_range: (k_min, k_max), both inclusive.
    - plot: when True, plot the score of every k that could be evaluated.

    Returns:
    - the k with the best silhouette score, or k_range[0] when no k could be
      evaluated (a RuntimeWarning is issued in that case).

    A k whose clustering or scoring raises an ordinary exception is skipped
    with a RuntimeWarning; KeyboardInterrupt / SystemExit are not intercepted.
    """
    best_score = float('-inf')
    best_k = k_range[0]
    k_values = []
    scores = []

    for k in range(k_range[0], k_range[1] + 1):
        try:
            labels, _ = clustering_subsequences(
                segments_norm,
                num_clusters=k,
                method=method,
                linkage=linkage
            )
            # The silhouette score is undefined for fewer than two clusters.
            if len(set(labels)) < 2:
                continue
            score = silhouette_score(segments_norm, labels)
        except Exception as exc:
            # Catch Exception only: a bare except would also swallow Ctrl-C.
            warnings.warn(
                f"select_best_k: skipping k={k} ({type(exc).__name__}: {exc})",
                RuntimeWarning,
            )
            continue

        k_values.append(k)
        scores.append(score)
        if score > best_score:
            best_score = score
            best_k = k

    if not k_values:
        warnings.warn(
            f"select_best_k: no k in {tuple(k_range)} could be evaluated; "
            f"falling back to k={best_k}",
            RuntimeWarning,
        )

    if plot and k_values:
        plot_silhouette_scores(k_values, scores)

    return best_k

In [82]:
def _distances_to_reference(segments, reference, distance_metric):
    """Return the distance from every row of `segments` to the vector `reference`."""
    if distance_metric == 'manhattan':
        return np.sum(np.abs(segments - reference), axis=1)
    if distance_metric == 'cosine':
        return cosine_distances(segments, reference.reshape(1, -1)).flatten()
    # 'euclidean' and any unrecognised metric name use the Euclidean norm.
    return np.linalg.norm(segments - reference, axis=1)


def find_snippets_clustering(ts, subseq_size, num_snippets=None, num_clusters=None, 
                              distance_metric='euclidean', weighted='only_profile',
                              clustering_method='kshape', clustering_linkage='ward',
                              min_cluster_size=5, batch_size=100,
                              auto_k_selection=False, k_range=(2, 10)):
    """
    Extract representative snippets from a time series by clustering its
    sliding-window subsequences.

    Steps: build every window of length `subseq_size` (stride 1), normalise
    them with TimeSeriesScalerMeanVariance, cluster them, take the member
    closest to each cluster centroid as the medoid, build a distance profile
    from every window to every medoid, and return the windows with the
    smallest (optionally weighted) minimum profile value.

    Parameters:
    - ts: 1-D sequence of numbers.
    - subseq_size: window length.
    - num_snippets: number of snippets to return; defaults to the number of
      clusters (clusters found, for 'hdbscan').
    - num_clusters: number of clusters (unused by 'hdbscan'; replaced by the
      selected k when `auto_k_selection` is True).
    - distance_metric: 'euclidean', 'manhattan' or 'cosine'; anything else
      falls back to Euclidean.
    - weighted: 'size', 'density' or 'size_density' divide each medoid's
      profile by the cluster size, the mean member-to-centroid distance, or
      their product; any other value leaves the profile unweighted.
    - clustering_method, clustering_linkage, min_cluster_size, batch_size:
      forwarded to clustering_subsequences.
    - auto_k_selection, k_range: when enabled (and the method is not
      'hdbscan'), choose `num_clusters` with select_best_k over `k_range`.

    Returns (snippets, match_count_coverages, profile_area, cover_area, metrics):
    - snippets: list of (window_index, raw_window) pairs.
    - match_count_coverages: per medoid, the fraction of windows whose distance
      is within 25% of that medoid's maximum distance.
    - profile_area: per medoid, the fraction of windows nearest to it.
    - cover_area: sum over windows of the distance to the nearest medoid.
    - metrics: dict with 'execution_time_sec', 'memory_usage_mb', 'peak_memory_mb'.

    Raises:
    - ValueError: if the series is shorter than `subseq_size`, or the
      clustering produced no cluster at all.

    Side effects: prints the selected k, shows the dendrogram ('hierarchical')
    and the regime bar, and starts/stops tracemalloc.
    """
    # Fraction of a medoid's maximum distance under which a window counts as a match.
    match_fraction = 0.25

    start = datetime.now()
    tracemalloc.start()
    try:
        ts = np.array(ts, dtype=float)
        n_segments = len(ts) - subseq_size + 1
        if subseq_size < 1 or n_segments < 1:
            raise ValueError(
                f"subseq_size must be between 1 and len(ts)={len(ts)}, got {subseq_size}."
            )

        segments_raw = np.array([ts[i:i + subseq_size] for i in range(n_segments)])
        scaled = TimeSeriesScalerMeanVariance().fit_transform(segments_raw)
        # Keep the sample axis explicit: squeeze() would also drop it when
        # there is a single window.
        segments_norm = np.asarray(scaled).reshape(n_segments, -1)

        # Automatic selection of k
        if clustering_method not in ['hdbscan'] and auto_k_selection:
            num_clusters = select_best_k(
                segments_norm, 
                method=clustering_method, 
                linkage=clustering_linkage, 
                k_range=k_range,
                plot=True
            )
            print(f"[INFO] Melhor k selecionado: {num_clusters}")

        # Clustering
        labels, centroids = clustering_subsequences(
            segments_norm, 
            num_clusters=num_clusters, 
            method=clustering_method, 
            linkage=clustering_linkage, 
            min_cluster_size=min_cluster_size,
            batch_size=batch_size
        )
        labels = np.asarray(labels)

        # Dendrogram, for the hierarchical method only
        if clustering_method == 'hierarchical':
            distance_matrix = pdist(segments_norm, metric='euclidean')
            Z = scipy_linkage(distance_matrix, method=clustering_linkage)
            plot_dendrogram(Z)

        # Sorted cluster ids with the HDBSCAN noise label (-1) removed.
        cluster_ids = [int(c) for c in np.unique(labels) if c != -1]

        # Default for num_snippets
        if num_snippets is None:
            if clustering_method == 'hdbscan':
                num_snippets = len(cluster_ids)
            else:
                num_snippets = num_clusters

        medoides = []
        cluster_sizes = []
        cluster_densities = []

        for position, cluster_id in enumerate(cluster_ids):
            cluster_idxs = np.where(labels == cluster_id)[0]
            # HDBSCAN centroids are stored one row per found cluster, in label
            # order; every other method stores the centroid of label i in row i.
            # Indexing by enumerate position there would pick the wrong row
            # whenever some label id is unused (e.g. an empty k-means cluster).
            centroid_row = position if clustering_method == 'hdbscan' else cluster_id
            centroide = centroids[centroid_row]

            dists = _distances_to_reference(segments_norm[cluster_idxs], centroide, distance_metric)

            cluster_sizes.append(len(cluster_idxs))
            cluster_densities.append(np.mean(dists))
            medoid_idx = cluster_idxs[np.argmin(dists)]
            medoides.append((medoid_idx, segments_norm[medoid_idx]))

        if not medoides:
            raise ValueError("Clustering produced no clusters (all points are noise); cannot extract snippets.")

        mpdist_profile = np.full(n_segments, np.inf)
        all_profiles = []
        match_count_coverages = []

        for i, (_, medoid) in enumerate(medoides):
            dists = _distances_to_reference(segments_norm, medoid, distance_metric)

            all_profiles.append(dists)
            threshold = np.max(dists) * match_fraction
            match_count_coverages.append(np.sum(dists <= threshold) / len(dists))

            if weighted == 'size':
                weight = cluster_sizes[i]
            elif weighted == 'density':
                weight = cluster_densities[i]
            elif weighted == 'size_density':
                weight = cluster_sizes[i] * cluster_densities[i]
            else:
                weight = 1

            # A zero density (all members equal to the centroid) would divide
            # by zero and poison the profile with NaN/inf: fall back to no weighting.
            if not weight > 0:
                weight = 1

            mpdist_profile = np.minimum(mpdist_profile, dists / weight)

        all_profiles_array = np.array(all_profiles)
        min_idx_per_segment = np.argmin(all_profiles_array, axis=0)
        profile_area = [(min_idx_per_segment == i).sum() / len(min_idx_per_segment) for i in range(len(medoides))]
        min_profile = np.min(all_profiles_array, axis=0)
        cover_area = np.sum(min_profile)

        top_idxs = np.argsort(mpdist_profile)[:num_snippets]
        snippets = [(idx, segments_raw[idx]) for idx in top_idxs]

        # Regime bar
        plot_regime_bar(min_idx_per_segment)

        current, peak = tracemalloc.get_traced_memory()
    finally:
        # Always stop tracing, otherwise a failure leaves tracemalloc running
        # (and slowing down) the rest of the session.
        tracemalloc.stop()

    elapsed = (datetime.now() - start).total_seconds()

    metrics = {
        'execution_time_sec': elapsed,
        'memory_usage_mb': round(current / (1024 ** 2), 2),
        'peak_memory_mb': round(peak / (1024 ** 2), 2)
    }

    return snippets, match_count_coverages, profile_area, cover_area, metrics


In [None]:
def run_all_clusterings_for_series(series_name, series_list, subseq_size, k_range=(2, 10),
                                   base_dir='./resultados', num_snippets=5):
    """
    Run the snippet extraction for several time series and clustering methods,
    storing the results under <base_dir>/<series>/<method>/.

    For every series and method the directory receives 'snippets.json',
    'metrics.json', 'regime_bar.png', plus 'dendrograma.png' ('hierarchical')
    and 'silhouette.png' (when a k search produced a plot). The images are the
    figures drawn by find_snippets_clustering itself, so they show the real
    regime assignment and the same scores used to select k, and no clustering
    is recomputed just for plotting.

    Parameters:
    - series_name: list of series names (a '.txt' in the name is removed to
      build the directory name); paired positionally with `series_list`.
    - series_list: list of time series (list of np.array or list of floats).
    - subseq_size: subsequence (window) length.
    - k_range: (k_min, k_max) inclusive range searched for k.
    - base_dir: root output directory.
    - num_snippets: number of snippets requested per run.
    """
    methods = ['agglomerative', 'hierarchical', 'hdbscan', 'minibatchkmeans']
    # Output file for each figure, keyed by the title the plotting helpers give it.
    figure_files = {
        'Regime Bar': 'regime_bar.png',
        'Dendrograma': 'dendrograma.png',
        'k vs Silhouette Score': 'silhouette.png',
    }
    os.makedirs(base_dir, exist_ok=True)

    # Silence plt.show() so the figures drawn inside find_snippets_clustering
    # stay open and can be saved; everything is restored in the finally below.
    was_interactive = plt.isinteractive()
    original_show = plt.show
    plt.ioff()
    plt.show = lambda *args, **kwargs: None
    try:
        for serie_name, ts in zip(series_name, series_list):
            serie_label = serie_name.replace('.txt', '')
            serie_dir = os.path.join(base_dir, serie_label)
            os.makedirs(serie_dir, exist_ok=True)

            print(f"\n[INFO] Processando Série {serie_label}...")

            for method in methods:
                print(f"[INFO] Executando para método: {method.upper()}")
                output_dir = os.path.join(serie_dir, method)
                os.makedirs(output_dir, exist_ok=True)

                # Default settings
                kwargs = {
                    'ts': ts,
                    'subseq_size': subseq_size,
                    'clustering_method': method,
                    'auto_k_selection': method not in ['hdbscan'],
                    'k_range': k_range,
                    'num_snippets': num_snippets,
                }

                if method == 'hdbscan':
                    kwargs['min_cluster_size'] = 5
                if method == 'minibatchkmeans':
                    kwargs['batch_size'] = 50

                figures_before = set(plt.get_fignums())
                saved_files = set()
                try:
                    snippets, _, _, _, metrics = find_snippets_clustering(**kwargs)

                    # Save the figures created by this run, recognised by title.
                    for num in plt.get_fignums():
                        if num in figures_before:
                            continue
                        fig = plt.figure(num)
                        title = fig.axes[0].get_title() if fig.axes else ''
                        filename = figure_files.get(title)
                        if filename is not None:
                            save_plot(fig, os.path.join(output_dir, filename))
                            saved_files.add(filename)
                finally:
                    # Close whatever this run left open (also on failure), so
                    # figures do not pile up across series and methods.
                    for num in set(plt.get_fignums()) - figures_before:
                        plt.close(num)

                if 'regime_bar.png' not in saved_files:
                    warnings.warn(
                        f"No regime bar figure was produced for {serie_label}/{method}.",
                        RuntimeWarning,
                    )

                # Save snippets and metrics
                save_results(output_dir, snippets, metrics)

                print(f"[INFO] Resultados salvos em: {output_dir}")
    finally:
        plt.show = original_show
        # Restore the interactive state found on entry instead of forcing it on.
        if was_interactive:
            plt.ion()

In [None]:
# Run every clustering method over all loaded series (windows of 150 points, k searched in 2..25).
run_all_clusterings_for_series(
    series_name=list(dataset),
    series_list=list(dataset.values()),
    subseq_size=150,
    k_range=(2, 25),
)


[INFO] Processando Série EEGRat2_10_1000_75...
[INFO] - Executando para método: AGGLOMERATIVE
[INFO] Melhor k selecionado: 8
[INFO]- Resultados salvos em: ./resultados/EEGRat2_10_1000_75/agglomerative
[INFO] - Executando para método: HIERARCHICAL
[INFO] Melhor k selecionado: 8
[INFO]- Resultados salvos em: ./resultados/EEGRat2_10_1000_75/hierarchical
[INFO] - Executando para método: HDBSCAN
[INFO]- Resultados salvos em: ./resultados/EEGRat2_10_1000_75/hdbscan
[INFO] - Executando para método: MINIBATCHKMEANS
[INFO] Melhor k selecionado: 8
[INFO]- Resultados salvos em: ./resultados/EEGRat2_10_1000_75/minibatchkmeans
