In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
from collections import Counter
import scipy.stats as stats # for the breakpoints in SAX
from scipy.stats import norm
from dtw import dtw
import os
from scipy import signal
import seaborn as sns
from statsmodels.tsa.stattools import acf

##Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, auc

## TSLEARN
from tslearn.neighbors import KNeighborsTimeSeriesClassifier
from tslearn.utils import to_time_series_dataset

## Custom : code implémenté par nous-même
from Symbol import SYMBOLS
from SFA import *
from ASTRIDE import *
from SAX_transf import *
from distances import MINDIST, TRENDIST
import utils
import warnings 
import sax_clustering

warnings.filterwarnings('ignore')

Importing the dtw module. When using in academic works please cite:
  T. Giorgino. Computing and Visualizing Dynamic Time Warping Alignments in R: The dtw Package.
  J. Stat. Soft., doi:10.18637/jss.v031.i07.



In [2]:
# Load the three datasets used in this notebook. `utils.generate_data` returns
# six objects per dataset, unpacked below in the order it returns them.
# NOTE(review): the upper-case X_* / lower-case x_* pairs are presumably two
# representations of the same split — confirm against utils.generate_data.
(
    ECG200_X_train, ECG200_x_train, ECG200_y_train,
    ECG200_X_test, ECG200_x_test, ECG200_y_test,
) = utils.generate_data(type="ECG200")

(
    acsf1_X_train, acsf1_x_train, acsf1_y_train,
    acsf1_X_test, acsf1_x_test, acsf1_y_test,
) = utils.generate_data(type="acsf1")

(
    catsanddogs_X_train, catsanddogs_x_train, catsanddogs_y_train,
    catsanddogs_X_test, catsanddogs_x_test, catsanddogs_y_test,
) = utils.generate_data(type="catsanddogs")

In [3]:
def _medoid_index(members, pair_distance):
    """Return the member whose summed distance to every member is smallest.

    Ties are resolved in favour of the earliest member because only a strictly
    smaller sum replaces the current best.
    """
    best_idx = members[0]
    best_sum = float("inf")
    for idx in members:
        total = sum(pair_distance(idx, other) for other in members)
        if total < best_sum:
            best_sum = total
            best_idx = idx
    return best_idx


def k_means_sax(sax, max_iter, num_cluster, random_state=42):
    """
    Cluster symbolic observations with a medoid-based variant of K-Means.

    Centroids are always indices of actual observations: each observation is
    assigned to its closest centroid, then every cluster's centroid is replaced
    by the member minimising the sum of distances to the other members.

    Parameters:
    sax (SYMBOLS): object as returned by SYMBOLS(). The attributes read here are
        `symbolized_x_train`, `method`, `alphabet_size`, `train_ts_length` and,
        for the "TSAX" method, `angle_breakpoint_alphabet_size`.
    max_iter (int): maximum number of assignment/update iterations.
    num_cluster (int): number of clusters; must satisfy
        1 <= num_cluster <= number of observations.
    random_state (int): seed of the private random generator used for the
        initial centroids and for re-seeding empty clusters. The default (42)
        draws the same initial centroids as seeding NumPy's global generator
        with 42, without modifying that global generator.

    Returns:
    tuple: A tuple containing:
        - the indices of the final centroids (list of int),
        - the cluster label of every observation (np.ndarray of shape (n,)),
        - the mean distance between observations and their cluster centroid,
        - the mean pairwise distance between centroids (0.0 with one cluster).

    Raises:
    ValueError: if num_cluster is outside [1, number of observations].
    """
    # NOTE(review): row 1 of `symbolized_x_train` is taken as the collection of
    # symbolic words, one entry per observation — confirm against SYMBOLS.
    data = sax.symbolized_x_train.iloc[1, :]
    num_samples = data.shape[0]

    if not 1 <= num_cluster <= num_samples:
        raise ValueError(
            f"num_cluster must be between 1 and {num_samples}, got {num_cluster}"
        )

    # Select the distance callable once instead of branching at every call site.
    if sax.method == "TSAX":
        distance_fn = TRENDIST(
            sax.alphabet_size, sax.train_ts_length, sax.angle_breakpoint_alphabet_size
        ).tsax_mindist
    else:
        distance_fn = MINDIST(sax.alphabet_size, sax.train_ts_length).mindist

    # The same ordered pairs are requested again and again across iterations,
    # so memoise them. Keys are ordered pairs: symmetry is not assumed, only
    # that the distance is a deterministic function of its two arguments.
    distance_cache = {}

    def pair_distance(i, j):
        key = (int(i), int(j))
        if key not in distance_cache:
            distance_cache[key] = distance_fn(data.iloc[key[0]], data.iloc[key[1]])
        return distance_cache[key]

    # Step 1: initialisation with distinct observation indices. A private
    # generator keeps the run reproducible without reseeding the global one.
    rng = np.random.RandomState(random_state)
    centroids = [int(i) for i in rng.choice(num_samples, num_cluster, replace=False)]
    labels = np.zeros(num_samples, dtype=int)

    for iteration in range(max_iter):
        # Step 2: assign every observation to its closest centroid
        # (np.argmin keeps the first centroid on ties).
        for i in range(num_samples):
            labels[i] = np.argmin([pair_distance(i, c) for c in centroids])

        # Step 3: move each centroid to the medoid of its cluster.
        new_centroids = [None] * num_cluster
        empty_clusters = []
        for k in range(num_cluster):
            members = np.where(labels == k)[0]
            if len(members) > 0:
                new_centroids[k] = int(_medoid_index(members, pair_distance))
            else:
                empty_clusters.append(k)

        # Empty clusters are re-seeded at random, but only from indices that are
        # not already centroids: a duplicated centroid can never win the argmin
        # tie-break above, so its cluster would stay empty.
        for k in empty_clusters:
            taken = {c for c in new_centroids if c is not None}
            candidates = [i for i in range(num_samples) if i not in taken]
            new_centroids[k] = int(rng.choice(candidates))

        # Convergence: the centroids did not move during this iteration.
        if centroids == new_centroids:
            print(f"Convergence atteinte après {iteration + 1} itérations.")
            break

        centroids = new_centroids

    # Note: if max_iter is exhausted without convergence, `labels` still reflect
    # the assignment made against the centroids of the last iteration's start.

    # Mean distance between each observation and the centroid of its cluster.
    intra_cluster_distances = [
        pair_distance(idx, centroids[k])
        for k in range(num_cluster)
        for idx in np.where(labels == k)[0]
    ]
    intra_cluster_mean_distance = (
        np.mean(intra_cluster_distances) if intra_cluster_distances else 0.0
    )

    # Mean distance over all unordered pairs of centroids.
    inter_cluster_distances = [
        pair_distance(centroids[i], centroids[j])
        for i in range(num_cluster)
        for j in range(i + 1, num_cluster)
    ]
    inter_cluster_mean_distance = (
        np.mean(inter_cluster_distances) if inter_cluster_distances else 0.0
    )

    return centroids, labels, intra_cluster_mean_distance, inter_cluster_mean_distance

In [None]:
# Sweep the number of clusters on the TSAX symbolisation built from the
# Cats and Dogs train/test data and record, for each k, the mean intra- and
# inter-cluster distances returned by k_means_sax.
sax = SYMBOLS(
    catsanddogs_x_train,
    catsanddogs_x_test,
    'TSAX',
    num_segments=10,
    alphabet_size=10,
)
SAX_list_k_means_intra_cluster, SAX_list_k_means_inter_cluster = [], []

# k = 1..10 in steps of 1, then 12..20 in steps of 2.
for k in [*range(1, 11), *range(12, 21, 2)]:
    (
        _,
        _,
        intra_cluster_mean_distance,
        inter_cluster_mean_distance,
    ) = k_means_sax(sax, 20, k)
    SAX_list_k_means_intra_cluster.append(intra_cluster_mean_distance)
    SAX_list_k_means_inter_cluster.append(inter_cluster_mean_distance)
    print('*')  # one progress marker per completed k

Convergence atteinte après 2 itérations.
*


In [8]:
# Persist the sweep results (np.save appends ".npy" to these file names).
# NOTE(review): the files are tagged "ECG200", but the visible cell that fills
# these lists runs on the Cats and Dogs data — confirm which dataset is meant
# before relying on the saved arrays.
np.save(
    file="ECG200_k_means_1_20_intra_cluster_TSAX",
    arr=SAX_list_k_means_intra_cluster,
)
np.save(
    file="ECG200_k_means_1_20_inter_cluster_TSAX",
    arr=SAX_list_k_means_inter_cluster,
)

In [None]:
# store the dataframes in a dictionnary
# Store the datasets in a dictionary: display name -> (x_train, x_test) pair.
typical_df = dict([
    ("ECG200", (ECG200_x_train, ECG200_x_test)),
    ("ACSF1", (acsf1_x_train, acsf1_x_test)),
    ("Cats and Dogs", (catsanddogs_x_train, catsanddogs_x_test)),
])

In [None]:
# For every dataset in `typical_df`, sweep the number of clusters on its SAX
# symbolisation and save the intra-/inter-cluster curves under
# "<dataset name>_list_k_means_{intra,inter}_cluster".
for index, df_name in enumerate(typical_df):  # `index` is not used below
    df = typical_df[df_name]
    train_split = df[0]
    test_split = df[1]

    list_intra_cluster = f"{df_name}_list_k_means_intra_cluster"
    list_inter_cluster = f"{df_name}_list_k_means_inter_cluster"
    SAX_list_k_means_intra_cluster, SAX_list_k_means_inter_cluster = [], []

    sax = SYMBOLS(
        train_split,
        test_split,
        'SAX',
        num_segments=10,
        alphabet_size=10,
    )

    # k = 1, then the even values 2..20.
    for k in [1, *range(2, 21, 2)]:
        (
            _,
            _,
            intra_cluster_mean_distance,
            inter_cluster_mean_distance,
        ) = k_means_sax(sax, 30, k)
        SAX_list_k_means_intra_cluster.append(intra_cluster_mean_distance)
        SAX_list_k_means_inter_cluster.append(inter_cluster_mean_distance)

        # Both files are rewritten after every k, so partial results are on
        # disk even if the sweep is interrupted.
        np.save(file=list_intra_cluster, arr=SAX_list_k_means_intra_cluster)
        np.save(file=list_inter_cluster, arr=SAX_list_k_means_inter_cluster)
        print('*')