# Etabs DTW clustering

## Imports

In [181]:
import pandas as pd
import numpy as np
from tslearn.clustering import TimeSeriesKMeans
from tslearn.metrics import cdist_dtw
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

## Load data

In [182]:
# Hospitals ("etabs") whose daily volume series are loaded below.
etabs = [
    'CH Beaune',
    'CH Chatillon Montbard',
    'CH Chaumont',
    'CH Langres',
    'CH privé Dijon',
    'CH Semur',
    'CHU Besançon',
    'CHU Dijon',
    'HNFC',
]

# Debug switches:
#   fake_etabs -> skip the Excel exports, generate a random 'Total' series for
#                 every hospital and add a few extra synthetic hospitals;
#   fake_data  -> add random 'hospit', 'air', 'temperature', 'accident' columns.
fake_data = False
fake_etabs = False

if fake_etabs:
    etabs.extend(['CH fake ' + str(i) for i in range(len(etabs) // 4)])


data = []
for seed, etab in enumerate(etabs):

    # fetching data
    if fake_etabs:
        # Seed with the hospital's position so each fake series is reproducible.
        np.random.seed(seed)
        frame = pd.DataFrame(
            {'date_entree': pd.date_range(start='2018-01-01', end='2022-12-31', freq='D')}
        )
        frame['Total'] = np.random.randint(0, 100, size=len(frame))
    else:
        frame = pd.read_excel(
            '../data/features/hopitalfeatures/Export complet ' + etab + '.xlsx',
            sheet_name='Volumes',
            usecols='A:B',
        )

    # fake data
    if fake_data:
        np.random.seed(seed)
        n_rows = len(frame)
        # Replace zero totals so the upper bound of the random draw below is positive.
        frame.loc[frame["Total"] == 0, "Total"] = 15
        frame["hospit"] = frame["Total"].values - np.random.randint(
            0, frame["Total"].min(), size=n_rows
        )
        frame["air"] = np.random.normal(0, 10, size=n_rows)
        frame["temperature"] = np.random.normal(0, 30, size=n_rows)
        frame["accident"] = np.random.normal(120, 240, size=n_rows)

    # Keep only the rows dated 2018-01-01 or later.
    data.append(frame.loc[frame['date_entree'] >= '2018-01-01'])

print(data[-1])



     date_entree  Total
731   2018-01-01    217
732   2018-01-02    280
733   2018-01-03    251
734   2018-01-04    239
735   2018-01-05    217
...          ...    ...
2917  2023-12-27    330
2918  2023-12-28    280
2919  2023-12-29    282
2920  2023-12-30    286
2921  2023-12-31    192

[2191 rows x 2 columns]


## Format data

In [183]:
# Build a 3D array of shape (n_hospitals, n_timepoints, n_features): one
# (n_timepoints, n_features) block per hospital, with the date column removed.
hospital_data = [frame.drop(columns='date_entree') for frame in data]

# The frames can only be stacked into one rectangular array if every hospital
# has the same number of rows and columns; fail early with a readable message
# instead of an obscure numpy error.
shapes = {etab: frame.shape for etab, frame in zip(etabs, hospital_data)}
if len(set(shapes.values())) > 1:
    raise ValueError(f"Hospital series have inconsistent shapes: {shapes}")

time_series_data = np.stack([frame.to_numpy() for frame in hospital_data])

## Scale data

In [184]:
# time_series_data is unpacked as (n_samples, n_timepoints, n_features).
# StandardScaler works on 2D input, so flatten the first two axes into one
# long table with a single column per feature.
n_samples, n_timepoints, n_features = time_series_data.shape
reshaped_data = time_series_data.reshape(-1, n_features)

# Fit the scaler on the flattened table and standardise it; the statistics are
# therefore computed per feature over all samples and timepoints together.
scaler = StandardScaler()
scaled_data = scaler.fit(reshaped_data).transform(reshaped_data)

# Restore the original 3D layout and use the scaled series from here on.
scaled_time_series_data = scaled_data.reshape((n_samples, n_timepoints, n_features))
time_series_data = scaled_time_series_data

## Optimal number of clusters (silhouette score)

In [185]:
# Candidate cluster counts. silhouette_score requires between 2 and
# n_samples - 1 distinct labels, hence the exclusive upper bound len(etabs).
range_n_clusters = range(2, len(etabs))

# The pairwise DTW distance matrix depends only on the series, not on the
# clustering, so compute it once rather than once per candidate cluster count.
distance_matrix = cdist_dtw(time_series_data)

# (n_clusters, silhouette score) pairs, in the order of range_n_clusters
silhouette_scores = []

# One label array per candidate cluster count, in the order of range_n_clusters
labels = []

# Loop over cluster counts to find the best-scoring one
for n_clusters in range_n_clusters:
    # Apply DTW KMeans clustering
    km_dtw = TimeSeriesKMeans(n_clusters=n_clusters, metric="dtw", random_state=0)
    labels.append(km_dtw.fit_predict(time_series_data))

    # Score this clustering against the precomputed DTW distances
    silhouette_avg = silhouette_score(distance_matrix, labels[-1], metric="precomputed")
    silhouette_scores.append((n_clusters, silhouette_avg))

    print(f"Number of clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.3f}")

if not silhouette_scores:
    raise ValueError("At least 3 hospitals are needed to evaluate a clustering")

# Select the number of clusters with the highest silhouette score
best_n_clusters = max(silhouette_scores, key=lambda x: x[1])[0]

# Look the labels up by position in range_n_clusters instead of a hard-coded
# "- 2" offset, so the lookup stays right if the range start ever changes.
best_labels = labels[range_n_clusters.index(best_n_clusters)]
print(f"\nOptimal number of clusters: {best_n_clusters} -> {best_labels}")

Number of clusters: 2, Silhouette Score: 0.868
Number of clusters: 3, Silhouette Score: 0.690
Number of clusters: 4, Silhouette Score: 0.364
Number of clusters: 5, Silhouette Score: 0.256
Number of clusters: 6, Silhouette Score: 0.211
Number of clusters: 7, Silhouette Score: 0.115
Number of clusters: 8, Silhouette Score: 0.081

Optimal number of clusters: 2 -> [0 0 0 0 0 0 1 1 1]


## Output clusters of etabs

In [187]:
# Output the cluster of each hospital for the best-scoring cluster count.
# `labels` holds one label array per candidate cluster count (aligned with
# range_n_clusters), so pick the array that belongs to best_n_clusters first;
# indexing `labels` directly by hospital position would yield whole arrays,
# which cannot be used as dictionary keys.
best_labels = labels[range_n_clusters.index(best_n_clusters)]
assert len(best_labels) == len(etabs), "expected one cluster label per hospital"

# Group hospital names by cluster label (cast to int for plain dict keys)
clusters = {}
for etab, label in zip(etabs, best_labels):
    clusters.setdefault(int(label), []).append(etab)

for cluster in clusters:
    print("Cluster", cluster)
    print(clusters[cluster])


Cluster 0
['CH Beaune', 'CH Chatillon Montbard', 'CH Chaumont', 'CH Langres', 'CH privé Dijon', 'CH Semur']
Cluster 1
['CHU Besançon', 'CHU Dijon', 'HNFC']


In [188]:
# NOTE(review): disabled alternative approach — plain KMeans on per-hospital
# summary statistics instead of DTW on the full series. It would not run as
# written: `mean` and `std_dev` are Python lists, which have no `.mean()`
# method, and the load cell never creates a 'population' column ('air' and
# 'temperature' exist only when fake_data is enabled). It would also rebind
# `labels`, which the cells above use. Fix these points before re-enabling.
#
# from sklearn.cluster import KMeans
# from sklearn.preprocessing import StandardScaler

# # Placeholder for feature extraction (mean, variance for simplicity)
# list_features = ['Total', 'air', 'temperature', 'population']
# features = []
# for df in data:
#     mean = []
#     std_dev = []
#     for feature in list_features:
#         mean.append(df[feature].mean())   # Mean for each time series
#         std_dev.append(df[feature].std())
#     features.append([mean.mean(), std_dev.mean()])  # Aggregating into a feature vector

# # Convert to DataFrame
# features_df = pd.DataFrame(features, columns=['Mean', 'Std_Dev'])

# # Normalize features
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features_df)

# # Apply K-means clustering
# kmeans = KMeans(n_clusters=3, random_state=0)
# labels = kmeans.fit_predict(features_scaled)

# # Assign labels back to the hospitals
# features_df['Cluster'] = labels

# # Display results
# print(features_df)