In [None]:
import os
import numpy as np
import pandas as pd
from glob import glob
from tqdm import tqdm
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from scipy.stats import skew, kurtosis
from sklearn.decomposition import PCA

# Directory (relative to the working directory) that is passed to
# prepare_p_sensor_data(), which reads every *.csv file directly inside it.
train_path = './aggregated_7_480/'

In [None]:
def prepare_p_sensor_data(path):
    """Build a standardized summary-statistics matrix for every P sensor column.

    Every ``*.csv`` file directly inside ``path`` is read, and each column whose
    name starts with ``'P'`` and does not end with ``'_flag'`` contributes one
    row of eight features: mean, median, standard deviation, skewness,
    kurtosis, 25th percentile, 75th percentile and range (max - min).

    Parameters
    ----------
    path : str
        Directory containing the CSV files.

    Returns
    -------
    p_values_scaled : numpy.ndarray
        One row per sensor column and eight feature columns, standardized
        feature-wise with ``StandardScaler`` fitted on this same data.
    sensor_ids : list of str
        One label per row, formatted ``"TRAIN_A_<column>"`` or
        ``"TRAIN_B_<column>"``, in the same order as the rows.

    Raises
    ------
    ValueError
        If no CSV file or no matching P sensor column is found under ``path``.
    """
    # glob() yields paths in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order (and the seeded clustering that consumes it)
    # reproducible from one machine/run to the next.
    file_list = sorted(glob(os.path.join(path, '*.csv')))
    p_series = []
    sensor_ids = []

    for file in tqdm(file_list, desc="Processing files", leave=False):
        df = pd.read_csv(file)
        p_sensors = [col for col in df.columns if col.startswith('P') and not col.endswith('_flag')]

        # The tag is matched against the whole path string; any file whose path
        # does not contain 'TRAIN_A' is labelled 'TRAIN_B'.
        dataset_tag = 'TRAIN_A' if 'TRAIN_A' in file else 'TRAIN_B'

        for p_sensor in p_sensors:
            # Extract various time series features
            p_data = df[p_sensor].values
            first_quartile, third_quartile = np.percentile(p_data, [25, 75])
            features = [
                np.mean(p_data),       # Mean
                np.median(p_data),     # Median
                np.std(p_data),        # Standard Deviation
                skew(p_data),          # Skewness
                kurtosis(p_data),      # Kurtosis
                first_quartile,        # 1st Quartile
                third_quartile,        # 3rd Quartile
                np.max(p_data) - np.min(p_data)  # Range
            ]

            p_series.append(features)
            sensor_ids.append(f"{dataset_tag}_{p_sensor}")

    # Fail with an explicit message instead of an opaque shape error raised
    # from inside the scaler when there is nothing to scale.
    if not p_series:
        raise ValueError(f"No P sensor columns found in CSV files under {path!r}")

    # Scaling
    scaler = StandardScaler()
    p_values_scaled = scaler.fit_transform(p_series)

    return p_values_scaled, sensor_ids

def calculate_silhouette_scores(p_values, max_clusters=15):
    """Score K-means, hierarchical and Gaussian-mixture clusterings by silhouette.

    For each method and each cluster count from 2 to ``max_clusters``
    (inclusive) the estimator is fitted on ``p_values`` and the silhouette
    score of the resulting labels is recorded.

    Parameters
    ----------
    p_values : array-like
        Feature matrix handed to every estimator's ``fit_predict`` and to
        ``silhouette_score``.
    max_clusters : int, default 15
        Largest cluster count to try.

    Returns
    -------
    dict
        Maps ``'K-means'``, ``'Hierarchical'`` and ``'Gaussian Mixture'`` to a
        list of scores; position ``k`` in a list belongs to ``k + 2`` clusters.
        A 0 is stored when the fit collapses to a single label or when any
        step raises (the error is printed), so a 0 is not necessarily a real
        silhouette value.
    """
    # Each entry: (estimator, name of the parameter controlling cluster count)
    candidates = {
        'K-means': (KMeans(n_init=10, random_state=42), 'n_clusters'),
        'Hierarchical': (AgglomerativeClustering(), 'n_clusters'),
        'Gaussian Mixture': (GaussianMixture(random_state=42), 'n_components'),
    }

    # One (initially empty) score list per method
    silhouette_scores = {name: [] for name in candidates}

    for method_name, (estimator, size_param) in candidates.items():
        scores_for_method = silhouette_scores[method_name]

        for n_clusters in range(2, max_clusters + 1):
            try:
                # Reconfigure the shared estimator for this cluster count
                estimator.set_params(**{size_param: n_clusters})
                labels = estimator.fit_predict(p_values)

                # Silhouette needs at least two distinct labels
                if len(np.unique(labels)) > 1:
                    score = silhouette_score(p_values, labels)
                else:
                    score = 0
            except Exception as e:
                print(f"Error with {method_name}, {n_clusters} clusters: {e}")
                score = 0

            scores_for_method.append(score)

    return silhouette_scores

In [None]:
# Build the scaled per-sensor feature matrix and the matching sensor labels
p_values, sensor_ids = prepare_p_sensor_data(train_path)

# Silhouette score for every (method, number of clusters) combination
silhouette_scores = calculate_silhouette_scores(p_values)

# One line per clustering method; the first score belongs to 2 clusters
fig, ax = plt.subplots(figsize=(12, 6))
for method, scores in silhouette_scores.items():
    cluster_counts = range(2, len(scores) + 2)
    ax.plot(cluster_counts, scores, marker='o', label=method)

ax.set_title('Silhouette Score by Number of Clusters and Method')
ax.set_xlabel('Number of Clusters')
ax.set_ylabel('Silhouette Score')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
# Pick the method whose highest silhouette score beats the other methods'
# highest scores; a method with no scores at all is ranked as -1.
best_method, best_scores = max(
    silhouette_scores.items(),
    key=lambda entry: -1 if len(entry[1]) == 0 else max(entry[1]),
)

# Scores start at 2 clusters, so list position p corresponds to p + 2 clusters
best_n_clusters = best_scores.index(max(best_scores)) + 2
print(f"\nBest Clustering Method: {best_method}")
print(f"Best Number of Clusters: {best_n_clusters}")

# The per-sensor assignment is only listed when K-means is the winner
if best_method == 'K-means':
    kmeans = KMeans(n_clusters=best_n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(p_values)

    print("\nSensor Cluster Mapping:")
    for position, cluster in enumerate(cluster_labels):
        if position >= len(sensor_ids):
            break
        sensor_id = sensor_ids[position]
        print(f"{sensor_id}: Cluster {cluster}")

In [None]:
# Feature distribution visualization
feature_names = [
    'Mean', 'Median', 'Std Deviation',
    'Skewness', 'Kurtosis', '1st Quartile',
    '3rd Quartile', 'Range'
]

# best_n_clusters was chosen for best_method, so refit that same kind of
# estimator (with the same settings used during scoring) instead of always
# falling back to K-means; otherwise the plots below would show a partition
# that was never evaluated.
if best_method == 'Hierarchical':
    final_clusterer = AgglomerativeClustering(n_clusters=best_n_clusters)
elif best_method == 'Gaussian Mixture':
    final_clusterer = GaussianMixture(n_components=best_n_clusters, random_state=42)
else:
    # K-means path: identical to the previous behaviour, and the `kmeans`
    # name stays bound in case anything further down relies on it.
    kmeans = KMeans(n_clusters=best_n_clusters, n_init=10, random_state=42)
    final_clusterer = kmeans
cluster_labels = final_clusterer.fit_predict(p_values)

# Boxplot for each feature: a 2x4 grid, one panel per entry in feature_names,
# one box per cluster drawn at x = cluster index.
box_fig, box_axes = plt.subplots(2, 4, figsize=(20, 10))
for i, (ax, feature_name) in enumerate(zip(box_axes.flat, feature_names)):
    for cluster in range(best_n_clusters):
        # Rows assigned to this cluster, column i = current feature
        cluster_data = p_values[cluster_labels == cluster, i]
        ax.boxplot(cluster_data, positions=[cluster], widths=0.6)

    ax.set_title(f'{feature_name} Distribution')
    ax.set_xlabel('Cluster')
    ax.set_ylabel('Normalized Value')
    ax.set_xticks(range(best_n_clusters))

box_fig.tight_layout()
plt.show()

# PCA visualization: project the feature matrix onto its first two principal
# components and colour each sensor by its cluster label.
pca = PCA(n_components=2)
p_values_pca = pca.fit_transform(p_values)

pca_fig, pca_ax = plt.subplots(figsize=(10, 8))
scatter = pca_ax.scatter(
    p_values_pca[:, 0],
    p_values_pca[:, 1],
    c=cluster_labels,
    cmap='viridis'
)
pca_fig.colorbar(scatter, ax=pca_ax)
pca_ax.set_title('P Sensor Characteristics in PCA Space')
pca_ax.set_xlabel('First Principal Component')
pca_ax.set_ylabel('Second Principal Component')

pca_fig.tight_layout()
plt.show()

# Print sensors in each cluster
print("\nSensors in Each Cluster:")
for cluster in range(best_n_clusters):
    cluster_sensors = [
        sensor_id
        for sensor_id, label in zip(sensor_ids, cluster_labels)
        if label == cluster
    ]
    print(f"Cluster {cluster}: {cluster_sensors}")