In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense
from tqdm import tqdm

# === 1. Load and preprocess full dataset ===
# Read the raw table, keep the edge identifiers aside, and treat every column
# other than the two ID columns as the per-row time series.
df = pd.read_csv("synthetic_trashcan_fill_levels2.csv")
edge_ids = df["edgeID"].values
time_series = df.drop(columns=["edgeID", "trashcanID"])

# Downsample 200 -> 50 points per row: group consecutive columns in fours and
# average each group. The reshape raises if a row does not hold exactly
# 50 * 4 values.
reshaped = time_series.to_numpy().reshape(len(df), 50, 4)
downsampled = np.mean(reshaped, axis=-1)

# Standardize the downsampled matrix before clustering.
scaler = StandardScaler()
normalized = scaler.fit(downsampled).transform(downsampled)

# === 2. Clustering ===
# Partition the rows into 4 groups with a fixed seed and store each row's
# cluster label back on the dataframe.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
clusters = kmeans.fit(normalized).labels_
df["cluster"] = clusters

# === 3. Sequence prep function ===
def create_sequences(series, window_size=10):
    """Split a 1-D series into sliding windows and their next-step targets.

    For every start index ``s`` with ``s + window_size < len(series)``, the
    window is ``series[s:s + window_size]`` and its target is the element
    right after it, ``series[s + window_size]``.

    Args:
        series: Indexable, sliceable sequence of values (list or 1-D array).
        window_size: Number of consecutive values in each input window.

    Returns:
        A ``(windows, targets)`` tuple of NumPy arrays. Both arrays are empty
        when ``len(series) <= window_size``.
    """
    starts = range(len(series) - window_size)
    windows = [series[s:s + window_size] for s in starts]
    targets = [series[s + window_size] for s in starts]
    return np.array(windows), np.array(targets)

# === 4. Train & Predict for each cluster ===
# For every cluster: normalize each trashcan's series, train one LSTM on
# sliding windows pooled from all trashcans in the cluster, forecast the next
# fill level of each trashcan, and store a selection code per edge:
#   2 -> the latest observed fill level is already >= FILL_THRESHOLD
#   1 -> the forecast next fill level is >= FILL_THRESHOLD
#   0 -> neither
WINDOW_SIZE = 10      # number of past time steps fed to the LSTM
FILL_THRESHOLD = 0.8  # fill level at/above which a trashcan is selected

selection_dict = {}
# Iterate over the cluster labels actually present instead of a hard-coded
# count, so this loop stays in sync with whatever clustering produced.
for cluster_id in sorted(df["cluster"].unique()):
    print(f"Processing Cluster {cluster_id}...")

    # Filter data for cluster
    cluster_mask = df["cluster"] == cluster_id
    cluster_series = time_series[cluster_mask].to_numpy(dtype=float)
    cluster_edge_ids = df["edgeID"][cluster_mask].values

    # Normalize per trashcan (row-wise). StandardScaler standardizes each
    # column, i.e. each day across trashcans, which distorts every individual
    # series and leaves no consistent statistics for un-normalizing a forecast
    # of a day that is not a column. Row statistics avoid both problems.
    row_mean = cluster_series.mean(axis=1, keepdims=True)
    row_std = cluster_series.std(axis=1, keepdims=True)
    row_std[row_std == 0] = 1.0  # constant series: avoid division by zero
    norm_cluster = (cluster_series - row_mean) / row_std

    # LSTM inputs: sliding windows per trashcan, pooled over the cluster
    X_all, y_all = [], []
    for series in norm_cluster:
        X, y = create_sequences(series, window_size=WINDOW_SIZE)
        X_all.append(X)
        y_all.append(y)
    X_all = np.concatenate(X_all).reshape(-1, WINDOW_SIZE, 1)
    y_all = np.concatenate(y_all)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.2, random_state=42
    )

    # LSTM model
    model = Sequential([
        LSTM(64, input_shape=(X_train.shape[1], 1)),
        Dense(1)
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(X_train, y_train, epochs=10, batch_size=32, verbose=0)

    # Report the error on the held-out windows so the split serves a purpose.
    # Windows overlap, so this is an optimistic estimate.
    test_mse = model.evaluate(X_test, y_test, verbose=0)
    print(f"Cluster {cluster_id} held-out MSE (normalized units): {test_mse:.4f}")

    # === Predict for each trashcan ===
    # One batched call over the last window of every trashcan; calling
    # predict once per row is dominated by per-call overhead.
    last_windows = norm_cluster[:, -WINDOW_SIZE:].reshape(-1, WINDOW_SIZE, 1)
    predicted_norm = model.predict(last_windows, verbose=0)[:, 0]

    # Undo the per-trashcan normalization for the forecast. The latest actual
    # value needs no inverse transform: it is the raw last column.
    predicted_real = predicted_norm * row_std[:, 0] + row_mean[:, 0]
    actual_real = cluster_series[:, -1]

    # Threshold-based decision
    selections = np.where(
        actual_real >= FILL_THRESHOLD,
        2,
        np.where(predicted_real >= FILL_THRESHOLD, 1, 0),
    )

    for edge_id, selection in zip(cluster_edge_ids, selections):
        # Keep the highest code seen for an edge so that a later trashcan with
        # a lower code cannot overwrite it. Identical to plain assignment when
        # edge IDs are unique. NOTE(review): confirm whether edgeID can repeat
        # across trashcans.
        selection_dict[edge_id] = max(selection_dict.get(edge_id, 0), int(selection))


Processing Cluster 0...


  super().__init__(**kwargs)
Cluster 0: 100%|██████████| 875/875 [03:00<00:00,  4.85it/s]


Processing Cluster 1...


  super().__init__(**kwargs)
Cluster 1: 100%|██████████| 854/854 [01:35<00:00,  8.99it/s]


Processing Cluster 2...


  super().__init__(**kwargs)
