In [None]:
import zipfile, os

# Name of the uploaded archive; it is opened relative to the working directory.
ZIP_NAME = "Dataset_V2.zip"  # must match the uploaded filename

# Unpack every archive member under /content.
with zipfile.ZipFile(ZIP_NAME, mode="r") as z:
    z.extractall(path="/content")

# Show only the entries whose name mentions the dataset, as a quick sanity check.
print("Extracted folders in /content:")
print([
    entry
    for entry in os.listdir("/content")
    if any(tag in entry for tag in ("Dataset", "dataset"))
])

Extracted folders in /content:
['Dataset_V2', 'Dataset_V2.zip']


In [None]:
import os, glob, json
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils.class_weight import compute_class_weight

# =======================
# CONFIG
# =======================
# Relative path of the dataset folder handed to load_dataset_with_days().
DATASET_ROOT = "Dataset_V2"

# Number of time steps every sample is cropped/padded to (also the model input length).
T = 512
# Seed applied to the TensorFlow and NumPy global random generators below.
SEED = 42
# Upper bound on training epochs passed to model.fit().
EPOCHS = 60
# Mini-batch size passed to model.fit().
BATCH = 32
# Initial learning rate given to the Adam optimizer in build_cnn_lstm().
LR = 1e-3

# Seed both global generators up front so reruns start from the same random state.
tf.random.set_seed(SEED)
np.random.seed(SEED)

# =======================
# Robust reader
# =======================
def load_csv_robust(fp, expected_cols=10):
    """Read a comma-separated numeric text file, keeping only clean rows.

    The file is read as bytes, NUL bytes are removed, and the remainder is
    decoded as UTF-8 with undecodable bytes ignored. A line is kept only when,
    after stripping whitespace and trailing commas, it splits into exactly
    ``expected_cols`` non-empty fields that all parse as finite numbers
    (rows containing ``nan``/``inf`` or values that overflow float32 are
    dropped, since they would corrupt later statistics).

    Args:
        fp: Path of the file to read.
        expected_cols: Required number of comma-separated fields per row.

    Returns:
        ``np.ndarray`` of dtype float32 with shape ``(n_rows, expected_cols)``.

    Raises:
        ValueError: If the file contains no valid numeric row.
        OSError: If the file cannot be opened or read.
    """
    with open(fp, "rb") as f:
        raw = f.read().replace(b"\x00", b"")
    text = raw.decode("utf-8", errors="ignore")

    good_rows = []
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        # Tolerate one or more dangling commas at the end of a record.
        while line.endswith(","):
            line = line[:-1].strip()
        parts = [p.strip() for p in line.split(",")]
        if len(parts) != expected_cols:
            continue
        if any(p == "" for p in parts):
            continue
        try:
            good_rows.append([float(p) for p in parts])
        except ValueError:
            # Non-numeric token: skip this line only. Anything else
            # (e.g. KeyboardInterrupt) is allowed to propagate.
            continue

    if good_rows:
        # The cast may overflow to inf for absurdly large values; those rows
        # are removed by the finiteness filter right after.
        with np.errstate(over="ignore"):
            data = np.array(good_rows, dtype=np.float32)
        data = data[np.isfinite(data).all(axis=1)]
        if data.shape[0] > 0:
            return data

    raise ValueError(f"No valid numeric rows in {fp}")

# =======================
# Cropping (Option 2)
# =======================
def moving_average(x, w=25):
    """Smooth a 1-D signal with a centered box filter of width ``w``.

    The window is clamped to ``[1, len(x)]``. Without the upper clamp,
    ``np.convolve(..., mode="same")`` returns ``max(len(x), w)`` samples, so a
    signal shorter than the window would come back longer than the input and
    its indices would no longer line up with the original samples.

    Args:
        x: 1-D array-like signal.
        w: Requested window width in samples (converted with ``int``).

    Returns:
        Array with the same length as ``x``. Edges are averaged against
        implicit zeros, as done by ``mode="same"``. An empty input yields an
        empty array instead of raising.
    """
    x = np.asarray(x)
    n = x.size
    if n == 0:
        # np.convolve rejects empty inputs; there is nothing to smooth anyway.
        return x.astype(np.result_type(x.dtype, np.float32))
    w = max(1, min(int(w), n))
    kernel = np.ones(w, dtype=np.float32) / w
    return np.convolve(x, kernel, mode="same")

def fix_length_center(X, target_len):
    """Force a 2-D sample to exactly ``target_len`` rows.

    Longer inputs are reduced to their central ``target_len`` rows; shorter
    inputs are extended with zero rows appended at the end.

    Args:
        X: Array of shape ``(n_rows, n_features)``.
        target_len: Number of rows in the result.

    Returns:
        Array of shape ``(target_len, n_features)`` with the dtype of ``X``.
    """
    n_rows = len(X)
    deficit = target_len - n_rows

    if deficit > 0:
        # Too short: append zero rows after the data.
        filler = np.zeros((deficit, X.shape[1]), dtype=X.dtype)
        return np.vstack([X, filler])

    # Long enough: keep the centered window.
    offset = (n_rows - target_len) // 2
    return X[offset:offset + target_len]

def emg_dc_remove(X):
    """Return a copy of ``X`` whose first three columns are mean-centered.

    Each of the first three columns has its own mean (taken over rows)
    subtracted; remaining columns are copied unchanged. The input array is
    not modified.
    """
    centered = X.copy()
    leading = centered[:, :3]  # view into the copy, so the update lands in `centered`
    leading -= leading.mean(axis=0, keepdims=True)
    return centered

def crop_active_region_emg(X, target_len=512, smooth_w=25, thresh_ratio=0.25):
    """Cut a ``target_len``-row window centered on the active part of a sample.

    Activity is measured as the sum of absolute values of the first three
    columns, smoothed with ``moving_average``. Rows whose smoothed activity is
    at least ``thresh_ratio`` times the peak are considered active; the window
    is centered midway between the first and last active row and shifted to
    stay inside the recording. Results shorter than ``target_len`` are padded
    with trailing zero rows.

    Falls back to ``fix_length_center`` when the peak activity is at most 1e-6
    or fewer than five rows are active. An input with zero rows yields an
    all-zero float32 array.

    Args:
        X: Array of shape ``(n_rows, n_features)``.
        target_len: Number of rows in the result.
        smooth_w: Window width handed to ``moving_average``.
        thresh_ratio: Fraction of the peak activity used as the threshold.

    Returns:
        Array of shape ``(target_len, n_features)``.
    """
    n_rows = X.shape[0]
    if n_rows == 0:
        return np.zeros((target_len, X.shape[1]), dtype=np.float32)

    # Smoothed rectified activity of the first three columns.
    envelope = moving_average(np.abs(X[:, :3]).sum(axis=1), w=smooth_w)

    peak = float(np.max(envelope))
    if peak <= 1e-6:
        return fix_length_center(X, target_len)

    active_idx = np.flatnonzero(envelope >= thresh_ratio * peak)
    if active_idx.size < 5:
        return fix_length_center(X, target_len)

    # Center of the span between the first and last active row.
    midpoint = (int(active_idx[0]) + int(active_idx[-1])) // 2

    lo = max(0, midpoint - target_len // 2)
    hi = lo + target_len
    if hi > n_rows:
        # Slide the window back so it ends at the last recorded row.
        hi = n_rows
        lo = max(0, hi - target_len)

    window = X[lo:hi]
    missing = target_len - window.shape[0]
    if missing > 0:
        tail = np.zeros((missing, X.shape[1]), dtype=window.dtype)
        window = np.vstack([window, tail])
    return window

# =======================
# Dataset loading with DAY metadata (Option A: include IDLE)
# =======================
def build_label_map(root):
    """Map each class sub-directory of ``root`` to an integer id.

    Directory names are sorted alphabetically and numbered from 0, except
    that a directory called ``"idle"`` (if present) is always given id 0 and
    the remaining names follow in sorted order.

    Args:
        root: Directory whose immediate sub-directories are the class names.

    Returns:
        Dict ``{class_name: class_id}`` in id order.
    """
    class_names = sorted(
        entry
        for entry in os.listdir(root)
        if os.path.isdir(os.path.join(root, entry))
    )

    # Pin "idle" to the front while keeping the others in sorted order.
    idle_first = [name for name in class_names if name == "idle"]
    others = [name for name in class_names if name != "idle"]
    ordered = idle_first + others

    return dict(zip(ordered, range(len(ordered))))

def load_one_sample(path, is_idle=False):
    """Load one recording and turn it into a fixed-length float32 sample.

    The file is parsed with ``load_csv_robust`` expecting 10 columns; the
    first column is discarded and the rest is passed through
    ``emg_dc_remove``. Idle recordings are then center-cropped/padded to ``T``
    rows, while all other recordings are cropped around their active region.

    Args:
        path: File to load.
        is_idle: When true, skip activity-based cropping and use
            ``fix_length_center`` instead.

    Returns:
        float32 array with ``T`` rows and one column per retained channel.
    """
    table = load_csv_robust(path, expected_cols=10)    # (Traw, 10)
    signals = emg_dc_remove(table[:, 1:])              # first column dropped -> (Traw, 9)

    # Idle samples must not be "active-cropped": take the plain centered window.
    window = (
        fix_length_center(signals, T)
        if is_idle
        else crop_active_region_emg(signals, target_len=T, smooth_w=25, thresh_ratio=0.25)
    )
    return window.astype(np.float32)

def load_dataset_with_days(root):
    """Load every ``*.txt`` recording under ``root`` together with its day tag.

    Class names come from ``build_label_map``. For each class directory all
    ``*.txt`` files are collected recursively; the name of a file's parent
    folder is recorded as its day. Files that fail to load are skipped on a
    best-effort basis, but each skip is reported with its reason rather than
    being swallowed silently.

    Args:
        root: Dataset directory containing one sub-directory per class.

    Returns:
        Tuple ``(X_all, y_all, days, label2id)`` where ``X_all`` stacks the
        loaded samples along a new first axis, ``y_all`` holds int64 class
        ids, ``days`` holds the parent-folder name of each file, and
        ``label2id`` is the class-name-to-id mapping.

    Raises:
        ValueError: If not a single sample could be loaded.
    """
    label2id = build_label_map(root)
    X_list, y_list, day_list = [], [], []
    skipped = 0

    for label, lab_id in label2id.items():
        class_dir = os.path.join(root, label)
        files = sorted(glob.glob(os.path.join(class_dir, "**", "*.txt"), recursive=True))

        for fp in files:
            day_name = os.path.basename(os.path.dirname(fp))  # parent folder: Day_1, Day_2...
            try:
                X = load_one_sample(fp, is_idle=(label == "idle"))
            except Exception as exc:
                # Best effort: one unreadable file must not abort the load.
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit through.
                skipped += 1
                print("[SKIP]", fp, "->", repr(exc))
                continue
            X_list.append(X)
            y_list.append(lab_id)
            day_list.append(day_name)

    if not X_list:
        # np.stack on an empty list fails with an unhelpful message.
        raise ValueError(
            f"No samples could be loaded from {root!r} "
            f"(classes: {len(label2id)}, skipped: {skipped})"
        )

    X_all = np.stack(X_list, axis=0)  # (N, T, 9)
    y_all = np.array(y_list, dtype=np.int64)
    days  = np.array(day_list)

    print("Loaded:", X_all.shape, "classes:", len(label2id), "skipped:", skipped)
    print("Label map:", label2id)
    print("Days present:", sorted(set(days.tolist())))
    return X_all, y_all, days, label2id

def normalize_train_only(X_train, X_val, X_test):
    """Standardize all three splits with statistics fitted on the training split.

    Each ``(N, T, F)`` array is flattened to ``(N*T, F)`` rows so the scaler
    works per feature column, then reshaped back. Only ``X_train`` is used to
    fit the scaler, so validation and test data do not influence the
    statistics.

    Args:
        X_train: Training samples, shape ``(N, T, F)``.
        X_val: Validation samples with the same ``T`` and ``F``.
        X_test: Test samples with the same ``T`` and ``F``.

    Returns:
        Tuple ``(X_train, X_val, X_test, scaler)`` with the scaled arrays and
        the fitted ``StandardScaler``.
    """
    _, TT, F = X_train.shape
    scaler = StandardScaler()
    scaler.fit(X_train.reshape(-1, F))

    def _scale(split):
        # Flatten time into rows, scale per feature, restore (n, T, F).
        flat = scaler.transform(split.reshape(-1, F))
        return flat.reshape(split.shape[0], TT, F)

    return _scale(X_train), _scale(X_val), _scale(X_test), scaler

# =======================
# Model
# =======================
def build_cnn_lstm(T, F, num_classes):
    """Build and compile the Conv1D + LSTM sequence classifier.

    Architecture, in order: two convolutional stages (Conv1D with "same"
    padding -> BatchNormalization -> ReLU -> MaxPool1D(2)) with 64 filters of
    width 5 and 128 filters of width 3, Dropout(0.3), LSTM(128), Dropout(0.4),
    Dense(128, relu), Dropout(0.3), and a softmax output layer.

    Args:
        T: Number of time steps per input sample.
        F: Number of features per time step.
        num_classes: Size of the softmax output.

    Returns:
        A Keras model compiled with Adam (learning rate ``LR``), sparse
        categorical cross-entropy loss and the accuracy metric.
    """
    inp = layers.Input(shape=(T, F))

    # Convolutional front end; each stage halves the time axis.
    x = inp
    for n_filters, kernel_size in ((64, 5), (128, 3)):
        x = layers.Conv1D(n_filters, kernel_size, padding="same")(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        x = layers.MaxPool1D(2)(x)

    # Recurrent summary of the downsampled sequence.
    x = layers.Dropout(0.3)(x)
    x = layers.LSTM(128)(x)
    x = layers.Dropout(0.4)(x)

    # Classification head.
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(num_classes, activation="softmax")(x)

    model = models.Model(inp, out)
    optimizer = tf.keras.optimizers.Adam(LR)
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

def evaluate(model, X_test, y_test, id2label):
    """Print a per-class classification report and the confusion matrix.

    The full list of class ids ``0 .. len(id2label) - 1`` is passed explicitly
    to scikit-learn. Without it, ``classification_report`` raises when the
    test labels/predictions do not contain every class (the number of
    ``target_names`` no longer matches), and the confusion matrix silently
    changes shape.

    Args:
        model: Trained model whose ``predict`` returns per-class scores.
        X_test: Test inputs.
        y_test: Integer class ids for ``X_test``.
        id2label: Dict mapping each class id ``0 .. n-1`` to its name.
    """
    class_ids = list(range(len(id2label)))
    pred = model.predict(X_test, verbose=0).argmax(axis=1)

    print("\nClassification report:")
    print(classification_report(
        y_test, pred,
        labels=class_ids,
        target_names=[id2label[i] for i in class_ids],
        digits=4,
        zero_division=0,  # classes that are absent or never predicted score 0 without warnings
    ))
    cm = confusion_matrix(y_test, pred, labels=class_ids)
    print("\nConfusion matrix (rows=true, cols=pred):")
    print(cm)

# =======================
# RUN: Day-wise split (same as yours)
# =======================
# Load every sample together with the day folder it came from.
X, y, days, label2id = load_dataset_with_days(DATASET_ROOT)
id2label = {v:k for k,v in label2id.items()}

# Day-wise split: whole recording days are held out, so no day contributes
# to more than one split.
TRAIN_DAYS = {f"Day_{i}" for i in range(1, 9)}   # Day_1..Day_8
VAL_DAYS   = {"Day_9"}
TEST_DAYS  = {"Day_10"}

train_mask = np.isin(days, list(TRAIN_DAYS))
val_mask   = np.isin(days, list(VAL_DAYS))
test_mask  = np.isin(days, list(TEST_DAYS))

print("Train samples:", train_mask.sum())
print("Val samples:", val_mask.sum())
print("Test samples:", test_mask.sum())

# Fail fast with a readable message if a split is empty (e.g. a day folder is
# missing or named differently) instead of failing later inside the scaler or
# the training loop.
empty_splits = [
    split_name
    for split_name, split_mask in (("train", train_mask), ("val", val_mask), ("test", test_mask))
    if not split_mask.any()
]
if empty_splits:
    raise ValueError(
        f"No samples found for split(s) {empty_splits}; "
        f"days present: {sorted(set(days.tolist()))}"
    )

X_train, y_train = X[train_mask], y[train_mask]
X_val,   y_val   = X[val_mask],   y[val_mask]
X_test,  y_test  = X[test_mask],  y[test_mask]

# Normalize with statistics fitted on the training split only.
X_train, X_val, X_test, scaler = normalize_train_only(X_train, X_val, X_test)

# ✅ Recommended: class weights (prevents “always idle” collapse)
# "balanced" weights are computed from the training labels only and keyed by
# integer class id, which is the format model.fit() expects.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, weights)}
print("Class weights:", class_weight)

# Input length is T, the feature count comes from the data, and there is one
# output unit per entry in the label map.
model = build_cnn_lstm(T, X_train.shape[-1], len(label2id))
model.summary()

# Stop after 10 epochs without val_accuracy improvement (restoring the best
# weights), and halve the learning rate after 5 epochs without val_loss
# improvement, down to a floor of 1e-6.
callbacks = [
    tf.keras.callbacks.EarlyStopping(monitor="val_accuracy", patience=10, restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-6),
]

# Train on the training days and validate on the held-out validation day.
model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH,
    shuffle=True,
    callbacks=callbacks,
    class_weight=class_weight,
    verbose=1
)

# Final score on the held-out test day; only the accuracy is reported here.
loss, acc = model.evaluate(X_test, y_test, verbose=0)
print("✅ Test accuracy (Day_1–8 → Day_10):", acc)

# Per-class report and confusion matrix on the same test split.
evaluate(model, X_test, y_test, id2label)

# =======================
# SAVE ARTIFACTS (Updated names)
# =======================
# 1) H5 (optional)
model.save("cnn_lstm_with_idle.h5", save_format="h5")
print("Saved cnn_lstm_with_idle.h5")

# 2) Label map
with open("label_map.json", "w") as f:
    json.dump(label2id, f, indent=2)

# 3) Scaler params
scaler_params = {
    "mean": scaler.mean_.tolist(),
    "scale": scaler.scale_.tolist()
}
with open("scaler_params.json", "w") as f:
    json.dump(scaler_params, f, indent=2)

print("Saved label_map.json and scaler_params.json")

# 4) SavedModel (recommended for realtime)
SAVE_DIR = "/content/cnn_lstm_with_idle_savedmodel"
model.export(SAVE_DIR)
print("Saved SavedModel to", SAVE_DIR)

# Zip it for download
!zip -r cnn_lstm_with_idle_savedmodel.zip /content/cnn_lstm_with_idle_savedmodel


Loaded: (1100, 512, 9) classes: 11 skipped: 0
Label map: {'idle': 0, 'ada': 1, 'awidinawa': 2, 'boru': 3, 'hawasa': 4, 'hodai': 5, 'irida': 6, 'narakai': 7, 'pata': 8, 'saduda': 9, 'udasana': 10}
Days present: ['Day_1', 'Day_10', 'Day_2', 'Day_3', 'Day_4', 'Day_5', 'Day_6', 'Day_7', 'Day_8', 'Day_9']
Train samples: 881
Val samples: 110
Test samples: 109
Class weights: {0: 1.0011363636363637, 1: 1.0011363636363637, 2: 1.0011363636363637, 3: 0.9887766554433222, 4: 1.0011363636363637, 5: 1.0011363636363637, 6: 1.0011363636363637, 7: 1.0011363636363637, 8: 1.0011363636363637, 9: 1.0011363636363637, 10: 1.0011363636363637}


Epoch 1/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 308ms/step - accuracy: 0.1558 - loss: 2.3717 - val_accuracy: 0.2182 - val_loss: 2.2220 - learning_rate: 0.0010
Epoch 2/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 332ms/step - accuracy: 0.2863 - loss: 1.9798 - val_accuracy: 0.2545 - val_loss: 1.9526 - learning_rate: 0.0010
Epoch 3/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 328ms/step - accuracy: 0.4129 - loss: 1.6992 - val_accuracy: 0.4818 - val_loss: 1.5477 - learning_rate: 0.0010
Epoch 4/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 279ms/step - accuracy: 0.4667 - loss: 1.4167 - val_accuracy: 0.3091 - val_loss: 1.8101 - learning_rate: 0.0010
Epoch 5/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 342ms/step - accuracy: 0.4829 - loss: 1.3382 - val_accuracy: 0.5545 - val_loss: 1.0685 - learning_rate: 0.0010
Epoch 6/60
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m




Classification report:
              precision    recall  f1-score   support

        idle     0.7692    1.0000    0.8696        10
         ada     1.0000    0.4000    0.5714        10
   awidinawa     0.6154    0.8000    0.6957        10
        boru     1.0000    1.0000    1.0000        10
      hawasa     0.9000    0.9000    0.9000        10
       hodai     1.0000    1.0000    1.0000        10
       irida     1.0000    0.8889    0.9412         9
     narakai     0.9091    1.0000    0.9524        10
        pata     0.9091    1.0000    0.9524        10
      saduda     0.9000    0.9000    0.9000        10
     udasana     1.0000    0.9000    0.9474        10

    accuracy                         0.8899       109
   macro avg     0.9093    0.8899    0.8845       109
weighted avg     0.9085    0.8899    0.8840       109


Confusion matrix (rows=true, cols=pred):
[[10  0  0  0  0  0  0  0  0  0  0]
 [ 2  4  4  0  0  0  0  0  0  0  0]
 [ 1  0  8  0  0  0  0  1  0  0  0]
 [ 0  0  0 10