# Project 1: Classification

Team:

- Jesús Valentín Niño Castañeda

- Angel Toshio Tribeño Hurtado

- Rafael Humberto Ramos Huamaní

- Gabriel Vargas Urmeneta

In [1]:
%pip install h5py pyts tsfresh xgboost

Collecting pyts
  Downloading pyts-0.13.0-py3-none-any.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m50.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tsfresh
  Downloading tsfresh-0.21.1-py2.py3-none-any.whl (96 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m96.0/96.0 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xgboost
  Downloading xgboost-3.0.5-py3-none-manylinux_2_28_x86_64.whl (94.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 MB[0m [31m22.5 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn>=1.2.0
  Downloading scikit_learn-1.7.2-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m75.8 MB/s[0m eta [36m0:00:00[0m
Collecting numba>=0.55.2
  Downloading numba-0.62.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
[2K     [90m━━━━━━

In [2]:
import h5py

# Print the top-level dataset names stored in each HDF5 file.
for label, path in (("Train", "train.h5"), ("Test", "test.h5")):
    with h5py.File(path, "r") as f:
        print(f"{label} keys:", list(f.keys()))


Train keys: ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z', 'total_acc_x', 'total_acc_y', 'total_acc_z', 'y']
Test keys: ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z', 'total_acc_x', 'total_acc_y', 'total_acc_z']


In [3]:
import h5py
import numpy as np

# Dataset names read from each HDF5 file: three signal groups, each with x/y/z axes.
# The order here is the order of the stacked channels produced by load_h5_file.
SENSOR_KEYS = [
    f"{group}_{axis}"
    for group in ("body_acc", "body_gyro", "total_acc")
    for axis in ("x", "y", "z")
]

def load_h5_file(path, include_labels=True):
    """Read the sensor datasets (and optionally the labels) from an HDF5 file.

    Parameters
    ----------
    path : str
        Location of the HDF5 file; it is opened read-only.
    include_labels : bool, default True
        When True and the file contains a 'y' dataset, the labels are returned
        as a flattened array. Otherwise the label slot is None.

    Returns
    -------
    (X, y)
        X stacks one array per entry of SENSOR_KEYS along a new last axis
        (the recorded run shows shape (n_samples, n_timestamps, 9)).
        y is the flattened 'y' dataset, or None.
    """
    with h5py.File(path, "r") as h5:
        X = np.stack([np.array(h5[name]) for name in SENSOR_KEYS], axis=-1)
        y = None
        if include_labels and 'y' in h5:
            y = np.array(h5['y']).flatten()
    return X, y

# Load both splits; only the training file is asked for labels.
X_train, y_train = load_h5_file("train.h5")
X_test, _ = load_h5_file("test.h5", include_labels=False)

# Report the array shapes, one per line.
print(
    f"X_train shape: {X_train.shape}",
    f"y_train shape: {y_train.shape}",
    f"X_test shape: {X_test.shape}",
    sep="\n",
)

X_train shape: (7352, 128, 9)
y_train shape: (7352,)
X_test shape: (2947, 128, 9)


## TsFresh

In [4]:
import h5py
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# --- Load HAR data from .h5 ---
# Nine dataset names (3 signal groups x 3 axes), in the order they are stacked as channels.
SENSOR_KEYS = [
    prefix + "_" + axis
    for prefix in ("body_acc", "body_gyro", "total_acc")
    for axis in "xyz"
]

def load_h5_file(path, include_labels=True):
    """Load the SENSOR_KEYS datasets from an HDF5 file as one stacked array.

    Returns a pair (X, y). X has the sensor signals stacked along the last
    axis, one channel per key in SENSOR_KEYS. y is the flattened 'y' dataset
    when `include_labels` is true and the file has one; otherwise it is None.
    """
    with h5py.File(path, "r") as handle:
        channels = []
        for key in SENSOR_KEYS:
            channels.append(np.array(handle[key]))
        X = np.stack(channels, axis=-1)  # (n_samples, n_timestamps, 9)

        # No labels requested, or none stored in this file.
        if not include_labels or 'y' not in handle:
            return X, None
        return X, np.array(handle['y']).flatten()

# Training split is loaded with labels (the default); the test split without them.
X_train, y_train = load_h5_file("train.h5")
X_test, _ = load_h5_file("test.h5", include_labels=False)

# --- Convert to long format for tsfresh ---
def to_long_dataframe(X):
    """Reshape a 3-D signal array into the long ("stacked") format used by tsfresh.

    The frame is built from whole NumPy columns instead of appending one
    Python dict per observation. For the training array in this notebook that
    would otherwise be several million dicts.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_timestamps, n_features)

    Returns
    -------
    pandas.DataFrame
        One row per (sample, feature, timestamp) with columns
        "id" (sample index), "time" (timestamp index), "kind" ("f<feature index>")
        and "value" (X[sample, time, feature]). Rows are ordered by sample,
        then feature, then time.

    Raises
    ------
    ValueError
        If X is not 3-dimensional.
    """
    X = np.asarray(X)
    n_samples, n_timestamps, n_features = X.shape  # ValueError if not 3-D

    rows_per_sample = n_features * n_timestamps

    # Sample index: constant across each sample's block of rows.
    ids = np.repeat(np.arange(n_samples), rows_per_sample)

    # Feature label: constant across each run of timestamps, repeated for every sample.
    kind_labels = np.array([f"f{j}" for j in range(n_features)], dtype=object)
    kinds = np.tile(np.repeat(kind_labels, n_timestamps), n_samples)

    # Timestamp index: cycles fastest.
    times = np.tile(np.arange(n_timestamps), n_samples * n_features)

    # Move time to the last axis so flattening yields sample -> feature -> time order.
    values = X.transpose(0, 2, 1).reshape(-1)

    return pd.DataFrame({"id": ids, "time": times, "kind": kinds, "value": values})

df_train = to_long_dataframe(X_train)
df_test  = to_long_dataframe(X_test)

# --- Feature extraction ---
# Column mapping and job count shared by both calls, so train and test are
# featurized with the same settings.
extract_kwargs = dict(
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    n_jobs=1,
)

features_train = extract_features(
    df_train, default_fc_parameters=MinimalFCParameters(), **extract_kwargs
)
features_test = extract_features(
    df_test, default_fc_parameters=MinimalFCParameters(), **extract_kwargs
)

# --- Save results ---
# y_train is a NumPy array, so labels are attached by row position.
# NOTE(review): this assumes extract_features returns one row per id in
# ascending id order — confirm.
features_train["activity"] = y_train
for frame, csv_path in (
    (features_train, "features_tsfresh_train.csv"),
    (features_test, "features_tsfresh_test.csv"),
):
    frame.to_csv(csv_path, index=False)

print("tsfresh features saved to CSV")

Feature Extraction: 100%|██████████| 66168/66168 [00:48<00:00, 1372.69it/s]
Feature Extraction: 100%|██████████| 26523/26523 [00:19<00:00, 1360.47it/s]
tsfresh features saved to CSV


## PyTS

In [4]:
import h5py
import numpy as np
from pyts.image import RecurrencePlot

# --- Load HAR dataset ---
# Dataset names in channel order: body_acc, body_gyro, total_acc, each with x/y/z.
SENSOR_KEYS = (
    "body_acc_x body_acc_y body_acc_z "
    "body_gyro_x body_gyro_y body_gyro_z "
    "total_acc_x total_acc_y total_acc_z"
).split()

def load_h5_file(path, include_labels=True):
    """Open an HDF5 file read-only and return (X, y).

    X is the stack of the SENSOR_KEYS datasets along a new last axis.
    y is the flattened 'y' dataset if `include_labels` is true and the file
    contains it, else None.
    """
    with h5py.File(path, "r") as src:
        has_labels = include_labels and 'y' in src
        y = np.array(src['y']).flatten() if has_labels else None
        X = np.stack(
            [np.array(src[sensor]) for sensor in SENSOR_KEYS],
            axis=-1,
        )  # (n_samples, 128, 9)
    return X, y

X_train, y_train = load_h5_file("train.h5")
X_test, _ = load_h5_file("test.h5", include_labels=False)

# --- Pick one channel (example: total_acc_x = index 6) ---
# Only this single sensor channel is passed on to the recurrence-plot transform.
channel_idx = 6
X_train_channel = X_train[:, :, channel_idx]
X_test_channel  = X_test[:, :, channel_idx]

# --- Batch processing with RecurrencePlot ---
def pyts_in_batches(X, batch_size=500, filename="output.npz"):
    """Apply the RecurrencePlot transform to X in batches of rows.

    Parameters
    ----------
    X : ndarray
        Input series, sliced along its first axis.
        Presumably (n_samples, n_timestamps) — TODO confirm against pyts.
    batch_size : int, default 500
        Number of rows transformed per call to `fit_transform`.
    filename : str or None, default "output.npz"
        If not None, the result is also written there with
        `np.savez_compressed` under the key "X_rp". Pass None to skip the
        write when the caller saves the array itself.

    Returns
    -------
    ndarray
        The float32 transforms of all batches, stacked along the first axis.

    Raises
    ------
    ValueError
        If `batch_size` is not positive, or if X has no rows (nothing to stack).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    rp = RecurrencePlot(threshold="point", percentage=20)
    n_samples = X.shape[0]

    batches = []
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        print(f"Processing {start}:{end}/{n_samples}")

        # Cast each batch immediately so only float32 copies accumulate (saves memory).
        batches.append(rp.fit_transform(X[start:end]).astype(np.float32))

    X_rp = np.vstack(batches)
    if filename is not None:
        np.savez_compressed(filename, X_rp=X_rp)
    return X_rp

# --- Process and save ---
# filename=None: each archive is written exactly once below, with its final
# keys, instead of being written by the helper and then overwritten.
X_rp_train = pyts_in_batches(X_train_channel, batch_size=500, filename=None)
np.savez_compressed("features_pyts_train.npz", X_rp_train=X_rp_train, y_train=y_train)

X_rp_test = pyts_in_batches(X_test_channel, batch_size=500, filename=None)
np.savez_compressed("features_pyts_test.npz", X_rp_test=X_rp_test)

print("PyTS features extracted and saved")

Processing 0:500/7352
Processing 500:1000/7352
Processing 1000:1500/7352
Processing 1500:2000/7352
Processing 2000:2500/7352
Processing 2500:3000/7352
Processing 3000:3500/7352
Processing 3500:4000/7352
Processing 4000:4500/7352
Processing 4500:5000/7352
Processing 5000:5500/7352
Processing 5500:6000/7352
Processing 6000:6500/7352
Processing 6500:7000/7352
Processing 7000:7352/7352
Processing 0:500/2947
Processing 500:1000/2947
Processing 1000:1500/2947
Processing 1500:2000/2947
Processing 2000:2500/2947
Processing 2500:2947/2947
PyTS features extracted and saved


## Preliminary Testing

In [4]:
import pandas as pd
import numpy as np

# === TsFresh data ===
features_train = pd.read_csv("features_tsfresh_train.csv")
features_test  = pd.read_csv("features_tsfresh_test.csv")

# Separate labels; the stored activity ids are shifted down by one.
# NOTE(review): presumably the ids are 1-based and this makes them 0-based — confirm.
y_train = features_train["activity"].astype(int) - 1
X_train = features_train.drop(columns=["activity"])
X_test  = features_test

# === PyTS data ===
# np.load on an .npz archive returns a file-backed NpzFile. Reading the arrays
# inside a `with` block closes the handle instead of leaking it.
with np.load("features_pyts_train.npz") as data_train:
    X_rp_train = data_train["X_rp_train"]
    y_rp_train = data_train["y_train"].astype(int) - 1

with np.load("features_pyts_test.npz") as data_test:
    X_rp_test = data_test["X_rp_test"]

# Flatten each (h, w) image into one feature row for sklearn classifiers
n_samples, h, w = X_rp_train.shape
X_rp_train = X_rp_train.reshape(n_samples, -1)
X_rp_test  = X_rp_test.reshape(X_rp_test.shape[0], -1)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

print("=== Benchmarking Multiple Models with TsFresh data ===")

# --- Hold out 20% of the labelled rows for validation (fixed seed) ---
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, random_state=42, test_size=0.2
)

# --- Standardized copies for scale-sensitive models; the scaler is fitted on the training split only ---
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

# --- Candidate models, keyed by display name (evaluated in this order) ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Linear SVM": LinearSVC(max_iter=2000),
    "RBF SVM": SVC(kernel="rbf"),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
}

# Models listed here are trained and evaluated on the standardized features;
# all others use the unscaled split.
scaled_models = {"Logistic Regression", "Naive Bayes", "Linear SVM", "RBF SVM", "KNN (k=5)"}

# --- Train and evaluate ---
results = {}
for name, model in models.items():
    print(f"\n>>> {name}")

    if name in scaled_models:
        train_inputs, val_inputs = X_tr_scaled, X_val_scaled
    else:
        train_inputs, val_inputs = X_tr, X_val
    model.fit(train_inputs, y_tr)
    y_pred = model.predict(val_inputs)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")
    cm = confusion_matrix(y_val, y_pred)
    results[name] = {"Accuracy": acc, "F1": f1}

    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)
    # Optional: detailed per-class report
    # print(classification_report(y_val, y_pred))


=== Benchmarking Multiple Models with TsFresh data ===

>>> Logistic Regression
Accuracy: 0.938137321549966
F1 Score: 0.9381939562274736
Confusion Matrix:
 [[236  10   1   0   0   0]
 [  7 191   2   0   0   0]
 [ 10   4 192   0   0   0]
 [  0   0   0 235  27   0]
 [  0   0   0  30 246   0]
 [  0   0   0   0   0 280]]

>>> Decision Tree
Accuracy: 0.9401767505098573
F1 Score: 0.9401592486401132
Confusion Matrix:
 [[226  14   6   0   1   0]
 [ 20 173   7   0   0   0]
 [ 11   8 187   0   0   0]
 [  0   0   0 245  17   0]
 [  0   0   0   4 272   0]
 [  0   0   0   0   0 280]]

>>> Random Forest
Accuracy: 0.9789259007477906
F1 Score: 0.9789219276974194
Confusion Matrix:
 [[244   2   1   0   0   0]
 [  2 196   2   0   0   0]
 [  4   2 200   0   0   0]
 [  0   0   0 253   9   0]
 [  0   0   0   9 267   0]
 [  0   0   0   0   0 280]]

>>> Naive Bayes
Accuracy: 0.8103331067301156
F1 Score: 0.8021684334964048
Confusion Matrix:
 [[208  28  11   0   0   0]
 [ 37 151  12   0   0   0]
 [ 25   7 174  

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

print("=== Benchmarking Multiple Models with PyTS data ===")

# --- 80/20 train/validation split of the flattened PyTS features (fixed seed) ---
X_tr, X_val, y_tr, y_val = train_test_split(
    X_rp_train,
    y_rp_train,
    test_size=0.2,
    random_state=42,
)

# --- Scale data for the sensitive models (LogReg, SVM, KNN, NB) ---
# The scaler is fitted on the training part only and reused on the validation part.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

# --- Define models ---
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Linear SVM": LinearSVC(max_iter=2000),
    "RBF SVM": SVC(kernel="rbf"),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
}

# Display names of the models that are fed the standardized features.
scale_sensitive = ("Logistic Regression", "Naive Bayes", "Linear SVM", "RBF SVM", "KNN (k=5)")

# --- Train and evaluate ---
results = {}
for name, model in models.items():
    print(f"\n>>> {name}")

    # Some models need scaled data; the others get the raw split.
    use_scaled = name in scale_sensitive
    model.fit(X_tr_scaled if use_scaled else X_tr, y_tr)
    y_pred = model.predict(X_val_scaled if use_scaled else X_val)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")
    cm = confusion_matrix(y_val, y_pred)

    results[name] = {"Accuracy": acc, "F1": f1}

    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)
    # Optional: detailed per-class report
    # print(classification_report(y_val, y_pred))


=== Benchmarking Multiple Models with PyTS data ===

>>> Logistic Regression
Accuracy: 0.504418762746431
F1 Score: 0.5018065264523589
Confusion Matrix:
 [[170  29  47   0   1   0]
 [ 37 119  41   0   0   3]
 [ 37  47 104  10   5   3]
 [  2   2   3  75 113  67]
 [  3   1   5  74 129  64]
 [  5   4   1  52  73 145]]

>>> Decision Tree
Accuracy: 0.2610469068660775
F1 Score: 0.26108175277468004
Confusion Matrix:
 [[84 46 42 22 29 24]
 [47 47 41 16 17 32]
 [42 45 44 30 20 25]
 [21 26 29 63 64 59]
 [32 30 22 66 70 56]
 [34 22 39 45 64 76]]

>>> Random Forest
Accuracy: 0.469068660774983
F1 Score: 0.4614348757657078
Confusion Matrix:
 [[175  28  21   5  13   5]
 [ 66 101   7   7   3  16]
 [ 67  32  48  13  28  18]
 [  2   0   0  81 117  62]
 [  0   2   1  88 150  35]
 [  1   0   1  54  89 135]]

>>> Naive Bayes
Accuracy: 0.42895989123045547
F1 Score: 0.42940807607608267
Confusion Matrix:
 [[115  60  64   2   6   0]
 [ 39 100  36   6  17   2]
 [ 43  39  72  12  40   0]
 [  0   0   0  65 162  35

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9cf3d34f-aa4d-4d65-a77f-229ed288b911' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>