# Project 1: Classification

Team:

- Jesús Valentín Niño Castañeda

- Angel Toshio Tribeño Hurtado

- Rafael Humberto Ramos Huamaní

- Gabriel Vargas Urmeneta

In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are not pinned here; pin them if results must be reproducible.
%pip install h5py pyts tsfresh xgboost

In [3]:
import h5py

# List the top-level dataset names stored in each HDF5 file.
for message, h5_path in (("Train keys:", "train.h5"), ("Test keys:", "test.h5")):
    with h5py.File(h5_path, "r") as h5:
        print(message, list(h5.keys()))


Train keys: ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z', 'total_acc_x', 'total_acc_y', 'total_acc_z', 'y']
Test keys: ['body_acc_x', 'body_acc_y', 'body_acc_z', 'body_gyro_x', 'body_gyro_y', 'body_gyro_z', 'total_acc_x', 'total_acc_y', 'total_acc_z']


In [4]:
import h5py
import numpy as np

# Dataset names of the nine sensor channels, in the order they are stacked.
SENSOR_KEYS = [
    # body acceleration
    'body_acc_x',
    'body_acc_y',
    'body_acc_z',
    # body gyroscope
    'body_gyro_x',
    'body_gyro_y',
    'body_gyro_z',
    # total acceleration
    'total_acc_x',
    'total_acc_y',
    'total_acc_z',
]

def load_h5_file(path, include_labels=True):
    """Read the sensor channels (and optionally the labels) from an HDF5 file.

    Parameters
    ----------
    path : str
        Location of the HDF5 file.
    include_labels : bool
        When true and the file has a ``'y'`` dataset, the labels are returned
        as a flattened array; otherwise ``None`` is returned in their place.

    Returns
    -------
    (X, y)
        ``X`` stacks one array per entry of ``SENSOR_KEYS`` along a new last
        axis, i.e. shape (n_samples, n_timestamps, 9); ``y`` is the flattened
        label array or ``None``.
    """
    with h5py.File(path, "r") as h5:
        channels = []
        for key in SENSOR_KEYS:
            channels.append(np.array(h5[key]))
        # The sensor channels become the last axis.
        X = np.stack(channels, axis=-1)

        y = None
        if include_labels and 'y' in h5:
            y = np.array(h5['y']).flatten()
    return X, y

# Read both splits; labels are only requested for the training file.
X_train, y_train = load_h5_file("train.h5", include_labels=True)
X_test, _ = load_h5_file("test.h5", include_labels=False)

# Print the array shapes as a sanity check.
for caption, array in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_test shape:", X_test),
):
    print(caption, array.shape)

X_train shape: (7352, 128, 9)
y_train shape: (7352,)
X_test shape: (2947, 128, 9)


## TsFresh

In [5]:
import h5py
import numpy as np
import pandas as pd
from tsfresh import extract_features
from tsfresh.feature_extraction import MinimalFCParameters

# Dataset names of the nine sensor channels, in the order they are stacked.
# (Same list as in the earlier loading cell.)
SENSOR_KEYS = [
    # body acceleration
    'body_acc_x',
    'body_acc_y',
    'body_acc_z',
    # body gyroscope
    'body_gyro_x',
    'body_gyro_y',
    'body_gyro_z',
    # total acceleration
    'total_acc_x',
    'total_acc_y',
    'total_acc_z',
]

def load_h5_file(path, include_labels=True):
    """Read the sensor channels (and optionally the labels) from an HDF5 file.

    Same behaviour as the ``load_h5_file`` defined in the earlier loading
    cell; it is redefined here by this cell.

    Returns ``(X, y)``: ``X`` stacks one array per entry of ``SENSOR_KEYS``
    along a new last axis; ``y`` is the flattened ``'y'`` dataset when
    ``include_labels`` is true and the file contains it, else ``None``.
    """
    with h5py.File(path, "r") as h5:
        channels = []
        for key in SENSOR_KEYS:
            channels.append(np.array(h5[key]))
        X = np.stack(channels, axis=-1)

        y = None
        if include_labels and 'y' in h5:
            y = np.array(h5['y']).flatten()
    return X, y

# Reload both splits (same calls as in the earlier loading cell); labels are
# only requested for the training file.
X_train, y_train = load_h5_file("train.h5", include_labels=True)
X_test, _ = load_h5_file("test.h5", include_labels=False)

def to_long_dataframe(X):
    """Flatten a 3-D signal array into a long-format table.

    The result is the stacked layout passed to ``extract_features`` with
    ``column_id="id"``, ``column_sort="time"``, ``column_kind="kind"`` and
    ``column_value="value"``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_timestamps, n_features)

    Returns
    -------
    pandas.DataFrame
        One row per (sample, feature, timestamp) with columns ``id`` (sample
        index), ``time`` (timestamp index), ``kind`` (``"f0"``, ``"f1"``, ...)
        and ``value``. Rows are ordered by sample, then feature, then
        timestamp.

    Raises
    ------
    ValueError
        If ``X`` is not three-dimensional.
    """
    X = np.asarray(X)
    n_samples, n_timestamps, n_features = X.shape  # fails unless X is 3-D

    # Built column-wise with NumPy instead of appending one dict per value:
    # the row order (sample -> feature -> timestamp) is unchanged.
    kinds = np.array([f"f{feature}" for feature in range(n_features)], dtype=object)
    return pd.DataFrame({
        "id": np.repeat(np.arange(n_samples, dtype=np.int64), n_features * n_timestamps),
        "time": np.tile(np.arange(n_timestamps, dtype=np.int64), n_samples * n_features),
        "kind": np.tile(np.repeat(kinds, n_timestamps), n_samples),
        # (sample, time, feature) -> (sample, feature, time) so that time varies fastest.
        "value": X.transpose(0, 2, 1).reshape(-1),
    })

# Long-format tables for both splits.
df_train = to_long_dataframe(X_train)
df_test  = to_long_dataframe(X_test)

# Column mapping and parallelism shared by both extraction calls, so the two
# splits are processed with identical settings.
tsfresh_columns = dict(
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    n_jobs=1,
)

features_train = extract_features(
    df_train, default_fc_parameters=MinimalFCParameters(), **tsfresh_columns
)
features_test = extract_features(
    df_test, default_fc_parameters=MinimalFCParameters(), **tsfresh_columns
)

# Labels are attached by position.
# NOTE(review): assumes extract_features returns rows in sample-id order - confirm.
features_train["activity"] = y_train

for frame, csv_path in (
    (features_train, "features_tsfresh_train.csv"),
    (features_test, "features_tsfresh_test.csv"),
):
    frame.to_csv(csv_path, index=False)

print("tsfresh features saved to CSV")

Feature Extraction: 100%|██████████| 66168/66168 [00:24<00:00, 2697.92it/s]
Feature Extraction: 100%|██████████| 26523/26523 [00:10<00:00, 2607.99it/s]


tsfresh features saved to CSV


## PyTS

In [6]:
import h5py
import numpy as np
from pyts.image import RecurrencePlot

# Dataset names of the nine sensor channels, in the order they are stacked.
# (Same list as in the earlier loading cell.)
SENSOR_KEYS = [
    # body acceleration
    'body_acc_x',
    'body_acc_y',
    'body_acc_z',
    # body gyroscope
    'body_gyro_x',
    'body_gyro_y',
    'body_gyro_z',
    # total acceleration
    'total_acc_x',
    'total_acc_y',
    'total_acc_z',
]

def load_h5_file(path, include_labels=True):
    """Read the sensor channels (and optionally the labels) from an HDF5 file.

    Same behaviour as the ``load_h5_file`` defined in the earlier loading
    cell; it is redefined here by this cell.

    Returns ``(X, y)``: ``X`` stacks one array per entry of ``SENSOR_KEYS``
    along a new last axis, giving (n_samples, 128, 9) for the files used in
    this notebook; ``y`` is the flattened ``'y'`` dataset when
    ``include_labels`` is true and the file contains it, else ``None``.
    """
    with h5py.File(path, "r") as h5:
        channels = []
        for key in SENSOR_KEYS:
            channels.append(np.array(h5[key]))
        X = np.stack(channels, axis=-1)

        y = None
        if include_labels and 'y' in h5:
            y = np.array(h5['y']).flatten()
    return X, y

# Reload both splits (same calls as in the earlier loading cell).
X_train, y_train = load_h5_file("train.h5", include_labels=True)
X_test, _ = load_h5_file("test.h5", include_labels=False)

# Keep a single channel for the recurrence plots: index 6 along the last axis
# corresponds to SENSOR_KEYS[6], i.e. 'total_acc_x'.
X_train_channel = X_train[:, :, 6]
X_test_channel  = X_test[:, :, 6]

def pyts_in_batches(X, batch_size=500, filename="output.npz"):
    """Transform series into recurrence-plot images, one batch at a time.

    Parameters
    ----------
    X : array of shape (n_samples, n_timestamps)
        One univariate series per row.
    batch_size : int
        Number of series passed to each ``RecurrencePlot.fit_transform`` call;
        must be at least 1.
    filename : str or None
        Destination for ``np.savez_compressed`` (array stored under the key
        ``"X_rp"``). Pass ``None`` to skip writing.

    Returns
    -------
    numpy.ndarray
        The float32 outputs of all batches stacked along the first axis.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 or ``X`` has no samples.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    n_samples = X.shape[0]
    if n_samples == 0:
        raise ValueError("X contains no samples")

    rp = RecurrencePlot(threshold="point", percentage=20)

    # The output file is no longer truncated up front: a failure during the
    # transform would otherwise leave behind an empty, unreadable archive.
    batches = []
    for start in range(0, n_samples, batch_size):
        end = min(start + batch_size, n_samples)
        print(f"Processing {start}:{end}/{n_samples}")
        # float32 halves the memory of the stacked images.
        batches.append(rp.fit_transform(X[start:end]).astype(np.float32))

    X_rp = np.vstack(batches)
    if filename is not None:
        np.savez_compressed(filename, X_rp=X_rp)
    return X_rp

# Train split: pyts_in_batches already writes the images to the given file under
# the key "X_rp"; the following save overwrites that file so the archive holds
# the images together with the labels (keys "X_rp_train" and "y_train").
X_rp_train = pyts_in_batches(X_train_channel, batch_size=500, filename="features_pyts_train.npz")
np.savez_compressed("features_pyts_train.npz", X_rp_train=X_rp_train, y_train=y_train)

# Test split: same pattern, stored under the key "X_rp_test" without labels.
X_rp_test = pyts_in_batches(X_test_channel, batch_size=500, filename="features_pyts_test.npz")
np.savez_compressed("features_pyts_test.npz", X_rp_test=X_rp_test)

print("PyTS features extracted and saved")

Processing 0:500/7352
Processing 500:1000/7352
Processing 1000:1500/7352
Processing 1500:2000/7352
Processing 2000:2500/7352
Processing 2500:3000/7352
Processing 3000:3500/7352
Processing 3500:4000/7352
Processing 4000:4500/7352
Processing 4500:5000/7352
Processing 5000:5500/7352
Processing 5500:6000/7352
Processing 6000:6500/7352
Processing 6500:7000/7352
Processing 7000:7352/7352
Processing 0:500/2947
Processing 500:1000/2947
Processing 1000:1500/2947
Processing 1500:2000/2947
Processing 2000:2500/2947
Processing 2500:2947/2947
PyTS features extracted and saved


## Preliminary Testing

In this section, we employ the `scikit-learn` and `xgboost` libraries to conduct an initial benchmarking of different classifiers.  
The objective is twofold: (i) to identify the most suitable models to later implement from scratch, and (ii) to compare the performance of the feature extraction approaches (PyTS vs. TsFresh).  

This preliminary evaluation provides guidance on which models and feature representations are most promising for the Human Activity Recognition task, ensuring that the manual implementations focus on methods with strong empirical performance.


In [6]:
import pandas as pd
import numpy as np

# === TsFresh data ===
features_train = pd.read_csv("features_tsfresh_train.csv")
features_test  = pd.read_csv("features_tsfresh_test.csv")

# Separate labels. They are shifted down by one so the models below can use
# them as 0-based class indices; predictions must be shifted back (+1) before
# being reported in the original label space.
y_train = features_train["activity"].astype(int) - 1
X_train = features_train.drop(columns=["activity"])
X_test  = features_test

# === PyTS data ===
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open;
# read the arrays inside a `with` block so the handle is closed afterwards.
with np.load("features_pyts_train.npz") as data_train:
    X_rp_train = data_train["X_rp_train"]
    y_rp_train = data_train["y_train"].astype(int) - 1

with np.load("features_pyts_test.npz") as data_test:
    X_rp_test = data_test["X_rp_test"]

# Flatten images for sklearn classifiers (one row per sample)
X_rp_train = X_rp_train.reshape(X_rp_train.shape[0], -1)
X_rp_test  = X_rp_test.reshape(X_rp_test.shape[0], -1)

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

print("=== Benchmarking Multiple Models with TsFresh data ===")

# 80/20 hold-out split of the TsFresh feature table.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# The scaler is fitted on the training part only and then applied to validation.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Linear SVM": LinearSVC(max_iter=2000),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42)
}

# Models trained on standardized features; every other model gets the raw
# features. ("RBF SVM" has no entry in `models`, so it never matches.)
scaled_models = ["Logistic Regression", "Naive Bayes", "Linear SVM", "RBF SVM", "KNN (k=5)"]

results = {}
for name, model in models.items():
    print(f"\n>>> {name}")

    use_scaled = name in scaled_models
    fit_X = X_tr_scaled if use_scaled else X_tr
    eval_X = X_val_scaled if use_scaled else X_val

    model.fit(fit_X, y_tr)
    y_pred = model.predict(eval_X)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")
    cm = confusion_matrix(y_val, y_pred)

    results[name] = {"Accuracy": acc, "F1": f1}

    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)


=== Benchmarking Multiple Models with TsFresh data ===

>>> Logistic Regression
Accuracy: 0.938137321549966
F1 Score: 0.9381845524468959
Confusion Matrix:
 [[236  10   1   0   0   0]
 [  6 192   2   0   0   0]
 [ 10   4 192   0   0   0]
 [  0   0   0 234  28   0]
 [  0   0   0  30 246   0]
 [  0   0   0   0   0 280]]

>>> Decision Tree
Accuracy: 0.9401767505098573
F1 Score: 0.9401592486401132
Confusion Matrix:
 [[226  14   6   0   1   0]
 [ 20 173   7   0   0   0]
 [ 11   8 187   0   0   0]
 [  0   0   0 245  17   0]
 [  0   0   0   4 272   0]
 [  0   0   0   0   0 280]]

>>> Random Forest
Accuracy: 0.9789259007477906
F1 Score: 0.9789219276974194
Confusion Matrix:
 [[244   2   1   0   0   0]
 [  2 196   2   0   0   0]
 [  4   2 200   0   0   0]
 [  0   0   0 253   9   0]
 [  0   0   0   9 267   0]
 [  0   0   0   0   0 280]]

>>> Naive Bayes
Accuracy: 0.8103331067301156
F1 Score: 0.8021684334964048
Confusion Matrix:
 [[208  28  11   0   0   0]
 [ 37 151  12   0   0   0]
 [ 25   7 174  

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression

print("=== Benchmarking Multiple Models with PyTS data ===")

# 80/20 hold-out split of the flattened recurrence-plot images.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_rp_train, y_rp_train, test_size=0.2, random_state=42
)

# The scaler is fitted on the training part only and then applied to validation.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_tr)
X_val_scaled = scaler.transform(X_val)

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "Naive Bayes": GaussianNB(),
    "Linear SVM": LinearSVC(max_iter=2000),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42)
}

# Models trained on standardized features; every other model gets the raw
# features. ("RBF SVM" has no entry in `models`, so it never matches.)
scaled_models = ["Logistic Regression", "Naive Bayes", "Linear SVM", "RBF SVM", "KNN (k=5)"]

results = {}
for name, model in models.items():
    print(f"\n>>> {name}")

    use_scaled = name in scaled_models
    fit_X = X_tr_scaled if use_scaled else X_tr
    eval_X = X_val_scaled if use_scaled else X_val

    model.fit(fit_X, y_tr)
    y_pred = model.predict(eval_X)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="weighted")
    cm = confusion_matrix(y_val, y_pred)

    results[name] = {"Accuracy": acc, "F1": f1}

    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Confusion Matrix:\n", cm)


=== Benchmarking Multiple Models with PyTS data ===

>>> Logistic Regression
Accuracy: 0.504418762746431
F1 Score: 0.5018065264523589
Confusion Matrix:
 [[170  29  47   0   1   0]
 [ 37 119  41   0   0   3]
 [ 37  47 104  10   5   3]
 [  2   2   3  75 113  67]
 [  3   1   5  74 129  64]
 [  5   4   1  52  73 145]]

>>> Decision Tree
Accuracy: 0.2610469068660775
F1 Score: 0.26108175277468004
Confusion Matrix:
 [[84 46 42 22 29 24]
 [47 47 41 16 17 32]
 [42 45 44 30 20 25]
 [21 26 29 63 64 59]
 [32 30 22 66 70 56]
 [34 22 39 45 64 76]]

>>> Random Forest
Accuracy: 0.469068660774983
F1 Score: 0.4614348757657078
Confusion Matrix:
 [[175  28  21   5  13   5]
 [ 66 101   7   7   3  16]
 [ 67  32  48  13  28  18]
 [  2   0   0  81 117  62]
 [  0   2   1  88 150  35]
 [  1   0   1  54  89 135]]

>>> Naive Bayes
Accuracy: 0.42895989123045547
F1 Score: 0.42940807607608267
Confusion Matrix:
 [[115  60  64   2   6   0]
 [ 39 100  36   6  17   2]
 [ 43  39  72  12  40   0]
 [  0   0   0  65 162  35

### Analysis and Conclusions

The preliminary experiments revealed a clear difference between the two feature extraction approaches.  
While the PyTS recurrence plot representation yielded poor classification performance (maximum weighted F1 ≈ 0.51 across models), the TsFresh feature extraction consistently led to strong results, with all models achieving weighted F1 scores above 0.80.  
This confirms that TsFresh is substantially more effective at extracting informative features from the time-series data in the Human Activity Recognition dataset.

On TsFresh features, the classifiers ranked by weighted F1 (best first) were XGBoost, Random Forest, k-Nearest Neighbors (KNN), Linear SVM, Decision Tree, and Logistic Regression, with Naive Bayes last at ≈ 0.80.  
Although ensemble methods such as XGBoost and Random Forest achieved the highest scores, their complexity makes them less suitable for a from-scratch implementation in this project.  

For this reason, we selected **Decision Tree** and **k-Nearest Neighbors** as the two models to implement manually. Both achieved competitive performance while being considerably simpler to code and interpret, and in addition, we already have prior experience implementing these algorithms.


# Model 1: Decision Tree

In [7]:
import numpy as np
import pandas as pd

def gini(y):
    """Return the Gini impurity of a label vector (0.0 for an empty one)."""
    if not len(y):
        return 0.0
    _, freq = np.unique(y, return_counts=True)
    share = freq / freq.sum()
    # 1 - sum of squared class proportions
    return 1.0 - (share ** 2).sum()

def make_leaf(y, n_classes):
    """Build a leaf node from the labels that reached it.

    ``proba`` holds the class frequencies of ``y`` (counted with
    ``np.bincount`` padded to at least ``n_classes`` entries), or all zeros
    when ``y`` is empty; ``pred`` is the index of its largest entry.
    """
    if len(y):
        tally = np.bincount(y, minlength=n_classes)
        distribution = tally / tally.sum()
    else:
        distribution = np.zeros(n_classes)

    node = {"leaf": True, "pred": int(np.argmax(distribution)), "proba": distribution}
    # Split-related fields are unused on leaves.
    node.update({"feature": None, "thr": None, "cat": None,
                 "left": None, "right": None})
    return node

def make_cont_node(feature, thr, left, right):
    """Build an internal node that splits ``feature`` at the numeric threshold ``thr``."""
    node = {"leaf": False, "pred": None, "proba": None}
    node["feature"] = feature
    node["thr"] = float(thr)
    node["cat"] = None  # only set on categorical nodes
    node["left"] = left
    node["right"] = right
    return node

def make_cat_node(feature, cat, left, right):
    """Build an internal node that tests ``feature`` against the category ``cat`` (stored as str)."""
    node = {"leaf": False, "pred": None, "proba": None}
    node["feature"] = feature
    node["thr"] = None  # only set on continuous nodes
    node["cat"] = str(cat)
    node["left"] = left
    node["right"] = right
    return node

class DecisionTree:
    """Classification tree grown greedily by maximising Gini gain.

    Continuous columns are split with ``value <= thr``; categorical columns
    with ``str(value) == cat`` (one category against the rest). Nodes are the
    plain dicts produced by ``make_leaf``, ``make_cont_node`` and
    ``make_cat_node``. ``X`` is expected to be a pandas DataFrame and labels
    non-negative integers.
    """

    def fit(self, X, y, cont_cols, cat_cols,
            max_depth=10, min_samples_split=20, min_samples_leaf=10,
            max_thr_candidates=64, random_state=42):
        """Grow the tree on ``X``/``y`` and return ``self``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature table; rows are aligned with ``y`` by position.
        y : array-like of non-negative int
            Class labels.
        cont_cols, cat_cols : iterable of column names
            Columns treated as continuous / categorical.
        max_depth : int
            Nodes at this depth become leaves.
        min_samples_split : int
            Nodes with fewer samples become leaves.
        min_samples_leaf : int
            Splits leaving fewer samples on either side are rejected.
        max_thr_candidates : int
            Columns with more distinct values than this are tested at that
            many quantiles between 5% and 95%; otherwise at the midpoints
            between consecutive distinct values.
        random_state : int
            Stored on the instance; the fitting code does not use it.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        self.max_depth = int(max_depth)
        self.min_samples_split = int(min_samples_split)
        self.min_samples_leaf = int(min_samples_leaf)
        self.max_thr_candidates = int(max_thr_candidates)
        self.random_state = int(random_state)

        self.cont_cols = list(cont_cols)
        self.cat_cols = list(cat_cols)
        self.cat_levels = {c: set(X[c].astype(str).unique()) for c in self.cat_cols}

        # Work on a plain array so boolean masks index labels by position.
        y = np.asarray(y)
        if y.size == 0:
            raise ValueError("cannot fit a DecisionTree on an empty label vector")

        # Leaves count labels with np.bincount, whose length is driven by the
        # largest label. Sizing by max + 1 (instead of the number of distinct
        # labels) keeps every leaf's `proba` the same length even when labels
        # are not exactly 0..k-1; for 0..k-1 labels the value is unchanged.
        self.n_classes_ = int(y.max()) + 1
        self.root = self.grow(X, y, depth=0)
        return self

    def _split_gain(self, y, left_idx, parent_imp):
        """Return ``(gain, right_idx)`` for a boolean partition.

        ``gain`` is ``None`` when either side has fewer than
        ``min_samples_leaf`` samples.
        """
        right_idx = ~left_idx
        if left_idx.sum() < self.min_samples_leaf or right_idx.sum() < self.min_samples_leaf:
            return None, right_idx
        g_left = gini(y[left_idx])
        g_right = gini(y[right_idx])
        # Child impurities weighted by the share of samples on each side.
        gain = parent_imp - (left_idx.mean()*g_left + right_idx.mean()*g_right)
        return gain, right_idx

    def best_split(self, X, y):
        """Return the split with the largest positive Gini gain, or ``None``.

        The result is a dict with keys ``gain``, ``feature``, ``type``
        (``"cont"`` or ``"cat"``), ``thr`` or ``cat``, and the boolean masks
        ``left_idx`` / ``right_idx``.
        """
        m = len(y)
        best = {"gain": 0.0}
        parent_imp = gini(y)
        if m < self.min_samples_split or parent_imp == 0.0:
            return None

        for col in self.cont_cols:
            values = X[col].astype(float).values
            uniq = np.unique(values)
            if len(uniq) <= 1:
                continue
            if len(uniq) > self.max_thr_candidates:
                # Too many distinct values: sample thresholds at quantiles.
                qs = np.linspace(0.05, 0.95, self.max_thr_candidates)
                thr_cands = np.unique(np.quantile(values, qs))
            else:
                # Midpoints between consecutive distinct values.
                s = np.sort(uniq)
                thr_cands = (s[:-1] + s[1:]) / 2.0

            for thr in thr_cands:
                left_idx = values <= thr
                gain, right_idx = self._split_gain(y, left_idx, parent_imp)
                if gain is not None and gain > best["gain"]:
                    best = {"gain": gain, "feature": col, "type": "cont", "thr": float(thr),
                            "left_idx": left_idx, "right_idx": right_idx}

        for col in self.cat_cols:
            vals = X[col].astype(str).values
            for cat in np.unique(vals):
                left_idx = (vals == cat)
                gain, right_idx = self._split_gain(y, left_idx, parent_imp)
                if gain is not None and gain > best["gain"]:
                    best = {"gain": gain, "feature": col, "type": "cat", "cat": cat,
                            "left_idx": left_idx, "right_idx": right_idx}

        return best if best["gain"] > 0.0 else None

    def grow(self, X, y, depth):
        """Recursively build and return the node for the samples ``X``/``y``."""
        if depth >= self.max_depth or len(y) < self.min_samples_split or gini(y) == 0.0:
            return make_leaf(y, self.n_classes_)

        split = self.best_split(X, y)
        if split is None:
            return make_leaf(y, self.n_classes_)

        left = self.grow(X[split["left_idx"]], y[split["left_idx"]], depth+1)
        right = self.grow(X[split["right_idx"]], y[split["right_idx"]], depth+1)
        if split["type"] == "cont":
            return make_cont_node(split["feature"], split["thr"], left, right)
        return make_cat_node(split["feature"], split["cat"], left, right)

    def predict_row(self, row, node):
        """Walk from ``node`` down to a leaf for one row; return ``(pred, proba)``."""
        while not node["leaf"]:
            val = row[node["feature"]]
            if node["thr"] is not None:
                go_left = float(val) <= node["thr"]
            else:
                go_left = str(val) == node["cat"]
            node = node["left"] if go_left else node["right"]
        return node["pred"], node["proba"]

    def predict(self, X):
        """Return the predicted class index for every row of ``X`` as an int array."""
        out = []
        for _, r in X.iterrows():
            p, _ = self.predict_row(r, self.root)
            out.append(p)
        return np.array(out, dtype=int)

    def predict_proba(self, X):
        """Return the leaf class frequencies for every row of ``X``."""
        out = []
        for _, r in X.iterrows():
            _, pr = self.predict_row(r, self.root)
            out.append(pr)
        return np.array(out)


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Stratified 80/20 hold-out split of the TsFresh features.
X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# Every TsFresh column is treated as continuous; fit() returns the tree itself.
tree_params = dict(max_depth=10, min_samples_split=20, min_samples_leaf=10)
tree = DecisionTree().fit(X_tr, y_tr, cont_cols=X_tr.columns, cat_cols=[], **tree_params)

y_pred = tree.predict(X_val)
acc = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average="weighted")
cm = confusion_matrix(y_val, y_pred)
report = classification_report(y_val, y_pred)

print("Custom Decision Tree Results:")
for caption, value in (
    ("Accuracy:", acc),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", cm),
    ("Classification Report:\n", report),
):
    print(caption, value)

Custom Decision Tree Results:
Accuracy: 0.9211420802175391
F1 Score: 0.9210428000281494
Confusion Matrix:
 [[210  21  14   0   0   0]
 [ 22 181  11   1   0   0]
 [ 11   9 177   0   0   0]
 [  0   0   0 241  16   0]
 [  0   0   0  11 264   0]
 [  0   0   0   0   0 282]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.86      0.86       245
           1       0.86      0.84      0.85       215
           2       0.88      0.90      0.89       197
           3       0.95      0.94      0.95       257
           4       0.94      0.96      0.95       275
           5       1.00      1.00      1.00       282

    accuracy                           0.92      1471
   macro avg       0.92      0.92      0.92      1471
weighted avg       0.92      0.92      0.92      1471



In [13]:
# Refit on the whole training set with the default hyper-parameters of fit()
# (which returns the tree itself), then predict the unlabeled test features.
tree = DecisionTree().fit(X_train, y_train, cont_cols=X_train.columns, cat_cols=[])
y_pred_test = tree.predict(X_test)

In [None]:
# The tree was trained on labels shifted down by one (activity - 1), so the
# predictions are shifted back before being previewed AND exported. A separate
# array is used so re-running this cell never shifts the labels twice.
y_pred_test = np.array(y_pred_test, dtype=int)
activity_labels = y_pred_test + 1

print("Test predictions shape:", y_pred_test.shape)
print("First 10 test predictions:", activity_labels[:10])

submission = pd.DataFrame({
    "ID": np.arange(1, len(activity_labels) + 1),
    "Activity": activity_labels
})
assert submission.shape[0] == 2947, "Row count must be 2947!"
submission.to_csv("submissionDT.csv", index=False)
print("submissionDT.csv saved")

Test predictions shape: (2947,)
First 10 test predictions: [5 5 5 5 5 5 5 5 4 5]
submissionDT.csv saved
