# Notebook 4: DTW + KNN Benchmark on Standardized Data

## Setup & Imports

In [1]:
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Seed NumPy's global random number generator so that any code drawing from
# np.random behaves the same on every run. The train/test split further down
# does not rely on this: it passes its own random_state=42 explicitly.
np.random.seed(42)

## Load 10D Data

In [8]:
def load_standardized_10d_data(data_path=r"C:\Users\jumia\Downloads\BackTap\backtapbench_standard\backtapbench_data.npz"):
    """
    Load the standardized 10D dataset from the .npz file created in Notebook 3.

    Parameters
    ----------
    data_path : str
        Location of the ``.npz`` archive. The default is an absolute path on
        one specific machine; pass the path explicitly anywhere else.

    Returns
    -------
    X : np.ndarray
        The archive's ``'segments'`` array, shaped
        ``(n_samples, n_timepoints, n_sensors)``.
    y : np.ndarray
        The archive's ``'labels'`` array, one entry per sample.

    Raises
    ------
    FileNotFoundError
        If ``data_path`` does not exist.
    KeyError
        If the archive does not contain ``'segments'`` or ``'labels'``.
    ValueError
        If ``segments`` is not 3-dimensional, or if the number of samples
        differs from the number of labels.
    """
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"Dataset not found at {data_path}. Please run Notebooks 1-3 first.")

    # np.load on an .npz returns a lazy archive object that keeps the file
    # open. Reading inside a `with` block materialises both arrays and then
    # releases the handle (an open handle blocks overwriting/deleting the
    # file on Windows).
    with np.load(data_path) as data:
        missing = [key for key in ('segments', 'labels') if key not in data.files]
        if missing:
            raise KeyError(
                f"{data_path} is missing required array(s) {missing}; found {list(data.files)}"
            )
        X = data['segments']      # Shape: (n_samples, n_timepoints, n_sensors)
        y = data['labels']        # Shape: (n_samples,)

    # Validate before reporting: the summary below indexes X.shape[2].
    if X.ndim != 3:
        raise ValueError(
            f"'segments' must be 3-D (n_samples, n_timepoints, n_sensors); got shape {X.shape}"
        )
    if y.shape[:1] != X.shape[:1]:
        raise ValueError(
            f"Sample count mismatch: 'segments' has shape {X.shape} but 'labels' has shape {y.shape}"
        )

    print(f"âœ… Dataset loaded successfully!")
    print(f"   Total samples: {X.shape[0]}")
    print(f"   Sequence length: {X.shape[1]} timepoints")
    print(f"   Number of sensors: {X.shape[2]} (10D)")
    print(f"   Unique labels: {np.unique(y)}")

    return X, y

# Load the full dataset into X (segments) and y (labels), reading from the
# function's default data_path.
X, y = load_standardized_10d_data()

âœ… Dataset loaded successfully!
   Total samples: 1299
   Sequence length: 31 timepoints
   Number of sensors: 10 (10D)
   Unique labels: [0 1 2 3 4 5 6 7 8]


## Train/Test Split

In [13]:
# Hold out 20% of the samples as the test set. stratify=y keeps the class
# proportions approximately equal in both partitions, and random_state=42
# makes the split repeatable. Same split settings as Notebook 3, for
# consistency.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity checks: features and labels stay aligned and no sample is lost.
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)
assert len(X_train) + len(X_test) == len(X)

print(f"\nðŸ“Š Data split:")
print(f"   Training: {X_train.shape[0]} samples")
print(f"   Test: {X_test.shape[0]} samples")


ðŸ“Š Data split:
   Training: 1039 samples
   Test: 260 samples


## DTW Computation

In [18]:
def compute_dtw_distance_matrix(X1, X2, batch_size=100):
    """
    Build the pairwise DTW distance matrix between two collections of sequences.

    Entry ``[i, j]`` of the result is the distance returned by ``fastdtw`` for
    ``X1[i]`` and ``X2[j]``, using ``euclidean`` as the point-wise cost. Pairs
    are evaluated one at a time in nested loops; ``batch_size`` is accepted
    but is not used by the computation.

    Returns a float array of shape ``(len(X1), len(X2))``.
    """
    n_rows, n_cols = len(X1), len(X2)
    print(f"Computing {n_rows * n_cols:,} DTW distances...")

    result = np.zeros((n_rows, n_cols))
    for row in tqdm(range(n_rows), desc="Processing sequences"):
        query = X1[row]
        # fastdtw returns (distance, warping_path); only the distance is kept.
        result[row] = [
            fastdtw(query, X2[col], dist=euclidean)[0]
            for col in range(n_cols)
        ]

    return result

def _load_matching_cache(cache_path, expected_shape):
    """Return the array stored at cache_path, or None if it is absent, unreadable or has the wrong shape."""
    if not os.path.exists(cache_path):
        return None
    try:
        cached = np.load(cache_path)
    except (OSError, ValueError, EOFError) as exc:
        # e.g. a truncated file left behind by an interrupted save
        print(f"   Ignoring unreadable cache {cache_path}: {exc}")
        return None
    if cached.shape != expected_shape:
        print(f"   Ignoring stale cache {cache_path}: shape {cached.shape} != expected {expected_shape}")
        return None
    return cached


def _save_cache_atomically(cache_path, matrix):
    """Write matrix to cache_path via a temporary file so an interrupted save never leaves a truncated cache."""
    tmp_path = cache_path + '.tmp'
    with open(tmp_path, 'wb') as fh:
        np.save(fh, matrix)
    os.replace(tmp_path, cache_path)


def get_cached_distance_matrices(X_train, X_test, cache_dir='dtw_cache'):
    """
    Load cached distance matrices or compute and cache them.

    The train-vs-train matrix and the test-vs-train matrix are cached as
    separate ``.npy`` files inside ``cache_dir`` and handled independently:
    each one is written to disk as soon as it has been computed, so a failure
    while computing the second matrix does not throw away the first.

    A cached file is only reused when its shape matches the current data
    (``(len(X_train), len(X_train))`` and ``(len(X_test), len(X_train))``);
    otherwise it is recomputed and overwritten. The shape check cannot detect
    a different split of the same size -- delete ``cache_dir`` to force a
    recomputation in that case.

    Parameters
    ----------
    X_train, X_test : sequence of sequences
        Training and test sequences, passed to ``compute_dtw_distance_matrix``.
    cache_dir : str
        Directory holding the cache files; created if it does not exist.

    Returns
    -------
    (distance_train, distance_test) : tuple of np.ndarray
    """
    os.makedirs(cache_dir, exist_ok=True)

    train_cache = os.path.join(cache_dir, 'dtw_train_distance.npy')
    test_cache = os.path.join(cache_dir, 'dtw_test_distance.npy')

    n_train, n_test = len(X_train), len(X_test)
    distance_train = _load_matching_cache(train_cache, (n_train, n_train))
    distance_test = _load_matching_cache(test_cache, (n_test, n_train))

    if distance_train is not None and distance_test is not None:
        print("ðŸ“¦ Loading cached distance matrices...")
        print(f"   Train distance matrix: {distance_train.shape}")
        print(f"   Test distance matrix: {distance_test.shape}")
        return distance_train, distance_test

    print("ðŸ”„ Computing DTW distance matrices (this may take a while)...")
    start_time = time.time()

    # Only the missing/invalid matrices are computed; each is persisted
    # immediately so the expensive work survives a later failure.
    if distance_train is None:
        distance_train = compute_dtw_distance_matrix(X_train, X_train)
        _save_cache_atomically(train_cache, distance_train)
    if distance_test is None:
        distance_test = compute_dtw_distance_matrix(X_test, X_train)
        _save_cache_atomically(test_cache, distance_test)

    compute_time = time.time() - start_time
    print(f"   Computation time: {compute_time:.2f} seconds")
    print(f"   ðŸ’¾ Cached to {cache_dir}/")

    return distance_train, distance_test

## DTW + KNN Benchmark

In [21]:
def run_dtw_knn_benchmark(X_train, X_test, y_train, y_test, use_cache=True, k=1):
    """
    Complete DTW + KNN benchmark pipeline.

    Steps: encode the labels, obtain the train-vs-train and test-vs-train DTW
    distance matrices (from ``get_cached_distance_matrices`` when
    ``use_cache`` is true, otherwise computed directly), fit a
    ``KNeighborsClassifier`` with ``metric='precomputed'`` on the training
    matrix, predict the test set and report accuracy.

    Parameters
    ----------
    X_train, X_test : sequence of sequences
        Training and test sequences.
    y_train, y_test : sequence
        Labels aligned with ``X_train`` / ``X_test``.
    use_cache : bool
        Whether to go through the on-disk distance-matrix cache.
    k : int
        Number of neighbours; must satisfy ``1 <= k <= len(X_train)``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'y_pred'``, ``'y_test_enc'``,
        ``'execution_time'`` (seconds, covering the whole call),
        ``'knn_model'``, ``'label_encoder'`` and ``'distance_test'``.

    Raises
    ------
    ValueError
        If feature/label lengths disagree or ``k`` is out of range. These are
        checked up front, before any distance is computed.
    """
    # Fail fast: the distance computation below can run for hours, so reject
    # inputs that would only blow up afterwards in fit()/predict().
    n_train, n_test = len(X_train), len(X_test)
    if len(y_train) != n_train or len(y_test) != n_test:
        raise ValueError(
            f"Feature/label length mismatch: X_train={n_train}, y_train={len(y_train)}, "
            f"X_test={n_test}, y_test={len(y_test)}"
        )
    if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or not 1 <= k <= n_train:
        raise ValueError(
            f"k must be an integer between 1 and the number of training samples ({n_train}); got {k!r}"
        )

    start_time = time.time()

    print("\n" + "="*70)
    print("ðŸš€ DTW + KNN BENCHMARK (10D Standardized Data)")
    print("="*70)

    # 1. Encode labels (encoder is fitted on the training labels only)
    le = LabelEncoder()
    y_train_enc = le.fit_transform(y_train)
    y_test_enc = le.transform(y_test)

    # 2. Compute/load distance matrices
    if use_cache:
        distance_train, distance_test = get_cached_distance_matrices(X_train, X_test)
    else:
        distance_train = compute_dtw_distance_matrix(X_train, X_train)
        distance_test = compute_dtw_distance_matrix(X_test, X_train)

    # 3. Train KNN with precomputed metric: fit() receives the train-vs-train
    #    distances rather than raw features
    print("\nðŸ¤– Training KNN classifier...")
    knn = KNeighborsClassifier(n_neighbors=k, metric='precomputed')
    knn.fit(distance_train, y_train_enc)

    # 4. Predict from the test-vs-train distances (rows = test samples)
    print("   Making predictions...")
    y_pred = knn.predict(distance_test)

    # 5. Evaluate
    accuracy = accuracy_score(y_test_enc, y_pred)
    exec_time = time.time() - start_time

    print(f"\nâœ… Results:")
    print(f"   Accuracy: {accuracy*100:.2f}%")
    print(f"   Execution time: {exec_time:.2f} seconds")
    print(f"   k = {k}")

    return {
        'accuracy': accuracy,
        'y_pred': y_pred,
        'y_test_enc': y_test_enc,
        'execution_time': exec_time,
        'knn_model': knn,
        'label_encoder': le,
        'distance_test': distance_test
    }

## Execute Benchmark

In [24]:
# Run the benchmark with a 1-nearest-neighbour classifier (k=1).
# use_cache=True routes the distance matrices through
# get_cached_distance_matrices, which reuses the saved .npy files when they
# exist instead of recomputing them.
results = run_dtw_knn_benchmark(
    X_train, X_test, y_train, y_test, 
    use_cache=True, k=1
)


ðŸš€ DTW + KNN BENCHMARK (10D Standardized Data)
ðŸ”„ Computing DTW distance matrices (this may take a while)...
Computing 1,079,521 DTW distances...


Processing sequences: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1039/1039 [3:56:29<00:00, 13.66s/it]   


Computing 270,140 DTW distances...


Processing sequences: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 260/260 [26:51<00:00,  6.20s/it]


   Computation time: 15801.07 seconds
   ðŸ’¾ Cached to dtw_cache/

ðŸ¤– Training KNN classifier...
   Making predictions...


NameError: name 'accuracy_score' is not defined