This notebook performs gesture classification using Dynamic Time Warping (DTW). The goal is to compare a new hand trajectory captured from live camera input against a set of pre-recorded reference sequences loaded from a CSV file, and to identify which predefined target point it most closely resembles. The notebook also saves the preprocessed data arrays and the label encoder, and reports the performance metrics and the confusion matrix.

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from tensorflow.keras.preprocessing.sequence import pad_sequences
from fastdtw import fastdtw
from scipy.spatial.distance import euclidean
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import mediapipe as mp
import pyrealsense2 as rs
import time
import joblib
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from scipy.spatial.distance import euclidean
from collections import defaultdict

In [19]:
# Load data: one row per recorded frame, tagged with its target point and
# the sequence (trajectory) it belongs to.
df = pd.read_csv("hand_trajectory_labeled_new_edit.csv")

feature_cols = ['x_px', 'y_px', 'z_mm']

# Group by point and sequence ID: every group is one recorded trajectory.
grouped = df.groupby(['point_id', 'sequence_id'])

# Get the maximum sequence length dynamically
max_len = grouped.size().max()

# Collect the raw (still unscaled) frames and the label of every trajectory.
raw_frames = []
labels = []
for (point_id, sequence_id), group in grouped:
    raw_frames.append(group[feature_cols])
    labels.append(point_id)

# Encode labels to numeric
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# Split trajectory *indices* into training and test sets before any scaling.
# The partition depends only on the number of samples, `stratify` and
# `random_state`, so it is the same one obtained by splitting the padded
# array directly.
train_idx, test_idx = train_test_split(
    np.arange(len(labels)), test_size=0.4, stratify=y, random_state=42
)

# Normalize x, y, z. The scaler is fitted on training trajectories only so
# that the value ranges of the held-out test trajectories do not leak into
# the preprocessing; test (and live) values may therefore fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(pd.concat([raw_frames[i] for i in train_idx]))

# Prepare the scaled sequences, then keep `df` normalized in place as well.
sequences = [scaler.transform(frames) for frames in raw_frames]
df[feature_cols] = scaler.transform(df[feature_cols])

# Pad sequences to the max length
X = pad_sequences(sequences, maxlen=max_len, dtype='float32', padding='post', truncating='post')

X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

# Build templates from training data only: the first training sequence of
# each class serves as that class's reference trajectory.
reference_sequences = defaultdict(list)
for label, seq in zip(y_train, X_train):
    reference_sequences[label].append(seq)
templates = {label: seqs[0] for label, seqs in reference_sequences.items()}

# DTW classification function
def classify_with_dtw(sequence):
    """Return the label of the template closest to ``sequence`` under FastDTW.

    Every entry of the module-level ``templates`` mapping is compared with
    ``sequence`` via ``fastdtw`` using the Euclidean point distance. The label
    with the strictly smallest distance wins (the first one on ties).
    ``None`` is returned when no template yields a distance below infinity,
    e.g. when ``templates`` is empty.
    """
    winner = None
    shortest = float('inf')
    for candidate, template in templates.items():
        cost = fastdtw(sequence, template, dist=euclidean)[0]
        if cost < shortest:
            winner, shortest = candidate, cost
    return winner

# Classify every held-out sequence against the per-class templates.
y_pred = list(map(classify_with_dtw, X_test))

# Evaluation: per-class precision/recall/F1, then the raw confusion matrix.
print("Classification Report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)
print(report_text)
print("Confusion Matrix:")
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# Summary of the dataset that was evaluated.
for caption, value in (
    ("Input shape (X):", X.shape),
    ("Target shape (y):", y.shape),
    ("Classes:", label_encoder.classes_),
):
    print(caption, value)


Classification Report:
              precision    recall  f1-score   support

          P1       1.00      1.00      1.00         6
          P2       1.00      1.00      1.00         6
          P3       1.00      1.00      1.00         6
          P4       1.00      1.00      1.00         6
          P5       1.00      1.00      1.00         6
          P6       1.00      1.00      1.00         6
          P7       1.00      1.00      1.00         6
          P8       1.00      1.00      1.00         6
          P9       1.00      1.00      1.00         6

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54

Confusion Matrix:
[[6 0 0 0 0 0 0 0 0]
 [0 6 0 0 0 0 0 0 0]
 [0 0 6 0 0 0 0 0 0]
 [0 0 0 6 0 0 0 0 0]
 [0 0 0 0 6 0 0 0 0]
 [0 0 0 0 0 6 0 0 0]
 [0 0 0 0 0 0 6 0 0]
 [0 0 0 0 0 0 0 6 0]
 [0 0 0 0 0 0 0 0 6]]
Input shape (X): (135, 79, 3)
Target shape (y): (135,)
Classes: 

In [20]:
# Save arrays, scaler and encoder.
# X is stored already normalized and padded; y holds the encoded labels.
np.save("X_dtw_new.npy", X)
np.save("Y_dtw_new.npy", y)
# The fitted scaler is persisted too: a new trajectory has to be normalized
# with exactly the same min/max values before it can be compared with X, so
# the arrays alone are not enough to classify in a fresh session.
joblib.dump(scaler, "scaler_dtw_new.pkl")
# Kept as the last expression so the cell still displays the encoder path.
joblib.dump(label_encoder, "label_encoder_dtw_new.pkl")


['label_encoder_dtw_new.pkl']

In [52]:
# === Setup RealSense ===
# Depth and color streams at 640x480 / 30 fps; depth is aligned to the color
# image so a color pixel (cx, cy) can be looked up directly in the depth frame.
pipeline = rs.pipeline()
config = rs.config()
config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
align = rs.align(rs.stream.color)

# === Setup MediaPipe Hands ===
# Created before the camera is started so that a failure here cannot leave
# the camera streaming.
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(max_num_hands=1,
                       min_detection_confidence=0.7,
                       min_tracking_confidence=0.5)

# === Capture Sequence ===
sequence = []  # one [x_px, y_px, z_mm] entry per frame with a valid fingertip
pipeline.start(config)
try:
    print("Start moving hand — press 's' to stop recording")
    start_time = time.time()

    while True:
        frames = pipeline.wait_for_frames()
        aligned = align.process(frames)
        depth_frame = aligned.get_depth_frame()
        color_frame = aligned.get_color_frame()
        if not depth_frame or not color_frame:
            continue

        color_image = np.asanyarray(color_frame.get_data())
        rgb = cv2.cvtColor(color_image, cv2.COLOR_BGR2RGB)
        result = hands.process(rgb)

        if result.multi_hand_landmarks:
            for hand_landmarks in result.multi_hand_landmarks:
                h, w, _ = color_image.shape
                landmark = hand_landmarks.landmark[8]  # Index fingertip
                # Landmarks are normalized; convert to pixels and clamp into
                # the image. Plain Python ints are kept (rather than NumPy
                # scalars) for the native RealSense / OpenCV calls below.
                cx = min(max(int(landmark.x * w), 0), w - 1)
                cy = min(max(int(landmark.y * h), 0), h - 1)
                z = depth_frame.get_distance(cx, cy) * 1000  # in mm

                if z > 0:  # Ignore invalid points
                    sequence.append([cx, cy, z])

                # Visual feedback
                cv2.circle(color_image, (cx, cy), 8, (0, 255, 0), -1)

        cv2.putText(color_image, f"Recording ({len(sequence)} frames)...",
                    (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.7, (255, 0, 0), 2)
        cv2.imshow("Live Input", color_image)

        if cv2.waitKey(1) & 0xFF == ord('s'):
            break
finally:
    # Always release the camera, the preview window and the MediaPipe graph,
    # even when the loop is aborted by an exception or a kernel interrupt;
    # otherwise the device can stay claimed and the next start may fail.
    pipeline.stop()
    cv2.destroyAllWindows()
    hands.close()

# === Normalize & Pad the new sequence ===
new_seq = np.array(sequence)
if new_seq.shape[0] == 0:
    print("No valid data captured.")
    # Stop here without classifying. SystemExit is raised directly instead
    # of calling exit(): in a notebook exit() shuts the kernel down (losing
    # the fitted scaler and the templates), and in a plain script it is only
    # a convenience helper installed by the `site` module.
    raise SystemExit

# Normalize using the same scaler. The scaler was fitted on DataFrame
# columns, so the live frames are passed with the same column names to keep
# the feature-name check of scikit-learn consistent.
new_seq_scaled = scaler.transform(
    pd.DataFrame(new_seq, columns=['x_px', 'y_px', 'z_mm'])
)

# Pad to match training sequence length (longer recordings are cut at the end)
new_seq_padded = pad_sequences([new_seq_scaled], maxlen=X.shape[1],
                               dtype='float32', padding='post', truncating='post')

# === DTW Classification ===
def classify_with_dtw(sequence):
    """Return the label of the template closest to ``sequence`` under FastDTW.

    Every entry of the module-level ``templates`` mapping is compared with
    ``sequence`` via ``fastdtw`` using the Euclidean point distance. The label
    with the strictly smallest distance wins (the first one on ties).
    ``None`` is returned when no template yields a distance below infinity,
    e.g. when ``templates`` is empty.
    """
    winner = None
    shortest = float('inf')
    for candidate, template in templates.items():
        cost = fastdtw(sequence, template, dist=euclidean)[0]
        if cost < shortest:
            winner, shortest = candidate, cost
    return winner

# Classify the single padded live trajectory, then translate the numeric
# class code back into its original point name.
live_trajectory = new_seq_padded[0]
matched_label = classify_with_dtw(live_trajectory)
(matched_name,) = label_encoder.inverse_transform([matched_label])

print(f"Predicted target point: {matched_name}")


Start moving hand — press 's' to stop recording
Predicted target point: P7


