In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Global plotting configuration for the whole notebook.
# NOTE(review): the 'seaborn-v0_8' style name is only available in newer
# Matplotlib releases (older ones call it 'seaborn') — confirm the pinned version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the default color cycle for subsequent plots.
sns.set_palette("husl")


## Load Dataset


In [None]:
# Load the feature dataset
dataset_path = Path("../data/processed/features/jab_nonjab_dataset.csv")

# Fail fast when the CSV is missing: every later cell reads `df`, so merely
# printing a hint would leave `df` unbound and the notebook would stop a few
# cells further down with an unrelated NameError.
if not dataset_path.exists():
    raise FileNotFoundError(
        f"Dataset not found at {dataset_path}\n"
        "Please run feature engineering first:\n"
        "  python -m src.feature_engineering"
    )

df = pd.read_csv(dataset_path)
print(f"Dataset loaded: {len(df)} samples")
print(f"Columns: {df.columns.tolist()}")


## Basic Statistics


In [None]:
# Preview the first rows of the loaded dataset; as the last expression of the
# cell, the returned frame is rendered as the cell output.
df.head()


In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null
# counts) to spot missing values and unexpected types early.
df.info()


In [None]:
# Descriptive statistics for the dataset; with default arguments pandas
# summarizes the numeric columns (count, mean, std, min, quartiles, max).
df.describe()


## Label Distribution


In [None]:
# Tally how many samples carry each label value
label_counts = df['label'].value_counts()

print("Label distribution:")
print(label_counts)

# Look up each class explicitly; a class absent from the data reports 0.
jab_total = label_counts.get(1, 0)
non_jab_total = label_counts.get(0, 0)
print(f"\nJab (1): {jab_total} samples")
print(f"Non-Jab (0): {non_jab_total} samples")


In [None]:
# Plot label distribution
# value_counts() orders entries by frequency, not by label. Sort by label so
# the bar order matches the "0=Non-Jab, 1=Jab" axis caption, and pick colors
# per label so a class keeps its color regardless of which one is larger.
counts_by_label = label_counts.sort_index()
label_colors = {0: 'skyblue', 1: 'salmon'}
bar_colors = [label_colors.get(label_value, 'gray') for label_value in counts_by_label.index]

fig, ax = plt.subplots(figsize=(8, 5))
counts_by_label.plot(kind='bar', ax=ax, color=bar_colors, rot=0)
ax.set_title('Label Distribution')
ax.set_xlabel('Label (0=Non-Jab, 1=Jab)')
ax.set_ylabel('Count')
ax.grid(axis='y', alpha=0.3)
fig.tight_layout()
plt.show()


## Feature Distributions


In [None]:
# Identify feature columns (everything that is not label/window metadata)
exclude_cols = [
    'label',
    'source_file',
    'window_start_frame',
    'window_end_frame',
    'window_start_time',
    'window_end_time',
]

# Boolean mask over the column index keeps the original column order.
is_metadata = df.columns.isin(exclude_cols)
feature_cols = df.columns[~is_metadata].tolist()

print(f"Feature columns ({len(feature_cols)}):")
for col in feature_cols:
    print(f"  - {col}")


### Wrist Velocity Distribution


In [None]:
# Plot wrist velocity by label
velocity_col = 'left_wrist_velocity_mean'
if velocity_col in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    hist_ax, box_ax = axes

    # Histogram: compute one set of bin edges from the whole column and reuse
    # it for both classes. With a per-class bins=30 each class would get its
    # own bin widths, so the overlaid bar heights would not be comparable.
    bin_edges = np.histogram_bin_edges(df[velocity_col].dropna(), bins=30)
    for label_value, label_name in [(0, 'Non-Jab'), (1, 'Jab')]:
        df.loc[df['label'] == label_value, velocity_col].hist(
            ax=hist_ax, alpha=0.7, label=label_name, bins=bin_edges
        )
    hist_ax.set_xlabel('Mean Wrist Velocity')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Wrist Velocity Distribution')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # Box plot
    df.boxplot(column=velocity_col, by='label', ax=box_ax)
    box_ax.set_xlabel('Label (0=Non-Jab, 1=Jab)')
    box_ax.set_ylabel('Mean Wrist Velocity')
    box_ax.set_title('Wrist Velocity by Label')
    # Blank out the figure-level title that boxplot(by=...) adds.
    fig.suptitle('')
    fig.tight_layout()
    plt.show()
else:
    print("left_wrist_velocity_mean column not found")


### Elbow Angle Distribution


In [None]:
# Plot elbow angle by label
angle_col = 'left_elbow_angle_mean'
if angle_col in df.columns:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    hist_ax, box_ax = axes

    # Histogram: compute one set of bin edges from the whole column and reuse
    # it for both classes. With a per-class bins=30 each class would get its
    # own bin widths, so the overlaid bar heights would not be comparable.
    bin_edges = np.histogram_bin_edges(df[angle_col].dropna(), bins=30)
    for label_value, label_name in [(0, 'Non-Jab'), (1, 'Jab')]:
        df.loc[df['label'] == label_value, angle_col].hist(
            ax=hist_ax, alpha=0.7, label=label_name, bins=bin_edges
        )
    hist_ax.set_xlabel('Mean Elbow Angle (degrees)')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Elbow Angle Distribution')
    hist_ax.legend()
    hist_ax.grid(alpha=0.3)

    # Box plot
    df.boxplot(column=angle_col, by='label', ax=box_ax)
    box_ax.set_xlabel('Label (0=Non-Jab, 1=Jab)')
    box_ax.set_ylabel('Mean Elbow Angle (degrees)')
    box_ax.set_title('Elbow Angle by Label')
    # Blank out the figure-level title that boxplot(by=...) adds.
    fig.suptitle('')
    fig.tight_layout()
    plt.show()
else:
    print("left_elbow_angle_mean column not found")


## Feature Correlations


In [None]:
# Pairwise correlations between the numeric feature columns
numeric_features = df[feature_cols].select_dtypes(include=[np.number])
correlation_matrix = numeric_features.corr()

# Draw the matrix as a heatmap on an explicitly created Axes
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=False,
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.8},
    ax=ax,
)
ax.set_title('Feature Correlation Matrix')
fig.tight_layout()
plt.show()


## Summary Statistics by Label


In [None]:
# Group by label and compute statistics
# Restrict the aggregation to numeric features, using the same dtype filter as
# the correlation analysis: 'mean' and 'std' are not defined for non-numeric
# columns and would make agg() raise if any such column is in feature_cols.
numeric_feature_cols = (
    df[feature_cols].select_dtypes(include=[np.number]).columns.tolist()
)

if not feature_cols:
    print("No feature columns found")
elif not numeric_feature_cols:
    print("No numeric feature columns found")
else:
    summary = df.groupby('label')[numeric_feature_cols].agg(['mean', 'std', 'min', 'max'])
    print("Summary statistics by label:")
    print(summary)
