# **Pirate Pain Challenge — Exploratory Analysis**

---

Questa analisi esplora il dataset della Challenge Pirate Pain per supportare un modello GRU monodirezionale (solo GPU) con particolare attenzione ai campioni high pain, che sono rari e critici per le performance.


## 🌐 **Google Drive Connection**


In [None]:
# Mount Google Drive under /content/drive using the google.colab helper,
# so that the Drive-based paths used later in the notebook resolve.
from google.colab import drive
drive.mount("/content/drive")


## ⚙️ **Libraries Import**


In [None]:
from pathlib import Path
from typing import Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide defaults: seaborn theme first, then the figure size on top of
# it, then wider pandas display limits for column/row previews.
sns.set(context="talk", style="whitegrid")
plt.rcParams.update({"figure.figsize": (14, 6)})
pd.set_option("display.max_columns", 120)
pd.set_option("display.max_rows", 200)


## ⏳ **Data Loading**


In [None]:
# Root folder of the challenge material on the mounted Drive; the CSV files
# are read from its 'data' subfolder.
DATA_DIR = Path('/content/drive/MyDrive/[2025-2026] AN2DL/Challenge')

def load_data(data_dir: Path) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the three challenge CSV files stored under ``data_dir / 'data'``.

    Args:
        data_dir: Folder that contains the ``data`` subdirectory.

    Returns:
        ``(X_train, y_train, X_test)`` read, in this order, from
        ``pirate_pain_train.csv``, ``pirate_pain_train_labels.csv`` and
        ``pirate_pain_test.csv``.
    """
    csv_dir = data_dir / 'data'
    file_names = (
        'pirate_pain_train.csv',
        'pirate_pain_train_labels.csv',
        'pirate_pain_test.csv',
    )
    X_train, y_train, X_test = (pd.read_csv(csv_dir / file_name) for file_name in file_names)
    return X_train, y_train, X_test

# Load the three tables once and print the (rows, columns) shape of each.
X_train_raw, y_train, X_test_raw = load_data(DATA_DIR)
for caption, table in (
    ('X_train shape:', X_train_raw),
    ('y_train shape:', y_train),
    ('X_test shape:', X_test_raw),
):
    print(caption, table.shape)


## 🔧 **Feature Casting**


In [None]:
# Numeric types are enforced before any aggregation; start from fresh copies
# of both feature tables so the casting that follows works on its own objects.
X_train_raw, X_test_raw = X_train_raw.copy(), X_test_raw.copy()

# Spelled-out digits accepted in the body-part count columns.
word_to_num = {
    'zero': 0,
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
}


def coerce_numeric_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Convert the ``n_legs``, ``n_hands`` and ``n_eyes`` columns to numbers.

    Each value is lower-cased and parsed as a number; when that fails, the
    first spelled-out digit found in the text (a key of ``word_to_num``) is
    used instead, so ``'one+peg_leg'`` becomes ``1``. A number word only
    counts when it is not part of a longer alphabetic word: ``'none'`` and
    ``'nineteen'`` are left as NaN rather than being read as 1 and 9.

    Args:
        df: Frame to convert. Columns that are absent are skipped.

    Returns:
        The same ``df`` object; the present count columns are overwritten
        in place. Values that match neither form become NaN.
    """
    # Build the alternation from the mapping so the two cannot drift apart.
    # The lookarounds reject matches glued to other letters ('none' -> no match)
    # while still allowing separators such as '+' or '_' around the word.
    word_pattern = r'(?<![a-z])(' + '|'.join(word_to_num) + r')(?![a-z])'

    for col in ('n_legs', 'n_hands', 'n_eyes'):
        if col not in df.columns:
            continue
        lowered = df[col].astype(str).str.lower()
        as_number = pd.to_numeric(lowered, errors='coerce')
        as_word = lowered.str.extract(word_pattern, expand=False).map(word_to_num)
        # Direct numeric parses win; number words only fill the gaps.
        df[col] = as_number.fillna(as_word)
    return df

# Run the count-column conversion on both splits.
X_train_raw, X_test_raw = (
    coerce_numeric_counts(X_train_raw),
    coerce_numeric_counts(X_test_raw),
)

# Every column that is still typed as object is pushed through pd.to_numeric.
# NOTE(review): errors='coerce' turns each unparseable entry into NaN, so a
# genuinely textual column would come out all-missing — confirm this is intended.
for df in (X_train_raw, X_test_raw):
    object_cols = df.select_dtypes(include='object').columns
    for col in object_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Preview of the three converted count columns.
X_train_raw[['n_legs', 'n_hands', 'n_eyes']].head()


## 🔍 **First Look**


In [None]:
# Preview the first rows of the training features.
X_train_raw.head()


In [None]:
# Preview the first rows of the training labels.
y_train.head()


## 🧼 **Data Quality Checks**


In [None]:
# NaN count per feature column (largest first) and per label column.
missing_features = X_train_raw.isna().sum().sort_values(ascending=False)
missing_labels = y_train.isna().sum()

print('Missing values in features (top 10):', missing_features.head(10), sep='\n')
print('\nMissing values in labels:', missing_labels, sep='\n')


## 📊 **Label Distribution**


In [None]:
# Absolute frequency of each label and its share in percent, side by side.
label_counts = y_train['label'].value_counts().rename('count')
label_pct = label_counts.div(label_counts.sum()).rename('percent').mul(100)
display(pd.concat([label_counts, label_pct], axis='columns'))


In [None]:
# Count plot of the labels, ordered as in label_counts; the bars of the first
# container are annotated with their values.
ax = sns.countplot(x='label', data=y_train, palette='rocket', order=label_counts.index)
bars = ax.containers[0]
ax.bar_label(bars)
ax.set_title('Label counts')
plt.show()


## ⏱️ **Sequence Structure**


In [None]:
by_sample = X_train_raw.groupby('sample_index')

# Number of rows recorded for every sample_index, summarised with describe().
seq_lengths = by_sample.size()
print(seq_lengths.describe())

# Smallest and largest 'time' value observed within each sequence.
coverage = by_sample['time'].agg(['min', 'max'])
print('\nTime coverage per sequence (first 5 samples):')
display(coverage.head())


In [None]:
# sample_index values of the sequences labelled 'high_pain', with their share
# of all labelled sequences.
is_high_pain = y_train['label'].eq('high_pain')
high_pain_indices = y_train.loc[is_high_pain, 'sample_index']
print(f"# high pain sequences: {len(high_pain_indices)} ({len(high_pain_indices) / len(y_train):.1%})")
print('Sample indices (first 10):', high_pain_indices.iloc[:10].tolist())


## 🧬 **Feature Dynamics per Label**


In [None]:
numeric_cols = X_train_raw.select_dtypes(include=np.number).columns

# Split the numeric columns into the three families summarised per sequence.
survey_cols = [name for name in numeric_cols if name.startswith('pain_survey')]
joint_cols = [name for name in numeric_cols if name.startswith('joint_')]
count_cols = [name for name in ('n_legs', 'n_hands', 'n_eyes') if name in numeric_cols]

agg_columns = [*survey_cols, *joint_cols, *count_cols]

# One row per sample_index holding mean/std/min/max of every selected column.
per_sequence = X_train_raw.groupby('sample_index')[agg_columns]
sequence_summary = per_sequence.agg(['mean', 'std', 'min', 'max'])
# Flatten the (column, statistic) pairs into '<column>_<statistic>' names.
sequence_summary.columns = [f"{col}_{stat}" for col, stat in sequence_summary.columns]
# Attach each sequence's label; how='left' keeps every summarised sequence.
sequence_summary = pd.merge(sequence_summary.reset_index(), y_train, on='sample_index', how='left')
sequence_summary.head()


In [None]:
# Every summary column except the identifier and the label is a feature.
excluded = ('sample_index', 'label')
feature_cols = [name for name in sequence_summary.columns if name not in excluded]

# Mean of each feature statistic within each label.
label_means = sequence_summary[['label', *feature_cols]].groupby('label').mean(numeric_only=True)

# high_pain class mean minus the unweighted average of the other class means.
other_classes_mean = label_means.drop('high_pain').mean()
contrast = label_means.loc['high_pain'] - other_classes_mean
contrast.sort_values(key=np.abs, ascending=False).head(15)


In [None]:
# Horizontal bars for the 20 contrasts with the largest absolute value, with a
# vertical reference line at zero.
top_diff = contrast.sort_values(key=np.abs, ascending=False).head(20)
plt.figure(figsize=(14, 6))
ax = sns.barplot(x=top_diff.values, y=top_diff.index, palette='coolwarm', orient='h')
ax.axvline(0, color='black', linewidth=1)
ax.set_title('High pain vs (low + no pain) mean difference')
ax.set_xlabel('Mean difference')
ax.set_ylabel('Feature')
plt.show()


## 🩹 **Pain Survey Signals Over Time**


In [None]:
# Number of distinct values each survey column takes within each sequence.
survey_variability = X_train_raw.groupby('sample_index')[survey_cols].nunique()
print('Unique values per survey feature across timesteps (describe):')
display(survey_variability.describe())

# A sequence is constant for a survey column when it shows exactly one value.
print('\nShare of sequences with constant survey values:')
constant_share = survey_variability.eq(1).mean().sort_values(ascending=False)
display(constant_share)


In [None]:
# Per-sequence means of the survey columns, reshaped to long form for plotting.
survey_mean_cols = [
    name for name in sequence_summary.columns
    if name.startswith('pain_survey_') and name.endswith('_mean')
]
survey_melt = pd.melt(
    sequence_summary[['label', *survey_mean_cols]],
    id_vars='label',
    var_name='survey',
    value_name='mean_value',
)
# Strip the statistic suffix so the x axis shows the plain survey names.
survey_melt['survey'] = survey_melt['survey'].str.replace('_mean', '', regex=False)

plt.figure(figsize=(14, 6))
ax = sns.boxplot(x='survey', y='mean_value', hue='label', data=survey_melt, palette='rocket', showfliers=False)
ax.set_title('Distribution of mean survey values per label')
ax.set_ylabel('Mean value across time')
ax.legend(title='Label')
plt.show()


## 🦴 **Joint Sensor Highlights for High Pain**


In [None]:
# Restrict the high-pain contrast to the joint statistics and keep the 30
# entries with the largest absolute value.
joint_diff = contrast[contrast.index.str.startswith('joint_')]
joint_diff = joint_diff.sort_values(key=np.abs, ascending=False).head(30)
plt.figure(figsize=(14, 10))
ax = sns.barplot(x=joint_diff.values, y=joint_diff.index, palette='vlag', orient='h')
plt.axvline(0, color='black', linewidth=1)
plt.title('Top joint statistics differentiating high pain sequences')
# The label uses a real minus sign (U+2212) instead of mis-decoded bytes.
plt.xlabel('High pain mean − others mean')
plt.ylabel('Joint feature (statistic)')
plt.show()


## ⏳ **Sample Trajectories**


In [None]:
# Use the first listed sequence of each class as its example.
label_column = y_train['label']
example_high = high_pain_indices.iloc[0]
example_low = y_train.loc[label_column.eq('low_pain'), 'sample_index'].iloc[0]
example_no = y_train.loc[label_column.eq('no_pain'), 'sample_index'].iloc[0]

features_to_plot = ['pain_survey_1', 'pain_survey_2', 'joint_00', 'joint_15']
examples = {
    'high_pain': example_high,
    'low_pain': example_low,
    'no_pain': example_no,
}

# One stacked panel per feature, all sharing the time axis.
fig, axes = plt.subplots(nrows=len(features_to_plot), ncols=1, figsize=(14, 10), sharex=True)
for ax, feature in zip(axes, features_to_plot):
    # Only the first panel registers legend entries, so the figure-level
    # legend built below lists each class once.
    is_first_panel = feature == features_to_plot[0]
    for label, idx in examples.items():
        subset = X_train_raw[X_train_raw['sample_index'] == idx]
        ax.plot(subset['time'], subset[feature], label=label if is_first_panel else None)
    ax.set(title=feature, xlabel='time')

handles, labels = axes[0].get_legend_handles_labels()
fig.legend(handles, labels, loc='upper right')
plt.tight_layout()
plt.show()


## ⚖️ **Class Imbalance Signals**


In [None]:
# Split the per-sequence summary into high_pain rows and all remaining rows.
other_mask = sequence_summary['label'].ne('high_pain')
feature_cols = [name for name in sequence_summary.columns if name not in ('sample_index', 'label')]
features_high = sequence_summary.loc[~other_mask, feature_cols]
features_other = sequence_summary.loc[other_mask, feature_cols]
n_high, n_other = len(features_high), len(features_other)

mean_high, mean_other = (part.mean(numeric_only=True) for part in (features_high, features_other))
var_high, var_other = (part.var(numeric_only=True) for part in (features_high, features_other))

# Pooled variance: each group's variance weighted by its degrees of freedom.
pooled_var = ((n_high - 1) * var_high + (n_other - 1) * var_other) / (n_high + n_other - 2)
# A zero spread becomes NaN so that feature is dropped instead of dividing by zero.
pooled_std = np.sqrt(pooled_var).replace(0, np.nan)

# Standardized mean difference, ranked by absolute size.
cohens_d = ((mean_high - mean_other) / pooled_std).dropna()
cohens_d = cohens_d.sort_values(key=np.abs, ascending=False)
cohens_d.head(20)


In [None]:
# Horizontal bars for the first 25 entries of cohens_d, with a vertical
# reference line at zero.
top_effect = cohens_d.head(25)
plt.figure(figsize=(14, 8))
ax = sns.barplot(x=top_effect.values, y=top_effect.index, palette='Spectral', orient='h')
plt.axvline(0, color='black', linewidth=1)
plt.title("Cohen's d between high pain and other classes")
# The unit in the label is a real Greek sigma instead of mis-decoded bytes.
plt.xlabel("Effect size (σ)")
plt.ylabel("Feature statistic")
plt.show()


## 🧩 **Body Part Counts (n_legs, n_hands, n_eyes)**


In [None]:
# Per-sequence mean and std columns of the body-part counts.
count_mean_cols = [f'{col}_mean' for col in count_cols]
count_std_cols = [f'{col}_std' for col in count_cols]
count_summary = sequence_summary[['label', *count_mean_cols, *count_std_cols]]

# Average both families of statistics within each label.
by_label = count_summary.groupby('label')
count_summary_mean = by_label[count_mean_cols].mean()
count_summary_std = by_label[count_std_cols].mean()

print('Mean counts per label:')
display(count_summary_mean)
print('\nAverage temporal variability per label:')
display(count_summary_std)


## 📝 **Research Questions & Next Steps**

- Rafforzare la modellazione delle feature più discriminanti individuate (survey e articolazioni con maggiore effect size), magari con normalizzazioni dedicate.
- Valutare strategie di upsampling o loss reweighting per ridurre l'impatto dell'imbalance dei 56 soggetti high pain.
- Considerare feature engineering temporale (derivate, slope, trend) sui joint e sulle survey per alimentare la GRU monodirezionale.
- Analizzare ulteriormente i pattern dinamici dei pochi high pain (clustering, DTW) per capire quali firme preservare durante l'augmentation.
