# Feature Engineering for Nurse Stress Detection

This notebook focuses on creating meaningful features from physiological sensor data for stress detection classification.

## Objectives:
- Create time-based features
- Generate rolling statistical features
- Extract movement and activity features
- Prepare data for machine learning models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('default')
sns.set_palette("husl")

print("=== FEATURE ENGINEERING FOR STRESS DETECTION ===\n")

=== FEATURE ENGINEERING FOR STRESS DETECTION ===



In [2]:
# Load the subset data
df = pd.read_csv(r'C:\Users\Michi\nurse-stress-wearables\data\merged_data_subset.csv')

# Convert datetime and set as index for time series operations
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values(['id', 'datetime']).reset_index(drop=True)

print(f"Dataset shape: {df.shape}")
print(f"Date range: {df['datetime'].min()} to {df['datetime'].max()}")
print(f"Subjects: {df['id'].unique()}")
print(f"Stress events: {(df['label'] != 0).sum()} ({(df['label'] != 0).mean()*100:.2f}%)")

Dataset shape: (1964182, 9)
Date range: 2020-07-16 14:49:00 to 2020-12-13 08:01:00
Subjects: [83 '6D' '83']
Stress events: 1297937 (66.08%)


## 1. Basic Time-Based Features

Extract temporal patterns that might influence stress levels.

In [3]:
def create_time_features(df):
    """Create time-based features from datetime."""
    df_time = df.copy()
    
    # Basic time features
    df_time['hour'] = df_time['datetime'].dt.hour
    df_time['day_of_week'] = df_time['datetime'].dt.dayofweek
    df_time['is_weekend'] = df_time['day_of_week'].isin([5, 6]).astype(int)
    df_time['is_night_shift'] = ((df_time['hour'] >= 22) | (df_time['hour'] <= 6)).astype(int)
    df_time['is_work_hours'] = ((df_time['hour'] >= 8) & (df_time['hour'] <= 18)).astype(int)
    
    # Cyclical encoding for hour (24-hour cycle)
    df_time['hour_sin'] = np.sin(2 * np.pi * df_time['hour'] / 24)
    df_time['hour_cos'] = np.cos(2 * np.pi * df_time['hour'] / 24)
    
    # Cyclical encoding for day of week (7-day cycle)
    df_time['dow_sin'] = np.sin(2 * np.pi * df_time['day_of_week'] / 7)
    df_time['dow_cos'] = np.cos(2 * np.pi * df_time['day_of_week'] / 7)
    
    return df_time

# Apply time features
df_features = create_time_features(df)

# Display new time features
time_cols = ['hour', 'day_of_week', 'is_weekend', 'is_night_shift', 'is_work_hours', 
             'hour_sin', 'hour_cos', 'dow_sin', 'dow_cos']
print("Time-based features created:")
print(df_features[time_cols].head())

Time-based features created:
   hour  day_of_week  is_weekend  is_night_shift  is_work_hours  hour_sin  \
0     6            4           0               1              0       1.0   
1     6            4           0               1              0       1.0   
2     6            4           0               1              0       1.0   
3     6            4           0               1              0       1.0   
4     6            4           0               1              0       1.0   

       hour_cos   dow_sin   dow_cos  
0  6.123234e-17 -0.433884 -0.900969  
1  6.123234e-17 -0.433884 -0.900969  
2  6.123234e-17 -0.433884 -0.900969  
3  6.123234e-17 -0.433884 -0.900969  
4  6.123234e-17 -0.433884 -0.900969  


## 2. Rolling Window Statistical Features

Create features that capture recent physiological trends and variability.

In [4]:
def create_rolling_features(df, window_minutes=10):
    """Create rolling statistical features for physiological signals."""
    
    # Assuming 1Hz sampling rate (adjust if different)
    window_size = window_minutes * 60
    
    physiological_cols = ['EDA', 'HR', 'TEMP']
    df_rolling = df.copy()
    
    for col in physiological_cols:
        # Rolling statistics
        df_rolling[f'{col}_mean_{window_minutes}min'] = df_rolling.groupby('id')[col].rolling(
            window=window_size, min_periods=1).mean().reset_index(0, drop=True)
        
        df_rolling[f'{col}_std_{window_minutes}min'] = df_rolling.groupby('id')[col].rolling(
            window=window_size, min_periods=1).std().reset_index(0, drop=True)
        
        df_rolling[f'{col}_min_{window_minutes}min'] = df_rolling.groupby('id')[col].rolling(
            window=window_size, min_periods=1).min().reset_index(0, drop=True)
        
        df_rolling[f'{col}_max_{window_minutes}min'] = df_rolling.groupby('id')[col].rolling(
            window=window_size, min_periods=1).max().reset_index(0, drop=True)
        
        # Rate of change features
        df_rolling[f'{col}_slope'] = df_rolling.groupby('id')[col].diff()
        
        df_rolling[f'{col}_slope_mean_{window_minutes}min'] = df_rolling.groupby('id')[f'{col}_slope'].rolling(
            window=window_size, min_periods=1).mean().reset_index(0, drop=True)
    
    return df_rolling

# Apply rolling features with 5-minute and 10-minute windows
print("Creating rolling features (5 minutes)...")
df_features = create_rolling_features(df_features, window_minutes=5)

print("Creating rolling features (10 minutes)...")
df_features = create_rolling_features(df_features, window_minutes=10)

# Show sample of new rolling features
rolling_cols = [col for col in df_features.columns if 'min' in col and ('mean' in col or 'std' in col)][:6]
print(f"\nSample rolling features:")
print(df_features[rolling_cols].head())

Creating rolling features (5 minutes)...
Creating rolling features (10 minutes)...


MemoryError: Unable to allocate 435. MiB for an array with shape (29, 1964182) and data type float64

## 3. Movement and Activity Features

Extract features from accelerometer data to capture physical activity patterns.

In [5]:
def create_movement_features(df, window_minutes=5):
    """Create movement and activity features from accelerometer data."""
    
    window_size = window_minutes * 60
    df_movement = df.copy()
    
    # Movement magnitude
    df_movement['movement_magnitude'] = np.sqrt(df_movement['X']**2 + df_movement['Y']**2 + df_movement['Z']**2)
    
    # Movement variability
    df_movement['movement_std'] = df_movement.groupby('id')['movement_magnitude'].rolling(
        window=window_size, min_periods=1).std().reset_index(0, drop=True)
    
    # Activity level (based on movement magnitude)
    df_movement['activity_level'] = pd.cut(df_movement['movement_magnitude'], 
                                         bins=[0, 1, 2, 5, np.inf], 
                                         labels=['sedentary', 'light', 'moderate', 'vigorous'])
    
    # Convert activity level to numeric
    activity_mapping = {'sedentary': 0, 'light': 1, 'moderate': 2, 'vigorous': 3}
    df_movement['activity_numeric'] = df_movement['activity_level'].map(activity_mapping)
    
    # Rolling activity features
    df_movement['activity_mean_5min'] = df_movement.groupby('id')['activity_numeric'].rolling(
        window=window_size, min_periods=1).mean().reset_index(0, drop=True)
    
    # Movement direction changes (acceleration changes)
    for axis in ['X', 'Y', 'Z']:
        df_movement[f'{axis}_acceleration'] = df_movement.groupby('id')[axis].diff()
    
    df_movement['total_acceleration'] = np.sqrt(
        df_movement['X_acceleration']**2 + 
        df_movement['Y_acceleration']**2 + 
        df_movement['Z_acceleration']**2
    )
    
    return df_movement

# Apply movement features
df_features = create_movement_features(df_features)

# Display movement features
movement_cols = ['movement_magnitude', 'movement_std', 'activity_numeric', 'activity_mean_5min', 'total_acceleration']
print("Movement features created:")
print(df_features[movement_cols].describe())

MemoryError: Unable to allocate 15.0 MiB for an array with shape (1, 1964182) and data type float64

## 4. Physiological Feature Engineering

Create advanced features specific to stress detection from physiological signals.

In [None]:
def create_physiological_features(df):
    """Create advanced physiological features for stress detection."""
    
    df_physio = df.copy()
    
    # Heart Rate Variability (simple approximation)
    df_physio['HR_variability'] = df_physio.groupby('id')['HR'].rolling(
        window=300, min_periods=1).std().reset_index(0, drop=True)
    
    # EDA features (stress-specific)
    # EDA peaks (simple peak detection)
    df_physio['EDA_diff'] = df_physio.groupby('id')['EDA'].diff()
    df_physio['EDA_peaks'] = ((df_physio['EDA_diff'] > 0) & 
                             (df_physio['EDA_diff'].shift(-1) < 0)).astype(int)
    
    # EDA response magnitude
    df_physio['EDA_response'] = df_physio.groupby('id')['EDA'].rolling(
        window=60, min_periods=1).max().reset_index(0, drop=True) - \
                               df_physio.groupby('id')['EDA'].rolling(
        window=60, min_periods=1).min().reset_index(0, drop=True)
    
    # Temperature stability
    df_physio['TEMP_stability'] = 1 / (1 + df_physio.groupby('id')['TEMP'].rolling(
        window=300, min_periods=1).std().reset_index(0, drop=True))
    
    # Physiological stress indicators
    # Elevated HR (above personal baseline)
    hr_baseline = df_physio.groupby('id')['HR'].transform('median')
    df_physio['HR_elevated'] = (df_physio['HR'] > hr_baseline * 1.1).astype(int)
    
    # EDA elevation (above personal baseline)
    eda_baseline = df_physio.groupby('id')['EDA'].transform('median')
    df_physio['EDA_elevated'] = (df_physio['EDA'] > eda_baseline * 1.2).astype(int)
    
    return df_physio

# Apply physiological features
df_features = create_physiological_features(df_features)

# Display physiological features
physio_cols = ['HR_variability', 'EDA_peaks', 'EDA_response', 'TEMP_stability', 'HR_elevated', 'EDA_elevated']
print("Physiological features created:")
print(df_features[physio_cols].describe())

## 5. Feature Summary and Correlation Analysis

Analyze the created features and their relationships with stress labels.

In [None]:
# Get all feature columns (exclude original raw signals and metadata)
exclude_cols = ['datetime', 'id', 'label', 'X', 'Y', 'Z', 'EDA', 'HR', 'TEMP', 'activity_level']
feature_cols = [col for col in df_features.columns if col not in exclude_cols]

print(f"Total features created: {len(feature_cols)}")
print("\nFeature categories:")

# Categorize features
time_features = [col for col in feature_cols if any(x in col for x in ['hour', 'day', 'weekend', 'night', 'work'])]
rolling_features = [col for col in feature_cols if 'min' in col and any(x in col for x in ['mean', 'std', 'max', 'min'])]
movement_features = [col for col in feature_cols if any(x in col for x in ['movement', 'activity', 'acceleration'])]
physio_features = [col for col in feature_cols if any(x in col for x in ['variability', 'peaks', 'response', 'stability', 'elevated'])]
slope_features = [col for col in feature_cols if 'slope' in col]

print(f"Time features: {len(time_features)}")
print(f"Rolling statistical features: {len(rolling_features)}")
print(f"Movement features: {len(movement_features)}")
print(f"Physiological features: {len(physio_features)}")
print(f"Slope features: {len(slope_features)}")

In [None]:
# Correlation analysis with stress labels
feature_correlations = df_features[feature_cols + ['label']].corr()['label'].abs().sort_values(ascending=False)

print("Top 15 features correlated with stress:")
print(feature_correlations.head(16)[1:])  # Exclude self-correlation

# Visualize top correlations
plt.figure(figsize=(12, 8))
top_features = feature_correlations.head(16)[1:15]  # Top 15 excluding self
plt.barh(range(len(top_features)), top_features.values)
plt.yticks(range(len(top_features)), top_features.index)
plt.xlabel('Absolute Correlation with Stress Label')
plt.title('Top 15 Features Correlated with Stress')
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## 6. Handle Missing Values and Data Quality

Clean the engineered features and prepare for modeling.

In [None]:
# Check for missing values in engineered features
missing_summary = df_features[feature_cols].isnull().sum()
missing_features = missing_summary[missing_summary > 0]

if len(missing_features) > 0:
    print("Features with missing values:")
    print(missing_features)
    
    # Forward fill missing values (appropriate for time series)
    for col in missing_features.index:
        df_features[col] = df_features.groupby('id')[col].fillna(method='ffill')
        df_features[col] = df_features.groupby('id')[col].fillna(method='bfill')
else:
    print("No missing values in engineered features!")

# Check for infinite values
inf_summary = np.isinf(df_features[feature_cols]).sum()
inf_features = inf_summary[inf_summary > 0]

if len(inf_features) > 0:
    print(f"\nFeatures with infinite values:")
    print(inf_features)
    
    # Replace infinite values
    df_features = df_features.replace([np.inf, -np.inf], np.nan)
    # Fill with median values
    for col in inf_features.index:
        median_val = df_features[col].median()
        df_features[col] = df_features[col].fillna(median_val)
else:
    print("No infinite values found!")

print(f"\nFinal dataset shape: {df_features.shape}")
print(f"Features ready for modeling: {len(feature_cols)}")

## 7. Feature Visualization for Stress vs Normal Periods

Compare feature distributions between stress and normal periods.

In [None]:
# Select top features for visualization
top_viz_features = feature_correlations.head(7)[1:7].index.tolist()  # Top 6 features

# Create comparison plots
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

for i, feature in enumerate(top_viz_features):
    stress_data = df_features[df_features['label'] != 0][feature].dropna()
    normal_data = df_features[df_features['label'] == 0][feature].dropna()
    
    # Sample if too large for visualization
    if len(normal_data) > 5000:
        normal_data = normal_data.sample(5000)
    
    axes[i].hist(normal_data, bins=50, alpha=0.7, label='Normal', density=True, color='green')
    if len(stress_data) > 0:
        axes[i].hist(stress_data, bins=30, alpha=0.7, label='Stress', density=True, color='red')
    
    axes[i].set_title(f'{feature}')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Density')
    axes[i].legend()
    axes[i].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Save Engineered Features

Save the processed dataset with all engineered features for machine learning.

In [None]:
# Select final feature set for modeling
final_features = feature_cols + ['id', 'datetime', 'label']
df_final = df_features[final_features].copy()

# Save the engineered dataset
output_path = r'C:\Users\Michi\nurse-stress-wearables\data\engineered_features.csv'
df_final.to_csv(output_path, index=False)

print(f"Engineered features saved to: {output_path}")
print(f"Final dataset info:")
print(f"- Shape: {df_final.shape}")
print(f"- Features: {len(feature_cols)}")
print(f"- Stress events: {(df_final['label'] != 0).sum()}")
print(f"- Date range: {df_final['datetime'].min()} to {df_final['datetime'].max()}")

# Display feature summary
print(f"\nFeature summary by category:")
print(f"Time features ({len(time_features)}): {time_features[:3]}...")
print(f"Rolling features ({len(rolling_features)}): {rolling_features[:3]}...")
print(f"Movement features ({len(movement_features)}): {movement_features[:3]}...")
print(f"Physiological features ({len(physio_features)}): {physio_features}")

print("\n=== FEATURE ENGINEERING COMPLETE ===")