# Create Filtered Dataset with Top Important Features - DREAMER

This notebook creates filtered datasets containing only the most important features based on the DREAMER feature importance analysis.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler

# ============================================================================
# CONFIGURATION
# ============================================================================
# How many of the highest-ranked features each filtered dataset keeps.
TOP_N_VALENCE = 30   # valence-only ranking
TOP_N_AROUSAL = 30   # arousal-only ranking
TOP_N_COMBINED = 40  # ranking by the mean of the valence and arousal scores
# Ratings strictly above this value (1-5 scale) are labelled 1, all others 0.
BINARIZE_THRESHOLD = 3.0
# ============================================================================

# Echo the active settings so the rendered notebook records what was used.
print("\n".join([
    "Configuration:",
    f"  - Top {TOP_N_VALENCE} features for valence",
    f"  - Top {TOP_N_AROUSAL} features for arousal",
    f"  - Top {TOP_N_COMBINED} features for combined (valence + arousal)",
    f"  - Binarization threshold: {BINARIZE_THRESHOLD}",
]))

## 1. Load Feature Importance Results

In [None]:
# Read the per-target feature rankings produced by the importance analysis.
# Each table is expected to carry a 'Feature' column and an averaged score column.
importance_valence = pd.read_csv('dreamer_feature_importance_valence.csv')
importance_arousal = pd.read_csv('dreamer_feature_importance_arousal.csv')

print(f"Loaded feature importance for {importance_valence.shape[0]} features")

# Preview the first ten rows of each ranking (the files are assumed to be
# stored best-first -- TODO confirm against the importance notebook).
print("\nTop 10 features for VALENCE:")
print(importance_valence.loc[:, ['Feature', 'Valence_AvgScore']].iloc[:10])

print("\nTop 10 features for AROUSAL:")
print(importance_arousal.loc[:, ['Feature', 'Arousal_AvgScore']].iloc[:10])

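## 2. Load Original Dataset

The cell below expects the merged DREAMER feature table to already be available as `merged_df` (feature columns plus the `arousal` and `valence` targets and the metadata columns). It separates features from targets, binarizes the targets, and cleans the feature matrix.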

In [None]:
from sklearn.impute import SimpleImputer

# This cell consumes `merged_df` but nothing visible in the notebook creates it.
# Fail fast with an actionable message instead of a bare NameError so that a
# fresh "Restart & Run All" points straight at the missing loading step.
try:
    merged_df
except NameError as exc:
    raise NameError(
        "merged_df is not defined: load the merged DREAMER feature table into "
        "`merged_df` before running this cell."
    ) from exc

# Separate features and targets
metadata_cols = ['Unnamed: 0', 'patient_index', 'video_index']
target_cols = ['arousal', 'valence']

# Every column that is neither metadata nor a target is treated as a feature
non_feature_cols = set(metadata_cols + target_cols)
feature_cols = [col for col in merged_df.columns if col not in non_feature_cols]

# Coerce to float so integer- or object-typed columns cannot break the
# np.isfinite / np.isinf checks below (object arrays raise TypeError there).
X = merged_df[feature_cols].astype(float).to_numpy()
feature_names = feature_cols

# Continuous ratings
y_valence_cont = merged_df['valence'].values
y_arousal_cont = merged_df['arousal'].values

# Binarize with a strict '>' comparison: a rating equal to the threshold maps to 0
y_valence = (y_valence_cont > BINARIZE_THRESHOLD).astype(int)
y_arousal = (y_arousal_cont > BINARIZE_THRESHOLD).astype(int)

print(f"\nFeature matrix shape: {X.shape}")
print(f"Total features: {len(feature_names)}")

# Clean data (remove features with >50% invalid, impute rest)
print("\n=== Data Cleaning ===")
# A value is invalid when it is NaN or +/-inf, i.e. not finite
invalid_ratio = (~np.isfinite(X)).sum(axis=0) / X.shape[0]

features_to_keep = invalid_ratio <= 0.5
print(f"Removing {(~features_to_keep).sum()} features with >50% invalid values")

X = X[:, features_to_keep]
feature_names = [name for name, keep in zip(feature_names, features_to_keep) if keep]

# Replace inf with NaN so the imputer treats both kinds of invalid value alike,
# then fill each column with its median
X = np.where(np.isinf(X), np.nan, X)
imputer = SimpleImputer(strategy='median')
X = imputer.fit_transform(X)

# Later cells index X by position in feature_names, so the two must stay aligned
assert X.shape[1] == len(feature_names), "feature_names is out of sync with the columns of X"

print(f"Clean feature matrix shape: {X.shape}")
print(f"Clean features: {len(feature_names)}")

## 3. Create Filtered Datasets

In [None]:
# First TOP_N_VALENCE rows of the valence ranking (assumes the table is
# already ordered best-first -- TODO confirm against the importance notebook)
valence_top = importance_valence.head(TOP_N_VALENCE)
top_features_valence = valence_top['Feature'].tolist()

# Column position of every cleaned feature in X. setdefault keeps the first
# occurrence of a name, which is what list.index would have returned.
feature_positions = {}
for position, name in enumerate(feature_names):
    feature_positions.setdefault(name, position)

# A ranked feature can be absent from X (e.g. dropped by the >50%-invalid
# filter). Report every missing name at once instead of failing on the first
# with an opaque "x is not in list" error.
missing_valence = [name for name in top_features_valence if name not in feature_positions]
if missing_valence:
    raise ValueError(
        f"{len(missing_valence)} top valence feature(s) are not in the cleaned feature "
        f"matrix (dropped during cleaning or absent from the dataset): {missing_valence}"
    )

# Get indices of these features
valence_feature_indices = [feature_positions[name] for name in top_features_valence]

# Create filtered dataset (columns follow the ranking order)
X_valence = X[:, valence_feature_indices]

print(f"\n=== VALENCE FILTERED DATASET ===")
print(f"Shape: {X_valence.shape}")
print(f"Selected features ({len(top_features_valence)}):")
# Scores come from the same rows as the names, so no per-feature table scan is needed
valence_scores = valence_top['Valence_AvgScore'].tolist()
for i, (feat, score) in enumerate(zip(top_features_valence, valence_scores), 1):
    print(f"  {i:2d}. {feat:50s} (score: {score:.4f})")

In [None]:
# First TOP_N_AROUSAL rows of the arousal ranking (assumes the table is
# already ordered best-first -- TODO confirm against the importance notebook)
arousal_top = importance_arousal.head(TOP_N_AROUSAL)
top_features_arousal = arousal_top['Feature'].tolist()

# Column position of every cleaned feature in X. setdefault keeps the first
# occurrence of a name, which is what list.index would have returned.
feature_positions = {}
for position, name in enumerate(feature_names):
    feature_positions.setdefault(name, position)

# A ranked feature can be absent from X (e.g. dropped by the >50%-invalid
# filter). Report every missing name at once instead of failing on the first
# with an opaque "x is not in list" error.
missing_arousal = [name for name in top_features_arousal if name not in feature_positions]
if missing_arousal:
    raise ValueError(
        f"{len(missing_arousal)} top arousal feature(s) are not in the cleaned feature "
        f"matrix (dropped during cleaning or absent from the dataset): {missing_arousal}"
    )

# Get indices of these features
arousal_feature_indices = [feature_positions[name] for name in top_features_arousal]

# Create filtered dataset (columns follow the ranking order)
X_arousal = X[:, arousal_feature_indices]

print(f"\n=== AROUSAL FILTERED DATASET ===")
print(f"Shape: {X_arousal.shape}")
print(f"Selected features ({len(top_features_arousal)}):")
# Scores come from the same rows as the names, so no per-feature table scan is needed
arousal_scores = arousal_top['Arousal_AvgScore'].tolist()
for i, (feat, score) in enumerate(zip(top_features_arousal, arousal_scores), 1):
    print(f"  {i:2d}. {feat:50s} (score: {score:.4f})")

In [None]:
# Create combined ranking: inner-join the two tables on 'Feature' (features
# present in only one table are dropped), average the two scores, and sort
# best-first.
all_importance = (
    importance_valence[['Feature', 'Valence_AvgScore']]
    .merge(importance_arousal[['Feature', 'Arousal_AvgScore']], on='Feature')
    .assign(Combined_Score=lambda t: (t['Valence_AvgScore'] + t['Arousal_AvgScore']) / 2)
    .sort_values('Combined_Score', ascending=False)
)

combined_top = all_importance.head(TOP_N_COMBINED)
top_features_combined = combined_top['Feature'].tolist()

# Column position of every cleaned feature in X. setdefault keeps the first
# occurrence of a name, which is what list.index would have returned.
feature_positions = {}
for position, name in enumerate(feature_names):
    feature_positions.setdefault(name, position)

# A ranked feature can be absent from X (e.g. dropped by the >50%-invalid
# filter). Report every missing name at once instead of failing on the first
# with an opaque "x is not in list" error.
missing_combined = [name for name in top_features_combined if name not in feature_positions]
if missing_combined:
    raise ValueError(
        f"{len(missing_combined)} top combined feature(s) are not in the cleaned feature "
        f"matrix (dropped during cleaning or absent from the dataset): {missing_combined}"
    )

combined_feature_indices = [feature_positions[name] for name in top_features_combined]

# Create filtered dataset (columns follow the combined ranking order)
X_combined = X[:, combined_feature_indices]

print(f"\n=== COMBINED (VALENCE + AROUSAL) FILTERED DATASET ===")
print(f"Shape: {X_combined.shape}")
print(f"Selected features ({len(top_features_combined)}):")
# Scores come from the same rows as the names, so no per-feature table scan is needed
combined_scores = combined_top['Combined_Score'].tolist()
for i, (feat, score) in enumerate(zip(top_features_combined, combined_scores), 1):
    print(f"  {i:2d}. {feat:50s} (combined score: {score:.4f})")

## 4. Save Filtered Datasets

In [None]:
# Destination folder for every artifact written by this notebook
output_dir = Path('../datasets/Dreamer/filtered_features')
output_dir.mkdir(exist_ok=True, parents=True)

# (file name, array) pairs written as NumPy .npy files, in this order
array_outputs = [
    (f'X_valence_top{TOP_N_VALENCE}.npy', X_valence),
    (f'X_arousal_top{TOP_N_AROUSAL}.npy', X_arousal),
    (f'X_combined_top{TOP_N_COMBINED}.npy', X_combined),
    ('y_valence_binary.npy', y_valence),
    ('y_arousal_binary.npy', y_arousal),
    ('y_valence_continuous.npy', y_valence_cont),
    ('y_arousal_continuous.npy', y_arousal_cont),
]
for array_file, array in array_outputs:
    np.save(output_dir / array_file, array)

# (file name, feature names) pairs written as text, one feature name per line
feature_list_outputs = [
    (f'features_valence_top{TOP_N_VALENCE}.txt', top_features_valence),
    (f'features_arousal_top{TOP_N_AROUSAL}.txt', top_features_arousal),
    (f'features_combined_top{TOP_N_COMBINED}.txt', top_features_combined),
]
for list_file, selected_names in feature_list_outputs:
    with open(output_dir / list_file, 'w') as handle:
        handle.write('\n'.join(selected_names))

# Report what was written
print("\n=== Files Saved ===")
print(f"Directory: {output_dir}")
print("\nData files:")
for array_file, _ in array_outputs:
    print(f"  - {array_file}")
print("\nFeature lists:")
for list_file, _ in feature_list_outputs:
    print(f"  - {list_file}")

## 5. Save as Pandas DataFrames (CSV)

In [None]:
# The same four target columns are appended, in this order, to every feature table
target_columns = {
    'valence_binary': y_valence,
    'valence_continuous': y_valence_cont,
    'arousal_binary': y_arousal,
    'arousal_continuous': y_arousal_cont,
}

# One labelled DataFrame per filtered feature matrix
df_valence = pd.DataFrame(X_valence, columns=top_features_valence).assign(**target_columns)
df_arousal = pd.DataFrame(X_arousal, columns=top_features_arousal).assign(**target_columns)
df_combined = pd.DataFrame(X_combined, columns=top_features_combined).assign(**target_columns)

# (file name, frame) pairs written as CSV without the row index
csv_outputs = [
    (f'dataset_valence_top{TOP_N_VALENCE}.csv', df_valence),
    (f'dataset_arousal_top{TOP_N_AROUSAL}.csv', df_arousal),
    (f'dataset_combined_top{TOP_N_COMBINED}.csv', df_combined),
]
for csv_file, frame in csv_outputs:
    frame.to_csv(output_dir / csv_file, index=False)

print("\nCSV files saved:")
for csv_file, _ in csv_outputs:
    print(f"  - {csv_file}")

print("\nDataFrame shapes:")
for label, frame in (("Valence", df_valence), ("Arousal", df_arousal), ("Combined", df_combined)):
    print(f"  - {label}: {frame.shape}")

## 6. Create Metadata File

In [None]:
# Create a metadata file (README.md) documenting the filtered datasets.
# Shapes and feature counts are read from the arrays that were actually built,
# not from the TOP_N_* settings: a ranking table with fewer rows than requested
# would otherwise make the README disagree with the saved files.
generated_at = pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')

metadata = f"""# DREAMER Filtered Feature Datasets - Metadata

Generated from Feature Importance Analysis
Date: {generated_at}

## Dataset Information

Original dataset: DREAMER (Database for Emotion Recognition using EEG, ECG and Multimodal Signals)
- Total samples: {X.shape[0]}
- Original features: {len(feature_cols)} (combined from features_table.csv and features_table_imf.csv)
- Features after cleaning: {X.shape[1]} (features with >50% invalid values removed)
- EEG channels: 14 (AF3, F7, F3, FC5, T7, P7, O1, O2, P8, T8, FC6, F4, F8, AF4)
- Frequency bands: 6 (delta, theta, alpha, betaL, betaH, gamma)
- Feature types: bandpower, activity, mobility, complexity, entropy, wenergy, IMF energy, IMF entropy

## Filtered Datasets

### 1. Valence-Optimized Dataset
- File: dataset_valence_top{TOP_N_VALENCE}.csv / X_valence_top{TOP_N_VALENCE}.npy
- Features: {X_valence.shape[1]}
- Shape: ({X_valence.shape[0]}, {X_valence.shape[1]})
- Selection: Top {TOP_N_VALENCE} features ranked by aggregated importance for valence prediction

### 2. Arousal-Optimized Dataset
- File: dataset_arousal_top{TOP_N_AROUSAL}.csv / X_arousal_top{TOP_N_AROUSAL}.npy
- Features: {X_arousal.shape[1]}
- Shape: ({X_arousal.shape[0]}, {X_arousal.shape[1]})
- Selection: Top {TOP_N_AROUSAL} features ranked by aggregated importance for arousal prediction

### 3. Combined Dataset (Valence + Arousal)
- File: dataset_combined_top{TOP_N_COMBINED}.csv / X_combined_top{TOP_N_COMBINED}.npy
- Features: {X_combined.shape[1]}
- Shape: ({X_combined.shape[0]}, {X_combined.shape[1]})
- Selection: Top {TOP_N_COMBINED} features ranked by average importance for both valence and arousal

## Target Variables

All datasets include 4 target variables:
1. **valence_binary**: Binary valence labels (0/1, threshold {BINARIZE_THRESHOLD})
2. **valence_continuous**: Continuous valence ratings (1-5 scale)
3. **arousal_binary**: Binary arousal labels (0/1, threshold {BINARIZE_THRESHOLD})
4. **arousal_continuous**: Continuous arousal ratings (1-5 scale)

## Feature Importance Methodology

Features were ranked using 5 different methods:
1. Pearson/Spearman correlation
2. ANOVA F-test & Mutual Information
3. Random Forest feature importance
4. Permutation importance
5. Linear SVM coefficients

All scores were normalized and averaged to create final rankings.

## Usage Example

```python
import numpy as np
import pandas as pd

# Option 1: Load NumPy arrays
X = np.load('filtered_features/X_valence_top{TOP_N_VALENCE}.npy')
y = np.load('filtered_features/y_valence_binary.npy')

# Option 2: Load CSV with feature names
df = pd.read_csv('filtered_features/dataset_valence_top{TOP_N_VALENCE}.csv')
X = df.drop(['valence_binary', 'valence_continuous', 'arousal_binary', 'arousal_continuous'], axis=1)
y = df['valence_binary']
```

## Files

### NumPy Arrays (.npy)
- X_valence_top{TOP_N_VALENCE}.npy
- X_arousal_top{TOP_N_AROUSAL}.npy
- X_combined_top{TOP_N_COMBINED}.npy
- y_valence_binary.npy
- y_arousal_binary.npy
- y_valence_continuous.npy
- y_arousal_continuous.npy

### CSV Files
- dataset_valence_top{TOP_N_VALENCE}.csv
- dataset_arousal_top{TOP_N_AROUSAL}.csv
- dataset_combined_top{TOP_N_COMBINED}.csv

### Feature Lists (.txt)
- features_valence_top{TOP_N_VALENCE}.txt
- features_arousal_top{TOP_N_AROUSAL}.txt
- features_combined_top{TOP_N_COMBINED}.txt
"""

# Explicit encoding keeps the README identical regardless of the platform default
with open(output_dir / 'README.md', 'w', encoding='utf-8') as f:
    f.write(metadata)

print("\nMetadata file created: README.md")

## 7. Summary and Statistics

In [None]:
# Final report: dataset dimensions, feature overlap, class balance, output files.
print("\n" + "="*80)
print(" "*25 + "FILTERED DATASET SUMMARY")
print("="*80)

print("\n### Dataset Dimensions ###")
# Report the dimensions of the arrays that were actually built; they can be
# narrower than the TOP_N_* settings when a ranking table has fewer rows.
n_clean_features = X.shape[1]
print(f"Original: ({X.shape[0]}, {n_clean_features})")
print(f"Valence-optimized: ({X_valence.shape[0]}, {X_valence.shape[1]}) - {(X_valence.shape[1]/n_clean_features)*100:.1f}% of features")
print(f"Arousal-optimized: ({X_arousal.shape[0]}, {X_arousal.shape[1]}) - {(X_arousal.shape[1]/n_clean_features)*100:.1f}% of features")
print(f"Combined: ({X_combined.shape[0]}, {X_combined.shape[1]}) - {(X_combined.shape[1]/n_clean_features)*100:.1f}% of features")

print("\n### Feature Overlap ###")
overlap = set(top_features_valence) & set(top_features_arousal)
print(f"Features in both valence and arousal top-{TOP_N_VALENCE}: {len(overlap)}")
print(f"Overlap percentage: {(len(overlap)/TOP_N_VALENCE)*100:.1f}%")

# Only list the overlapping names when the list is short enough to read
if len(overlap) > 0 and len(overlap) <= 20:
    print(f"\nOverlapping features:")
    for feat in sorted(overlap):
        print(f"  - {feat}")

print("\n### Target Distribution ###")
# minlength=2 guarantees both class slots exist, so indexing [1] cannot raise
# IndexError when every sample falls into class 0.
valence_counts = np.bincount(y_valence, minlength=2)
arousal_counts = np.bincount(y_arousal, minlength=2)
print(f"Valence: {valence_counts} (0: {valence_counts[0]}, 1: {valence_counts[1]})")
print(f"Arousal: {arousal_counts} (0: {arousal_counts[0]}, 1: {arousal_counts[1]})")

print("\n### Files Created ###")
print(f"Location: {output_dir}")
print(f"Total files: {len(list(output_dir.glob('*')))}")

print("\n" + "="*80)
# \u2713 is the check mark that had been garbled into mojibake in the original message
print("\n\u2713 Filtered datasets created successfully!")
print("\nYou can now use these datasets for training more efficient models.")
print("The reduced feature sets should improve training speed and may reduce overfitting.")