# Create Filtered Dataset with Top Important Features

This notebook creates filtered datasets containing only the most important features based on the feature importance analysis.

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
import pickle

# ============================================================================
# CONFIGURATION: how many top-ranked features each filtered dataset keeps
# ============================================================================
TOP_N_VALENCE = 30   # features kept for the valence-only dataset
TOP_N_AROUSAL = 30   # features kept for the arousal-only dataset
TOP_N_COMBINED = 40  # features kept when ranking on valence and arousal jointly
# ============================================================================

# Echo the active settings so the rendered notebook records what was used.
print(
    "Configuration:",
    f"  - Top {TOP_N_VALENCE} features for valence",
    f"  - Top {TOP_N_AROUSAL} features for arousal",
    f"  - Top {TOP_N_COMBINED} features for combined (valence + arousal)",
    sep="\n",
)

Configuration:
  - Top 30 features for valence
  - Top 30 features for arousal
  - Top 40 features for combined (valence + arousal)


## 1. Load Feature Importance Results

In [3]:
# Read the two per-target feature rankings. Both CSVs are looked up relative
# to the notebook's working directory.
importance_valence = pd.read_csv('feature_importance_valence.csv')
importance_arousal = pd.read_csv('feature_importance_arousal.csv')

print(f"Loaded feature importance for {len(importance_valence)} features")

# Preview the first ten rows of each ranking (name + averaged score).
for _target, _ranking, _score_col in (
    ("VALENCE", importance_valence, 'Valence_AvgScore'),
    ("AROUSAL", importance_arousal, 'Arousal_AvgScore'),
):
    print(f"\nTop 10 features for {_target}:")
    print(_ranking[['Feature', _score_col]].head(10))

Loaded feature importance for 96 features

Top 10 features for VALENCE:
     Feature  Valence_AvgScore
0   AF4_beta          0.620503
1  AF4_alpha          0.593262
2  AF4_gamma          0.569039
3    C3_beta          0.543337
4   C3_alpha          0.538552
5    O2_beta          0.476611
6   T7_gamma          0.472280
7   Fp1_beta          0.453665
8   CP1_beta          0.452815
9  CP1_alpha          0.452565

Top 10 features for AROUSAL:
     Feature  Arousal_AvgScore
0    F7_beta          0.621710
1    T7_beta          0.603381
2   F7_gamma          0.581367
3   P4_alpha          0.552088
4   P4_gamma          0.526675
5   T7_alpha          0.516825
6  PO3_gamma          0.510727
7    P4_beta          0.485583
8   T7_gamma          0.461914
9   F3_gamma          0.449467


## 2. Load Original Dataset

In [4]:
# Locations of the DEAP inputs, relative to the notebook's working directory
base_path = Path('../datasets/DEAP/deap-dataset')
features_path = base_path / 'extracted_features'
labels_path = base_path / 'Metadata/participant_ratings.xls'

# Channel names (32 EEG channels). The order of this list fixes the column
# order of the feature matrix assembled from the per-channel CSV files.
channels = [
    'Fp1', 'AF3', 'F3', 'F7', 'FC5', 'FC1', 'C3', 'T7', 'CP5', 'CP1',
    'P3', 'P7', 'PO3', 'O1', 'Oz', 'Pz', 'Fp2', 'AF4', 'Fz', 'F4', 'F8',
    'FC6', 'FC2', 'Cz', 'C4', 'T8', 'CP6', 'CP2', 'P4', 'P8', 'PO4', 'O2'
]

# Frequency bands (one feature per channel and band)
bands = ['alpha', 'beta', 'gamma']

# Ratings strictly greater than this value become class 1, all others class 0.
# Kept in one place so valence and arousal can never use different cut-offs.
BINARY_THRESHOLD = 4.5

# Load labels
labels_df = pd.read_excel(labels_path)

# NOTE(review): label rows are taken in file order and later paired with the
# feature rows purely by position. The DEAP documentation describes
# participant_ratings as being in presentation (Trial) order, while the
# preprocessed recordings are ordered by Experiment_id. If the band-power CSVs
# were extracted from the preprocessed files, the labels should first be sorted
# by ['Participant_id', 'Experiment_id']. TODO confirm against the
# feature-extraction step before trusting any model trained on these labels.

# Binarize valence and arousal
y_valence = (labels_df['Valence'] > BINARY_THRESHOLD).astype(int).values
y_arousal = (labels_df['Arousal'] > BINARY_THRESHOLD).astype(int).values

# Also keep continuous values
y_valence_cont = labels_df['Valence'].values
y_arousal_cont = labels_df['Arousal'].values

print(f"Labels loaded: {labels_df.shape}")

Labels loaded: (1280, 9)


In [5]:
# Load and concatenate band-power features from all channels.
# Rows: every subject's trials stacked in subject order.
# Columns: channel-major, then band (Fp1_alpha, Fp1_beta, Fp1_gamma, AF3_alpha, ...).
N_SUBJECTS = 32  # files s01_bandpower.csv .. s32_bandpower.csv under each channel folder

# Derive the CSV column names from `bands` so the columns that are read and the
# feature names that are generated can never drift apart.
band_columns = [f'{band}_power' for band in bands]

X_list = []
feature_names = []

for channel in channels:
    channel_data = []

    for subject in range(1, N_SUBJECTS + 1):
        file_path = features_path / channel / f's{subject:02d}_bandpower.csv'
        df = pd.read_csv(file_path)
        channel_data.append(df[band_columns].values)

    # Stack all subjects' trials for this channel
    channel_features = np.vstack(channel_data)
    X_list.append(channel_features)

    # One feature name per band, in the same order as the columns read above
    feature_names.extend(f'{channel}_{band}' for band in bands)

# Concatenate all channels horizontally (samples × features)
X = np.hstack(X_list)

# Features and labels are paired by row position later on, so a row-count
# mismatch must stop the notebook here rather than surface as a confusing
# error (or silent misalignment) in a later cell.
if X.shape[0] != len(y_valence):
    raise ValueError(
        f"Feature rows ({X.shape[0]}) do not match label rows ({len(y_valence)})"
    )

print(f"Original feature matrix shape: {X.shape}")
print(f"Total features: {len(feature_names)}")

Original feature matrix shape: (1280, 96)
Total features: 96


## 3. Create Filtered Datasets

In [6]:
# Names in the first TOP_N_VALENCE rows of the valence ranking
# (the CSV is assumed to be sorted best-first; TODO confirm)
top_features_valence = importance_valence['Feature'].iloc[:TOP_N_VALENCE].tolist()

# Column position of each selected name inside the full matrix X
valence_feature_indices = list(map(feature_names.index, top_features_valence))

# Keep only those columns, in ranking order
X_valence = X[:, valence_feature_indices]

print("\n=== VALENCE FILTERED DATASET ===")
print(f"Shape: {X_valence.shape}")
print(f"Selected features ({len(top_features_valence)}):")
for rank, name in enumerate(top_features_valence, start=1):
    # First ranking row carrying this feature name supplies the score shown
    is_match = importance_valence['Feature'] == name
    avg_score = importance_valence.loc[is_match, 'Valence_AvgScore'].iloc[0]
    print(f"  {rank:2d}. {name:15s} (score: {avg_score:.4f})")


=== VALENCE FILTERED DATASET ===
Shape: (1280, 30)
Selected features (30):
   1. AF4_beta        (score: 0.6205)
   2. AF4_alpha       (score: 0.5933)
   3. AF4_gamma       (score: 0.5690)
   4. C3_beta         (score: 0.5433)
   5. C3_alpha        (score: 0.5386)
   6. O2_beta         (score: 0.4766)
   7. T7_gamma        (score: 0.4723)
   8. Fp1_beta        (score: 0.4537)
   9. CP1_beta        (score: 0.4528)
  10. CP1_alpha       (score: 0.4526)
  11. FC6_gamma       (score: 0.4493)
  12. F3_alpha        (score: 0.4349)
  13. FC6_alpha       (score: 0.4314)
  14. Fp1_gamma       (score: 0.4221)
  15. Fp1_alpha       (score: 0.4202)
  16. F4_beta         (score: 0.4097)
  17. FC1_gamma       (score: 0.3993)
  18. P8_gamma        (score: 0.3937)
  19. Fp2_gamma       (score: 0.3924)
  20. FC6_beta        (score: 0.3834)
  21. F4_gamma        (score: 0.3808)
  22. FC2_gamma       (score: 0.3731)
  23. P4_gamma        (score: 0.3667)
  24. C3_gamma        (score: 0.3627)
  25. Fp2_al

In [7]:
# Names in the first TOP_N_AROUSAL rows of the arousal ranking
# (the CSV is assumed to be sorted best-first; TODO confirm)
top_features_arousal = importance_arousal['Feature'].iloc[:TOP_N_AROUSAL].tolist()

# Column position of each selected name inside the full matrix X
arousal_feature_indices = list(map(feature_names.index, top_features_arousal))

# Keep only those columns, in ranking order
X_arousal = X[:, arousal_feature_indices]

print("\n=== AROUSAL FILTERED DATASET ===")
print(f"Shape: {X_arousal.shape}")
print(f"Selected features ({len(top_features_arousal)}):")
for rank, name in enumerate(top_features_arousal, start=1):
    # First ranking row carrying this feature name supplies the score shown
    is_match = importance_arousal['Feature'] == name
    avg_score = importance_arousal.loc[is_match, 'Arousal_AvgScore'].iloc[0]
    print(f"  {rank:2d}. {name:15s} (score: {avg_score:.4f})")


=== AROUSAL FILTERED DATASET ===
Shape: (1280, 30)
Selected features (30):
   1. F7_beta         (score: 0.6217)
   2. T7_beta         (score: 0.6034)
   3. F7_gamma        (score: 0.5814)
   4. P4_alpha        (score: 0.5521)
   5. P4_gamma        (score: 0.5267)
   6. T7_alpha        (score: 0.5168)
   7. PO3_gamma       (score: 0.5107)
   8. P4_beta         (score: 0.4856)
   9. T7_gamma        (score: 0.4619)
  10. F3_gamma        (score: 0.4495)
  11. F7_alpha        (score: 0.4373)
  12. AF4_beta        (score: 0.4300)
  13. PO4_alpha       (score: 0.4226)
  14. Fp1_gamma       (score: 0.4176)
  15. C4_gamma        (score: 0.4061)
  16. FC6_alpha       (score: 0.3821)
  17. F3_beta         (score: 0.3812)
  18. C4_alpha        (score: 0.3758)
  19. Fp1_beta        (score: 0.3756)
  20. FC2_gamma       (score: 0.3730)
  21. FC5_beta        (score: 0.3723)
  22. CP1_alpha       (score: 0.3716)
  23. FC1_gamma       (score: 0.3666)
  24. CP6_beta        (score: 0.3656)
  25. FC2_al

In [8]:
# Create the combined dataset: the TOP_N_COMBINED features with the highest
# average of their valence and arousal scores.

# Deduplicated union of the two per-target selections, kept for reference only;
# the combined dataset below is NOT built from it. dict.fromkeys preserves
# first-seen order, so the list is identical on every run (a set is not).
combined_features = list(dict.fromkeys(top_features_valence + top_features_arousal))

# Join the two rankings on feature name. The default inner merge keeps only
# features that appear in both rankings.
all_importance = importance_valence[['Feature', 'Valence_AvgScore']].merge(
    importance_arousal[['Feature', 'Arousal_AvgScore']], on='Feature'
)
all_importance['Combined_Score'] = (all_importance['Valence_AvgScore'] + all_importance['Arousal_AvgScore']) / 2
all_importance = all_importance.sort_values('Combined_Score', ascending=False)

# Rows of the selected features, already in descending combined-score order
top_combined = all_importance.head(TOP_N_COMBINED)

top_features_combined = top_combined['Feature'].tolist()
combined_feature_indices = [feature_names.index(f) for f in top_features_combined]

X_combined = X[:, combined_feature_indices]

print(f"\n=== COMBINED (VALENCE + AROUSAL) FILTERED DATASET ===")
print(f"Shape: {X_combined.shape}")
print(f"Selected features ({len(top_features_combined)}):")
# Walk the selected rows directly instead of re-filtering the frame per feature
for i, (feat, score) in enumerate(zip(top_combined['Feature'], top_combined['Combined_Score']), 1):
    print(f"  {i:2d}. {feat:15s} (combined score: {score:.4f})")


=== COMBINED (VALENCE + AROUSAL) FILTERED DATASET ===
Shape: (1280, 40)
Selected features (40):
   1. AF4_beta        (combined score: 0.5253)
   2. T7_gamma        (combined score: 0.4671)
   3. AF4_alpha       (combined score: 0.4542)
   4. P4_gamma        (combined score: 0.4467)
   5. C3_alpha        (combined score: 0.4367)
   6. P4_alpha        (combined score: 0.4305)
   7. AF4_gamma       (combined score: 0.4207)
   8. T7_beta         (combined score: 0.4207)
   9. Fp1_gamma       (combined score: 0.4198)
  10. Fp1_beta        (combined score: 0.4146)
  11. C3_beta         (combined score: 0.4132)
  12. CP1_alpha       (combined score: 0.4121)
  13. FC6_alpha       (combined score: 0.4067)
  14. F7_beta         (combined score: 0.3943)
  15. P4_beta         (combined score: 0.3907)
  16. PO3_gamma       (combined score: 0.3836)
  17. FC1_gamma       (combined score: 0.3830)
  18. F3_gamma        (combined score: 0.3804)
  19. FC6_gamma       (combined score: 0.3740)
  20. FC2_

## 4. Save Filtered Datasets

In [9]:
# Create output directory
output_dir = Path('../datasets/DEAP/filtered_features')
output_dir.mkdir(exist_ok=True, parents=True)

# File-name tags follow the configured TOP_N_* values, so a file called
# "...top30..." always holds a top-30 selection and the names stay in step with
# the TOP_N_*-based names written into the generated README.
valence_tag = f'valence_top{TOP_N_VALENCE}'
arousal_tag = f'arousal_top{TOP_N_AROUSAL}'
combined_tag = f'combined_top{TOP_N_COMBINED}'

# NumPy arrays (.npy): filtered feature matrices first, then the label vectors
arrays_to_save = {
    f'X_{valence_tag}.npy': X_valence,
    f'X_{arousal_tag}.npy': X_arousal,
    f'X_{combined_tag}.npy': X_combined,
    'y_valence_binary.npy': y_valence,
    'y_arousal_binary.npy': y_arousal,
    'y_valence_continuous.npy': y_valence_cont,
    'y_arousal_continuous.npy': y_arousal_cont,
}

# Feature names as text files: one name per line, in column order
feature_lists_to_save = {
    f'features_{valence_tag}.txt': top_features_valence,
    f'features_{arousal_tag}.txt': top_features_arousal,
    f'features_{combined_tag}.txt': top_features_combined,
}

for file_name, array in arrays_to_save.items():
    np.save(output_dir / file_name, array)

for file_name, names in feature_lists_to_save.items():
    # Explicit encoding keeps the files identical across platforms
    with open(output_dir / file_name, 'w', encoding='utf-8') as f:
        f.write('\n'.join(names))

# Report exactly the names that were written
print("\n=== Files Saved ===")
print(f"Directory: {output_dir}")
print("\nData files:")
for file_name in arrays_to_save:
    print(f"  - {file_name}")
print("\nFeature lists:")
for file_name in feature_lists_to_save:
    print(f"  - {file_name}")


=== Files Saved ===
Directory: ../datasets/DEAP/filtered_features

Data files:
  - X_valence_top30.npy
  - X_arousal_top30.npy
  - X_combined_top40.npy
  - y_valence_binary.npy
  - y_arousal_binary.npy
  - y_valence_continuous.npy
  - y_arousal_continuous.npy

Feature lists:
  - features_valence_top30.txt
  - features_arousal_top30.txt
  - features_combined_top40.txt


## 5. Save as Pandas DataFrames (CSV)

In [10]:
# Create DataFrames with feature names as column headers
df_valence = pd.DataFrame(X_valence, columns=top_features_valence)
df_arousal = pd.DataFrame(X_arousal, columns=top_features_arousal)
df_combined = pd.DataFrame(X_combined, columns=top_features_combined)

# Every dataset carries the same four target columns, appended in this order
target_columns = {
    'valence_binary': y_valence,
    'valence_continuous': y_valence_cont,
    'arousal_binary': y_arousal,
    'arousal_continuous': y_arousal_cont,
}
for frame in (df_valence, df_arousal, df_combined):
    for column_name, values in target_columns.items():
        frame[column_name] = values

# CSV names follow the configured TOP_N_* values so they always describe the
# file's contents and match the names written into the generated README.
csv_outputs = {
    f'dataset_valence_top{TOP_N_VALENCE}.csv': df_valence,
    f'dataset_arousal_top{TOP_N_AROUSAL}.csv': df_arousal,
    f'dataset_combined_top{TOP_N_COMBINED}.csv': df_combined,
}

# Save as CSV
for csv_name, frame in csv_outputs.items():
    frame.to_csv(output_dir / csv_name, index=False)

print("\nCSV files saved:")
for csv_name in csv_outputs:
    print(f"  - {csv_name}")

print(f"\nDataFrame shapes:")
print(f"  - Valence: {df_valence.shape}")
print(f"  - Arousal: {df_arousal.shape}")
print(f"  - Combined: {df_combined.shape}")


CSV files saved:
  - dataset_valence_top30.csv
  - dataset_arousal_top30.csv
  - dataset_combined_top40.csv

DataFrame shapes:
  - Valence: (1280, 34)
  - Arousal: (1280, 34)
  - Combined: (1280, 44)


## 6. Create Metadata File

In [11]:
# Build the README text that documents the filtered datasets.
# This is an f-string: the generation timestamp and the TOP_N_* settings are
# interpolated when the cell runs. Everything else is literal text that is NOT
# derived from the data: the sample count (1280), the original feature count
# (96), the band frequency ranges, the binarization threshold (4.5) and the
# list of ranking methods. Update those by hand if the inputs or the upstream
# analysis change.
metadata = f"""# Filtered Feature Datasets - Metadata

Generated from Feature Importance Analysis
Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

## Dataset Information

Original dataset: DEAP (Database for Emotion Analysis using Physiological signals)
- Total samples: 1280 (32 subjects × 40 trials)
- Original features: 96 (32 channels × 3 frequency bands)
- Frequency bands: alpha (8-12 Hz), beta (12-30 Hz), gamma (30-64 Hz)

## Filtered Datasets

### 1. Valence-Optimized Dataset
- File: dataset_valence_top{TOP_N_VALENCE}.csv / X_valence_top{TOP_N_VALENCE}.npy
- Features: {TOP_N_VALENCE}
- Shape: (1280, {TOP_N_VALENCE})
- Selection: Top {TOP_N_VALENCE} features ranked by aggregated importance for valence prediction

### 2. Arousal-Optimized Dataset
- File: dataset_arousal_top{TOP_N_AROUSAL}.csv / X_arousal_top{TOP_N_AROUSAL}.npy
- Features: {TOP_N_AROUSAL}
- Shape: (1280, {TOP_N_AROUSAL})
- Selection: Top {TOP_N_AROUSAL} features ranked by aggregated importance for arousal prediction

### 3. Combined Dataset (Valence + Arousal)
- File: dataset_combined_top{TOP_N_COMBINED}.csv / X_combined_top{TOP_N_COMBINED}.npy
- Features: {TOP_N_COMBINED}
- Shape: (1280, {TOP_N_COMBINED})
- Selection: Top {TOP_N_COMBINED} features ranked by average importance for both valence and arousal

## Target Variables

All datasets include 4 target variables:
1. **valence_binary**: Binary valence labels (0/1, threshold 4.5)
2. **valence_continuous**: Continuous valence ratings (1-9 scale)
3. **arousal_binary**: Binary arousal labels (0/1, threshold 4.5)
4. **arousal_continuous**: Continuous arousal ratings (1-9 scale)

## Feature Importance Methodology

Features were ranked using 5 different methods:
1. Pearson/Spearman correlation
2. ANOVA F-test & Mutual Information
3. Random Forest feature importance
4. Permutation importance
5. Linear SVM coefficients

All scores were normalized and averaged to create final rankings.

## Usage Example

```python
import numpy as np
import pandas as pd

# Option 1: Load NumPy arrays
X = np.load('filtered_features/X_valence_top{TOP_N_VALENCE}.npy')
y = np.load('filtered_features/y_valence_binary.npy')

# Option 2: Load CSV with feature names
df = pd.read_csv('filtered_features/dataset_valence_top{TOP_N_VALENCE}.csv')
X = df.drop(['valence_binary', 'valence_continuous', 'arousal_binary', 'arousal_continuous'], axis=1)
y = df['valence_binary']
```

## Files

### NumPy Arrays (.npy)
- X_valence_top{TOP_N_VALENCE}.npy
- X_arousal_top{TOP_N_AROUSAL}.npy
- X_combined_top{TOP_N_COMBINED}.npy
- y_valence_binary.npy
- y_arousal_binary.npy
- y_valence_continuous.npy
- y_arousal_continuous.npy

### CSV Files
- dataset_valence_top{TOP_N_VALENCE}.csv
- dataset_arousal_top{TOP_N_AROUSAL}.csv
- dataset_combined_top{TOP_N_COMBINED}.csv

### Feature Lists (.txt)
- features_valence_top{TOP_N_VALENCE}.txt
- features_arousal_top{TOP_N_AROUSAL}.txt
- features_combined_top{TOP_N_COMBINED}.txt
"""

# Write the README as UTF-8 explicitly. The text contains the non-ASCII
# character '×'; without an encoding argument open() uses the platform's
# locale encoding, so the file would not be valid UTF-8 on systems whose
# default is something else.
with open(output_dir / 'README.md', 'w', encoding='utf-8') as f:
    f.write(metadata)

print("\nMetadata file created: README.md")


Metadata file created: README.md


## 7. Summary and Statistics

In [None]:
# Final report: dataset sizes, overlap between the two selections, class
# balance, and the number of files in the output directory.
n_original_features = X.shape[1]

# minlength=2 guarantees a slot for both classes, so indexing [0] and [1] is
# safe even if every label falls on one side of the threshold.
valence_counts = np.bincount(y_valence, minlength=2)
arousal_counts = np.bincount(y_arousal, minlength=2)

print("\n" + "="*80)
print(" "*25 + "FILTERED DATASET SUMMARY")
print("="*80)

# Shapes are read from the arrays themselves rather than restated by hand
print("\n### Dataset Dimensions ###")
print(f"Original: {X.shape}")
print(f"Valence-optimized: {X_valence.shape} - {(X_valence.shape[1]/n_original_features)*100:.1f}% of features")
print(f"Arousal-optimized: {X_arousal.shape} - {(X_arousal.shape[1]/n_original_features)*100:.1f}% of features")
print(f"Combined: {X_combined.shape} - {(X_combined.shape[1]/n_original_features)*100:.1f}% of features")

print("\n### Feature Overlap ###")
overlap = set(top_features_valence) & set(top_features_arousal)
print(f"Features in both valence and arousal top-{TOP_N_VALENCE}: {len(overlap)}")
print(f"Overlap percentage: {(len(overlap)/TOP_N_VALENCE)*100:.1f}%")

if len(overlap) > 0:
    print(f"\nOverlapping features:")
    for feat in sorted(overlap):
        print(f"  - {feat}")

print("\n### Target Distribution ###")
print(f"Valence: {valence_counts} (0: {valence_counts[0]}, 1: {valence_counts[1]})")
print(f"Arousal: {arousal_counts} (0: {arousal_counts[0]}, 1: {arousal_counts[1]})")

# Counts every entry currently in the directory, including files left over
# from earlier runs
print("\n### Files Created ###")
print(f"Location: {output_dir}")
print(f"Total files: {len(list(output_dir.glob('*')))}")

print("\n" + "="*80)
print("\n✓ Filtered datasets created successfully!")
print("\nYou can now use these datasets for training more efficient models.")



                         FILTERED DATASET SUMMARY

### Dataset Dimensions ###
Original: (1280, 96)
Valence-optimized: (1280, 30) - 31.2% of features
Arousal-optimized: (1280, 30) - 31.2% of features
Combined: (1280, 40) - 41.7% of features

### Feature Overlap ###
Features in both valence and arousal top-30: 10
Overlap percentage: 33.3%

Overlapping features:
  - AF4_beta
  - C4_alpha
  - CP1_alpha
  - FC1_gamma
  - FC2_gamma
  - FC6_alpha
  - Fp1_beta
  - Fp1_gamma
  - P4_gamma
  - T7_gamma

### Target Distribution ###
Valence: [472 808] (0: 472, 1: 808)
Arousal: [464 816] (0: 464, 1: 816)

### Files Created ###
Location: ../datasets/DEAP/filtered_features
Total files: 14


✓ Filtered datasets created successfully!

You can now use these datasets for training more efficient models.
The reduced feature sets should improve training speed and may reduce overfitting.
