# In [None]:
"""
EEG Person Identification - Data Download
PhysioNet Motor Movement/Imagery Dataset
Author: [Your Name]
Date: 2025

This notebook helps you download and organize the PhysioNet dataset.
Dataset: EEG Motor Movement/Imagery Database
URL: https://physionet.org/content/eegmmidb/1.0.0/
"""

#%% Import Libraries
import os
import urllib.request
from tqdm import tqdm
import zipfile
import shutil

# Confirmation message shown once the import statements above have executed.
print("Libraries imported successfully!")

#%% Configuration

class Config:
    """Dataset constants and local paths used by the download helpers.

    The three local directories are created as a side effect of defining
    this class, so they exist before any download function runs.
    """
    # Where the complete dataset archive is published
    DATASET_URL = "https://physionet.org/static/published-projects/eegmmidb/eeg-motor-movementimagery-dataset-1.0.0.zip"

    # Local directory layout
    DATA_DIR = './data/'
    RAW_DATA_DIR = './data/raw/'
    DOWNLOAD_DIR = './data/downloads/'

    # Dataset dimensions
    N_SUBJECTS = 109
    N_RUNS_PER_SUBJECT = 14

    # Make sure every local directory exists (runs once, at class definition)
    for _directory in (DATA_DIR, RAW_DATA_DIR, DOWNLOAD_DIR):
        os.makedirs(_directory, exist_ok=True)
    del _directory  # keep the loop variable out of the class namespace

# Shared configuration instance; the main section below passes it to every helper.
config = Config()
print("\nConfiguration loaded!")
print(f"Dataset will be downloaded to: {config.RAW_DATA_DIR}")

#%% Download Progress Bar

class DownloadProgressBar(tqdm):
    """tqdm progress bar that can be driven by urllib's ``reporthook`` callback"""

    def update_to(self, b=1, bsize=1, tsize=None):
        """
        Move the bar to the position reported by the download hook

        Parameters:
        -----------
        b : int
            Count of blocks transferred so far
        bsize : int
            Size of each block
        tsize : int or None
            Total size of the download; when given it becomes the bar's total
        """
        if tsize is not None:
            self.total = tsize
        transferred = b * bsize
        # update() is given the distance between the reported position and
        # the bar's current count (self.n)
        self.update(transferred - self.n)

def download_url(url, output_path):
    """
    Fetch a URL to a local file while showing a progress bar

    Parameters:
    -----------
    url : str
        Address of the file to fetch
    output_path : str
        Where the fetched file is written
    """
    # Label the bar with the last path component of the URL
    bar_label = url.split('/')[-1]
    with DownloadProgressBar(unit='B', unit_scale=True, miniters=1, desc=bar_label) as progress:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=progress.update_to)

#%% Method 1: Automatic Download

def _copy_edf_files(source_root, destination_dir):
    """Copy every .edf file found anywhere under source_root into destination_dir; return the count."""
    copied = 0
    for current_dir, _subdirs, filenames in os.walk(source_root):
        for filename in filenames:
            if filename.endswith('.edf'):
                shutil.copy2(os.path.join(current_dir, filename),
                             os.path.join(destination_dir, filename))
                copied += 1
    return copied


def download_dataset_automatic(config):
    """
    Automatically download the complete dataset (recommended)

    Downloads the dataset archive (~1.5GB) from ``config.DATASET_URL``,
    extracts it into a dedicated temporary folder, copies every ``.edf``
    file into ``config.RAW_DATA_DIR`` and always removes the archive and
    the extracted tree afterwards, even when a step fails.

    Parameters:
    -----------
    config : Config
        Provides DATASET_URL, DOWNLOAD_DIR, RAW_DATA_DIR, N_SUBJECTS and
        N_RUNS_PER_SUBJECT

    Returns:
    --------
    bool
        True if at least one EDF file was copied into RAW_DATA_DIR;
        False if the user declined, an error occurred, or the archive
        contained no EDF files
    """
    print("\n" + "="*70)
    print("METHOD 1: AUTOMATIC DOWNLOAD (RECOMMENDED)")
    print("="*70)

    zip_path = os.path.join(config.DOWNLOAD_DIR, 'eegmmidb.zip')
    # Extract into a dedicated folder so the archive's top-level folder name
    # does not matter and cleanup removes everything that was unpacked.
    extract_dir = os.path.join(config.DOWNLOAD_DIR, 'eegmmidb_extracted')

    print("\n‚ö†Ô∏è  Note: This will download approximately 1.5 GB of data")
    print("Make sure you have sufficient disk space and internet connection.")

    response = input("\nDo you want to proceed? (yes/no): ")

    if response.strip().lower() not in ('yes', 'y'):
        print("Download cancelled.")
        return False

    print("\nDownloading dataset...")
    print(f"URL: {config.DATASET_URL}")
    print(f"Destination: {zip_path}")

    try:
        os.makedirs(config.DOWNLOAD_DIR, exist_ok=True)
        os.makedirs(config.RAW_DATA_DIR, exist_ok=True)

        download_url(config.DATASET_URL, zip_path)
        print("\n‚úì Download complete!")

        # Extract zip file
        print("\nExtracting files...")
        # Drop leftovers of an earlier interrupted run before unpacking
        shutil.rmtree(extract_dir, ignore_errors=True)
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(extract_dir)
        print("‚úì Extraction complete!")

        # Move files to raw data directory. The published archive keeps the
        # recordings in per-subject folders (e.g. S001/S001R01.edf), so the
        # search has to be recursive rather than a flat directory listing.
        print("\nOrganizing files...")
        copied = _copy_edf_files(extract_dir, config.RAW_DATA_DIR)
        if copied == 0:
            print("Warning: no .edf files found in extracted archive")
            return False
        print(f"‚úì Files organized in: {config.RAW_DATA_DIR}")

        # Verify download
        edf_files = [f for f in os.listdir(config.RAW_DATA_DIR) if f.endswith('.edf')]
        print(f"\n‚úì Successfully downloaded {len(edf_files)} EDF files")

        expected_files = config.N_SUBJECTS * config.N_RUNS_PER_SUBJECT
        if len(edf_files) == expected_files:
            print(f"‚úì All {expected_files} files downloaded successfully!")
        else:
            print(f"‚ö†Ô∏è  Warning: Expected {expected_files} files, found {len(edf_files)}")

        return True

    except Exception as e:
        # Top-level boundary for the interactive script: report and signal failure
        print(f"\n‚ùå Error during download: {str(e)}")
        return False

    finally:
        # Clean up on success and on failure so a partial archive or a
        # half-extracted tree never lingers in the download directory.
        print("\nCleaning up...")
        if os.path.exists(zip_path):
            os.remove(zip_path)
        shutil.rmtree(extract_dir, ignore_errors=True)
        print("‚úì Cleanup complete!")

#%% Method 2: Manual Download Instructions

def show_manual_download_instructions(config):
    """
    Print step-by-step instructions for fetching the dataset by hand

    Parameters:
    -----------
    config : Config
        Supplies RAW_DATA_DIR (the target folder) and the subject/run
        counts used to state the expected number of files
    """
    separator = "="*70
    target_dir = os.path.abspath(config.RAW_DATA_DIR)
    expected_count = config.N_SUBJECTS * config.N_RUNS_PER_SUBJECT

    # One entry per output line; a leading "\n" yields a blank line before it
    lines = [
        "\n" + separator,
        "METHOD 2: MANUAL DOWNLOAD",
        separator,
        "\nIf automatic download fails, follow these steps:",
        "\n1. Visit the PhysioNet website:",
        "   https://physionet.org/content/eegmmidb/1.0.0/",
        "\n2. Download the dataset:",
        "   - Click 'Download the ZIP file'",
        "   - Or use this direct link:",
        "   https://physionet.org/static/published-projects/eegmmidb/eeg-motor-movementimagery-dataset-1.0.0.zip",
        "\n3. Extract the downloaded ZIP file",
        "\n4. Copy all .edf files to this directory:",
        f"   {target_dir}",
        "\n5. Verify you have all files:",
        f"   - Expected: {expected_count} .edf files",
        "   - Files should be named like: S001R01.edf, S001R02.edf, ...",
        "\n6. Once files are in place, run the verification function below",
    ]
    print("\n".join(lines))

#%% Method 3: Download Subset (for testing)

def download_subset_for_testing(config, n_subjects=5,
                                base_url="https://physionet.org/files/eegmmidb/1.0.0/"):
    """
    Download a small subset of data for testing

    Each recording is fetched from ``<base_url>S###/S###R##.edf`` (the
    dataset keeps one folder per subject) into a temporary ``.part`` file
    that is renamed only after the transfer finished, so an interrupted
    download never leaves a truncated ``.edf`` in the raw data directory.

    Parameters:
    -----------
    config : Config
        Provides RAW_DATA_DIR and N_RUNS_PER_SUBJECT
    n_subjects : int
        Number of subjects to download (for testing)
    base_url : str
        Root URL under which the per-subject folders are published

    Returns:
    --------
    bool
        True if at least one file was downloaded; False if the user
        declined, nothing could be downloaded, or an unexpected error
        occurred
    """
    print("\n" + "="*70)
    print(f"METHOD 3: DOWNLOAD SUBSET ({n_subjects} subjects for testing)")
    print("="*70)

    print("\n‚ö†Ô∏è  This downloads a small subset for testing purposes only")
    print(f"It will download data for {n_subjects} subjects (not the full dataset)")

    response = input("\nDo you want to proceed? (yes/no): ")

    if response.strip().lower() not in ('yes', 'y'):
        print("Download cancelled.")
        return False

    if not base_url.endswith('/'):
        base_url += '/'

    try:
        os.makedirs(config.RAW_DATA_DIR, exist_ok=True)

        downloaded = 0
        for subject_id in range(1, n_subjects + 1):
            subject_tag = f"S{subject_id:03d}"
            print(f"\nDownloading Subject {subject_id:03d}...")

            for run in range(1, config.N_RUNS_PER_SUBJECT + 1):
                filename = f"{subject_tag}R{run:02d}.edf"
                # Recordings live inside a folder named after the subject
                url = f"{base_url}{subject_tag}/{filename}"
                output_path = os.path.join(config.RAW_DATA_DIR, filename)
                partial_path = output_path + '.part'

                try:
                    urllib.request.urlretrieve(url, partial_path)
                    os.replace(partial_path, output_path)
                except Exception as e:
                    # Best effort: report the failure and continue with the next file
                    print(f"  ‚úó Failed to download {filename}: {str(e)}")
                else:
                    downloaded += 1
                    print(f"  ‚úì {filename}")
                finally:
                    # After a successful rename this path no longer exists
                    if os.path.exists(partial_path):
                        os.remove(partial_path)

        print(f"\n‚úì Downloaded {downloaded} files for {n_subjects} subjects")
        # Report success only when something actually arrived
        return downloaded > 0

    except Exception as e:
        print(f"\n‚ùå Error during subset download: {str(e)}")
        return False

#%% Verify Downloaded Data

def verify_downloaded_data(config):
    """
    Verify that all necessary files are present

    Counts the ``.edf`` files in ``config.RAW_DATA_DIR`` and checks each
    name against the ``S###R##.edf`` pattern, with subject and run numbers
    inside the configured ranges. Completeness is judged on the validly
    named files only, so unrelated ``.edf`` files cannot make the dataset
    look complete.

    Parameters:
    -----------
    config : Config
        Provides RAW_DATA_DIR, N_SUBJECTS and N_RUNS_PER_SUBJECT

    Returns:
    --------
    bool
        True when every expected file is present, or when at least 5
        subjects are present (enough for testing); False otherwise,
        including when RAW_DATA_DIR is missing or is not a directory
    """
    print("\n" + "="*70)
    print("VERIFYING DOWNLOADED DATA")
    print("="*70)

    # isdir (not exists): a regular file at this path would make listdir raise
    if not os.path.isdir(config.RAW_DATA_DIR):
        print(f"\n‚ùå Raw data directory not found: {config.RAW_DATA_DIR}")
        return False

    # Count EDF files
    edf_files = [f for f in os.listdir(config.RAW_DATA_DIR) if f.endswith('.edf')]

    print(f"\nüìä Found {len(edf_files)} EDF files in {config.RAW_DATA_DIR}")

    # Check file naming pattern
    valid_files = 0
    subjects_found = set()

    for filename in edf_files:
        # Expected format: S001R01.edf
        if len(filename) != 11 or filename[0] != 'S' or filename[4] != 'R':
            continue
        try:
            subject_id = int(filename[1:4])
            run_id = int(filename[5:7])
        except ValueError:
            # Non-numeric subject or run field: not a dataset file
            continue
        if 1 <= subject_id <= config.N_SUBJECTS and 1 <= run_id <= config.N_RUNS_PER_SUBJECT:
            valid_files += 1
            subjects_found.add(subject_id)

    print(f"‚úì Valid files: {valid_files}")
    print(f"‚úì Unique subjects found: {len(subjects_found)}")

    # Check completeness against the validly named files only
    expected_total = config.N_SUBJECTS * config.N_RUNS_PER_SUBJECT

    if valid_files == expected_total:
        print(f"\n‚úÖ Complete dataset! All {expected_total} files present.")
        return True
    elif len(subjects_found) >= 5:
        print(f"\n‚úÖ Sufficient data for testing ({len(subjects_found)} subjects)")
        print(f"   For full project, download all {config.N_SUBJECTS} subjects")
        return True
    else:
        print(f"\n‚ö†Ô∏è  Incomplete dataset: {valid_files}/{expected_total} files")
        print("   You may need to download more data")
        return False

#%% Display Dataset Information

def display_dataset_info(config):
    """
    Print an overview of the PhysioNet EEG Motor Movement/Imagery dataset

    Parameters:
    -----------
    config : Config
        Supplies N_SUBJECTS and N_RUNS_PER_SUBJECT, used to compute the
        total file count shown in the "Dataset Size" section
    """
    separator = "="*70
    total_files = config.N_SUBJECTS * config.N_RUNS_PER_SUBJECT

    # One entry per output line; a leading "\n" yields a blank line before it
    lines = [
        "\n" + separator,
        "PHYSIONET EEG MOTOR MOVEMENT/IMAGERY DATASET",
        separator,

        "\nDataset Overview:",
        "  - 109 subjects (healthy volunteers)",
        "  - 64-channel EEG (10-10 system)",
        "  - 160 Hz sampling rate",
        "  - 14 experimental runs per subject",
        "  - Each run: 1-2 minutes duration",

        "\nExperimental Tasks:",
        "  Baseline runs (1-2):",
        "    - Run 1: Eyes open",
        "    - Run 2: Eyes closed",
        "\n  Motor execution runs (3, 7, 11):",
        "    - Open and close left or right fist",
        "\n  Motor imagery runs (4, 8, 12):",
        "    - Imagine opening and closing left or right fist",
        "\n  Motor execution runs (5, 9, 13):",
        "    - Open and close both fists or both feet",
        "\n  Motor imagery runs (6, 10, 14):",
        "    - Imagine opening and closing both fists or both feet",

        "\nFor this project, we use:",
        "  - Runs 4, 6, 8, 10, 12, 14 (motor imagery tasks)",
        "  - These runs provide the most distinctive EEG patterns",

        "\nFile Format:",
        "  - European Data Format (EDF)",
        "  - Naming: S###R##.edf (e.g., S001R04.edf)",
        "    - S### = Subject ID (001-109)",
        "    - R## = Run number (01-14)",

        "\nDataset Size:",
        f"  - Total files: {total_files}",
        "  - Approximate size: 1.5 GB",

        "\nReferences:",
        "  - Schalk, G., McFarland, D.J., Hinterberger, T., Birbaumer, N.,",
        "    Wolpaw, J.R. BCI2000: A General-Purpose Brain-Computer Interface",
        "    (BCI) System. IEEE TBME 51(6):1034-1043, 2004",

        "\n" + separator,
    ]
    print("\n".join(lines))

#%% Main Execution

def main():
    """
    Interactive entry point: show dataset info, then reuse, download or verify data

    If EDF files already exist in ``config.RAW_DATA_DIR`` the user may keep
    them and stop here. Otherwise a menu offers the download methods,
    manual instructions, or verification of existing data. The
    "DOWNLOAD COMPLETE!" banner is printed only after an actual download,
    and the "Next step" hint only when the data passed verification.
    """
    print("\n" + "="*70)
    print("EEG PERSON IDENTIFICATION - DATA DOWNLOAD")
    print("="*70)

    # Display dataset information
    display_dataset_info(config)

    # Check if data already exists
    if os.path.isdir(config.RAW_DATA_DIR):
        edf_files = [f for f in os.listdir(config.RAW_DATA_DIR) if f.endswith('.edf')]
        if edf_files:
            print(f"\n‚úì Found {len(edf_files)} EDF files in {config.RAW_DATA_DIR}")
            print("Data already downloaded!")

            verify_downloaded_data(config)

            response = input("\nDo you want to download again? (yes/no): ")
            if response.strip().lower() not in ('yes', 'y'):
                print("\nSkipping download. Using existing data.")
                print("\n" + "="*70)
                print("Next step: Run 02_preprocessing.ipynb")
                print("="*70)
                # Plain return instead of the interactive-only exit() helper,
                # which would shut down a notebook kernel
                return

    # Download options
    print("\n" + "="*70)
    print("DOWNLOAD OPTIONS")
    print("="*70)
    print("\n1. Automatic download (full dataset, ~1.5GB) - RECOMMENDED")
    print("2. Manual download instructions")
    print("3. Download subset for testing (5 subjects)")
    print("4. Verify existing data")
    print("5. Exit")

    choice = input("\nEnter your choice (1-5): ").strip()

    downloaded = False  # True only when a download method fetched data
    data_ready = False  # True only when verification succeeded

    if choice == '1':
        downloaded = download_dataset_automatic(config)
    elif choice == '2':
        show_manual_download_instructions(config)
    elif choice == '3':
        downloaded = download_subset_for_testing(config, n_subjects=5)
    elif choice == '4':
        # Verification only: no download banner and no second verification pass
        data_ready = verify_downloaded_data(config)
    else:
        print("\nExiting...")

    # Final verification
    if downloaded:
        print("\n" + "="*70)
        print("DOWNLOAD COMPLETE!")
        print("="*70)
        data_ready = verify_downloaded_data(config)

    if data_ready:
        print("\n" + "="*70)
        print("Next step: Run 02_preprocessing.ipynb")
        print("="*70)
    else:
        print("\n" + "="*70)
        print("Please complete the download before proceeding to preprocessing.")
        print("="*70)


if __name__ == "__main__":
    main()

#%% Download Summary

"""
DATA DOWNLOAD SUMMARY
=====================

This notebook provides three methods to obtain the PhysioNet dataset:

1. Automatic Download (Recommended):
   - Downloads the complete dataset automatically
   - Extracts and organizes files
   - Size: ~1.5 GB
   - Easiest method

2. Manual Download:
   - Visit PhysioNet website
   - Download ZIP file
   - Extract to specified directory
   - Use this if automatic download fails

3. Subset Download (Testing):
   - Downloads data for 5 subjects only
   - Faster for testing the pipeline
   - Not suitable for final project submission

After download, the verify function checks:
- File count and naming
- Number of subjects
- Data completeness

Next Step: 02_preprocessing.ipynb
"""