In [1]:
import pyedflib
import json
import os
from collections import defaultdict

# Channel set used when the caller does not supply ``required_channels``.
_DEFAULT_REQUIRED_CHANNELS = (
    'C3-P3', 'C4-P4', 'CZ-PZ', 'F3-C3', 'F4-C4', 'F7-T7', 'F8-T8',
    'FP1-F3', 'FP1-F7', 'FP2-F4', 'FP2-F8', 'FZ-CZ', 'P3-O1', 'P4-O2',
    'P7-O1', 'P8-O2', 'T7-P7', 'T8-P8',
)


def _resolve_edf_files(file_source, dataset_root):
    """Return a sorted list of unique EDF path strings described by *file_source*."""
    if isinstance(file_source, (str, os.PathLike)):
        source_path = os.fspath(file_source)
        # Compare the extension case-insensitively so 'X.EDF' / 'X.JSON' work too.
        lowered = source_path.lower()

        if lowered.endswith('.json'):
            with open(source_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            segments = data['preictal_segments'] + data['interictal_segments']
            # Several segments usually come from the same recording; the set
            # collapses them to one path per file.
            paths = {
                f"{dataset_root}/{segment['patient_id']}/{segment['file']}"
                for segment in segments
            }
        elif lowered.endswith('.edf'):
            paths = {source_path}
        else:
            raise ValueError("String input must be .json or .edf file")
    else:
        try:
            # Accept any iterable of paths (list, tuple, set, generator) and
            # normalise path-like entries to plain strings so they sort and
            # serve as consistent result keys.
            paths = {os.fspath(path) for path in file_source}
        except TypeError as err:
            raise TypeError(
                "file_source must be a path or an iterable of paths, "
                f"got {type(file_source).__name__}"
            ) from err

    return sorted(paths)


def _read_signal_labels(edf_path):
    """Return the signal labels of one EDF file, always closing the reader."""
    reader = pyedflib.EdfReader(edf_path)
    try:
        return reader.getSignalLabels()
    finally:
        # Close even when reading the labels fails, so the handle is not leaked.
        reader.close()


def check_channels_in_files(file_source, required_channels=None,
                            dataset_root='physionet.org/files/chbmit/1.0.0'):
    """
    Check if EDF files contain required channels

    A per-file line and a final summary are printed to stdout.

    Args:
        file_source: Either:
            - Path to JSON segments file (must contain the keys
              'preictal_segments' and 'interictal_segments', each a list of
              entries with 'patient_id' and 'file')
            - Iterable (list, tuple, set, ...) of EDF file paths
            - Single EDF file path
            Paths may be strings or path-like objects; the '.json' / '.edf'
            extension is matched case-insensitively. Duplicate paths are
            checked only once.
        required_channels: List of required channel names. Defaults to a
            built-in list of 18 bipolar channel labels.
        dataset_root: Directory prefix joined (with '/') to
            '<patient_id>/<file>' when file_source is a JSON segments file.

    Returns:
        dict mapping each EDF path (str) to a dict with the keys:
            - 'status': 'OK', 'MISSING_CHANNELS', 'FILE_NOT_FOUND' or 'ERROR'
            - 'available_channels': labels read from the file ([] if unread)
            - 'missing_channels': required channels absent from the file
            - 'error': exception text (only when status is 'ERROR')

    Raises:
        ValueError: if a single path does not end in .json or .edf.
        TypeError: if file_source is neither a path nor an iterable of paths.
    """
    if required_channels is None:
        required_channels = _DEFAULT_REQUIRED_CHANNELS
    required_channels = list(required_channels)

    edf_files = _resolve_edf_files(file_source, dataset_root)

    results = {}
    missing_channels_summary = defaultdict(list)

    print(f"Checking {len(edf_files)} EDF files for {len(required_channels)} required channels...\n")

    for edf_path in edf_files:
        if not os.path.exists(edf_path):
            results[edf_path] = {
                'status': 'FILE_NOT_FOUND',
                'available_channels': [],
                'missing_channels': list(required_channels)
            }
            print(f"❌ {edf_path} - FILE NOT FOUND")
            continue

        try:
            available_channels = _read_signal_labels(edf_path)
        except Exception as e:
            # Deliberately broad: one unreadable file must not abort the whole
            # scan; the failure is recorded in the result instead.
            results[edf_path] = {
                'status': 'ERROR',
                'available_channels': [],
                'missing_channels': list(required_channels),
                'error': str(e)
            }
            print(f"❌ {edf_path} - ERROR: {e}")
            continue

        # Set lookup instead of scanning the label list once per channel.
        available_lookup = set(available_channels)
        missing_channels = [ch for ch in required_channels if ch not in available_lookup]

        results[edf_path] = {
            'status': 'OK' if not missing_channels else 'MISSING_CHANNELS',
            'available_channels': available_channels,
            'missing_channels': missing_channels
        }

        # Track missing channels for summary
        for ch in missing_channels:
            missing_channels_summary[ch].append(edf_path)

        if not missing_channels:
            print(f"✅ {edf_path} - ALL CHANNELS PRESENT")
        else:
            print(f"⚠️  {edf_path} - MISSING {len(missing_channels)} CHANNELS:")
            for ch in missing_channels:
                print(f"    - {ch}")

    # Print summary
    print(f"\n{'='*60}")
    print("SUMMARY:")
    print(f"{'='*60}")

    # Count every status separately so that files that were not found or could
    # not be read are not reported as "missing channels".
    status_counts = defaultdict(int)
    for result in results.values():
        status_counts[result['status']] += 1

    print(f"Total files checked: {len(results)}")
    print(f"Files with all channels: {status_counts['OK']}")
    print(f"Files with missing channels: {status_counts['MISSING_CHANNELS']}")
    if status_counts['FILE_NOT_FOUND']:
        print(f"Files not found: {status_counts['FILE_NOT_FOUND']}")
    if status_counts['ERROR']:
        print(f"Files with read errors: {status_counts['ERROR']}")

    if missing_channels_summary:
        print("\nMost commonly missing channels:")
        for channel, files in sorted(missing_channels_summary.items(),
                                     key=lambda x: len(x[1]), reverse=True):
            print(f"  {channel}: missing in {len(files)} files")

    return results

# Usage example: check every EDF file referenced by the JSON segments file
# and keep the per-file results dict for later inspection.
results = check_channels_in_files('all_patients_segments.json')

Checking 652 EDF files for 18 required channels...

✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_01.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_02.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_03.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_04.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_05.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_06.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_07.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_08.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_09.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_10.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_11.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/chbmit/1.0.0/chb01/chb01_12.edf - ALL CHANNELS PRESENT
✅ physionet.org/files/ch