In [1]:
import os
import pandas as pd

In [2]:
def check_folder_structure(root_dir, subfolder="EEG", expected_count=8):
    """
    Checks that every folder directly under the root directory contains a given
    subfolder holding exactly the expected number of entries.

    Args:
        root_dir: The path to the root directory whose immediate subfolders are checked.
            Entries of root_dir that are not directories are skipped.
        subfolder: Name of the subfolder each folder must contain. Defaults to "EEG";
            whether a differently-cased name such as "eeg" also matches depends on
            the case sensitivity of the underlying file system.
        expected_count: Number of entries the subfolder must contain. Defaults to 8.
            Every directory entry is counted (nested directories included), because
            the count is taken from os.listdir.

    Returns:
        A list of tuples, ordered by folder name:
            - (folder_path, True) if the subfolder exists and holds exactly
              expected_count entries
            - (folder_path, False) if the subfolder is missing, cannot be listed,
              or holds a different number of entries
    """

    results = []
    # Sort so that the order of the results does not depend on the file system.
    for folder in sorted(os.listdir(root_dir)):
        folder_path = os.path.join(root_dir, folder)
        if not os.path.isdir(folder_path):
            continue  # Only directories are subject to the check

        try:
            entries = os.listdir(os.path.join(folder_path, subfolder))
        except OSError:
            # Covers a missing subfolder, a regular file carrying the subfolder's
            # name, and a subfolder that cannot be read.
            results.append((folder_path, False))
            continue

        results.append((folder_path, len(entries) == expected_count))

    return results


In [3]:
# Run the folder-structure check on the dataset and report the outcome.
root_directory = "Dataset_clean_for_jupyter"  # Replace with the actual path
verification_results = check_folder_structure(root_directory)

# One line per folder with its verdict.
for folder_path, is_consistent in verification_results:
    status = 'Consistent' if is_consistent else 'Inconsistent'
    print(f"{folder_path}: {status}")

# Tally the verdicts: folders that passed versus the remainder.
consistent_count = sum(1 for _, passed in verification_results if passed)
inconsistent_count = len(verification_results) - consistent_count
print(f"\nConsistent folders: {consistent_count}")
print(f"Inconsistent folders: {inconsistent_count}")


Consistent folders: 0
Inconsistent folders: 128


----------

In [4]:
def compare_files_in_folders(root_dir):
    """
    Compares the PSG and headband event files of every subject folder in the root directory.

    For each directory named 'sub-<number>' directly under root_dir, the files
    'eeg/sub-<number>_task-Sleep_acq-psg_events.tsv' and
    'eeg/sub-<number>_task-Sleep_acq-headband_events.tsv' are read as
    tab-separated tables, and the columns 'onset', 'duration', 'begsample',
    'endsample' and 'offset' are compared with DataFrame.equals.

    Args:
        root_dir: The path to the root directory containing the subject folders.
            Entries that are not directories named 'sub-<number>' are ignored.

    Returns:
        A list of tuples, ordered by subject number:
            - (folder_path, True) if the specified columns are equal in both files.
            - (folder_path, False) if the specified columns are not equal.

    Raises:
        FileNotFoundError: If a subject folder lacks one of the two event files.
        KeyError: If an event file lacks one of the compared columns.
    """

    cols_to_compare = ['onset', 'duration', 'begsample', 'endsample', 'offset']
    prefix = 'sub-'

    # Work from the subject folders that actually exist instead of assuming the
    # N entries of root_dir are exactly sub-1 .. sub-N: a stray file, an extra
    # folder or a gap in the numbering would otherwise point at missing paths.
    subject_folders = [
        name for name in os.listdir(root_dir)
        if name.startswith(prefix)
        and name[len(prefix):].isdecimal()
        and os.path.isdir(os.path.join(root_dir, name))
    ]
    # Numeric sort keeps sub-2 ahead of sub-10 (a plain string sort would not).
    subject_folders.sort(key=lambda name: int(name[len(prefix):]))

    results = []
    for folder in subject_folders:
        folder_path = os.path.join(root_dir, folder)
        eeg_folder_path = os.path.join(folder_path, "eeg")

        psg_file_path = os.path.join(eeg_folder_path, f'{folder}_task-Sleep_acq-psg_events.tsv')
        hb_file_path = os.path.join(eeg_folder_path, f'{folder}_task-Sleep_acq-headband_events.tsv')

        df_psg = pd.read_csv(psg_file_path, sep='\t')
        df_hb = pd.read_csv(hb_file_path, sep='\t')

        # DataFrame.equals also requires matching shapes and dtypes, not only values.
        comparison_result = df_psg[cols_to_compare].equals(df_hb[cols_to_compare])
        results.append((folder_path, comparison_result))

    return results


In [5]:
# Compare the PSG and headband event files of every subject and report the outcome.
root_directory = "Dataset_clean_for_jupyter"
verification_results = compare_files_in_folders(root_directory)

# One line per subject folder with its verdict.
for folder_path, is_equal in verification_results:
    verdict = 'Equal' if is_equal else 'Not Equal'
    print(f"{folder_path}: {verdict}")

# Tally the verdicts: folders whose files matched versus the remainder.
equal_count = sum(1 for _, matched in verification_results if matched)
not_equal_count = len(verification_results) - equal_count
print(f"\nEqual folders: {equal_count}")
print(f"Not Equal folders: {not_equal_count}")

Dataset_clean_for_jupyter\sub-1: Equal
Dataset_clean_for_jupyter\sub-2: Equal
Dataset_clean_for_jupyter\sub-3: Equal
Dataset_clean_for_jupyter\sub-4: Equal
Dataset_clean_for_jupyter\sub-5: Equal
Dataset_clean_for_jupyter\sub-6: Equal
Dataset_clean_for_jupyter\sub-7: Equal
Dataset_clean_for_jupyter\sub-8: Equal
Dataset_clean_for_jupyter\sub-9: Equal
Dataset_clean_for_jupyter\sub-10: Equal
Dataset_clean_for_jupyter\sub-11: Equal
Dataset_clean_for_jupyter\sub-12: Equal
Dataset_clean_for_jupyter\sub-13: Equal
Dataset_clean_for_jupyter\sub-14: Equal
Dataset_clean_for_jupyter\sub-15: Equal
Dataset_clean_for_jupyter\sub-16: Equal
Dataset_clean_for_jupyter\sub-17: Equal
Dataset_clean_for_jupyter\sub-18: Equal
Dataset_clean_for_jupyter\sub-19: Equal
Dataset_clean_for_jupyter\sub-20: Equal
Dataset_clean_for_jupyter\sub-21: Equal
Dataset_clean_for_jupyter\sub-22: Equal
Dataset_clean_for_jupyter\sub-23: Equal
Dataset_clean_for_jupyter\sub-24: Equal
Dataset_clean_for_jupyter\sub-25: Equal
Dataset_c

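So yes: in every subject folder, the PSG and headband event files hold identical values in the compared columns (`onset`, `duration`, `begsample`, `endsample`, `offset`).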

----------

In [6]:
# For each night, check whether any event carries a non-zero offset.
# Absolute values are accumulated so that positive and negative offsets cannot
# cancel each other out and hide a non-zero entry behind a total of 0.
offset_value = 0
for sub_no in range(1, 129):
    # os.path.join keeps the paths valid on every operating system.
    eeg_dir = os.path.join('Dataset_clean_for_jupyter', f'sub-{sub_no}', 'eeg')
    event_1_psg = pd.read_csv(os.path.join(eeg_dir, f'sub-{sub_no}_task-Sleep_acq-psg_events.tsv'), sep='\t')
    event_1_hb = pd.read_csv(os.path.join(eeg_dir, f'sub-{sub_no}_task-Sleep_acq-headband_events.tsv'), sep='\t')
    offset_value += event_1_hb['offset'].abs().sum() + event_1_psg['offset'].abs().sum()
offset_value

np.int64(0)

Good news! The offsets sum to zero across all event files, which suggests that no non-zero offset is present in the dataset, so we can ignore this column.

In [8]:
# Compact variant of the check done by "compare_files_in_folders": instead of
# collecting one verdict per folder, count the subjects whose PSG and headband
# event files agree on the compared columns.
cols_to_compare = ['onset', 'duration', 'begsample', 'endsample', 'offset']
comparison_result = 0
for sub_no in range(1, 129):
    # os.path.join keeps the paths valid on every operating system.
    eeg_dir = os.path.join('Dataset_clean_for_jupyter', f'sub-{sub_no}', 'eeg')
    event_1_psg = pd.read_csv(os.path.join(eeg_dir, f'sub-{sub_no}_task-Sleep_acq-psg_events.tsv'), sep='\t')
    event_1_hb = pd.read_csv(os.path.join(eeg_dir, f'sub-{sub_no}_task-Sleep_acq-headband_events.tsv'), sep='\t')

    # equals() yields a boolean; adding it counts the matching subjects.
    comparison_result += event_1_psg[cols_to_compare].equals(event_1_hb[cols_to_compare])
comparison_result

128