In [12]:
import scipy.io
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import pathlib as pl

In [13]:
def read_heafile(file_name):
    # Open the .hea file
    with open(file_name, 'r') as file:
        # Read the content of the .hea file
        hea_content = file.readlines()
    return hea_content

In [14]:
def create_array(hea_content):
    ID = hea_content[0].strip().split()[0]
    
    # Extract 'Age' from .hea file content
    age_info = hea_content[13].strip().split()
    age = int(age_info[2]) if len(age_info) > 2 and age_info[2].isdigit() else 0
    
    # Extract 'Gender' from .hea file content
    gender = hea_content[14].strip().split()[2] if len(hea_content) > 14 else 'Unknown'
    
    # Extract 'Abnormality' from .hea file content
    abnormality = hea_content[15].strip().split()[2] if len(hea_content) > 15 else 'Unknown'
    
    return [ID, age, gender, abnormality]

In [17]:
import pandas as pd
import pathlib as pl

def create_dataframes(training_directory):
    dataframes = {}

    subdirectories = [subdir for subdir in pl.Path(training_directory).iterdir() if subdir.is_dir()]
    
    for source_folder_path in subdirectories:
        source_folder_name = source_folder_path.name
        columns = ['ID', 'Age', 'Gender', 'Abnormality']
        source_dataframe = pd.DataFrame(columns=columns)
        patient_data = {}  # To collect patient information
        
        for subdir in source_folder_path.iterdir():
            if subdir.is_dir():
                data_dir = pl.Path(subdir)
                header_files = list(data_dir.glob('*.hea'))

                for header_file in header_files:
                    header_path = data_dir.joinpath(header_file.name)
                    hea_content = read_heafile(header_path)
                    patient_info = create_array(hea_content)
                    patient_id = patient_info[0]
                    
                    # Collect patient information
                    for i, column_name in enumerate(['Age', 'Gender', 'Abnormality']):
                        patient_data.setdefault(patient_id, {})[column_name] = patient_info[i + 1]
                        
        # Create a list of patient data dictionaries
        patient_rows = []
        for patient_id, info in patient_data.items():
            row = {'ID': patient_id, 'Age': info.get('Age'), 'Gender': info.get('Gender'), 'Abnormality': info.get('Abnormality')}
            patient_rows.append(row)
            
        # Concatenate patient data into the dataframe
        source_dataframe = pd.concat([source_dataframe, pd.DataFrame(patient_rows)])
        
        dataframes[f'{source_folder_name}_df'] = source_dataframe
        
    return dataframes

# Call the function with your specific directory
directory_path = r'C:\Users\ASUS\Desktop\CardioData\CardioData\training'
result_dataframes = create_dataframes(directory_path)
print(result_dataframes)

{'cpsc_2018_df':          ID Age  Gender Abnormality
0     A0001  74    Male    59118001
1     A0002  49  Female   426783006
2     A0003  81  Female   164889003
3     A0004  45    Male   164889003
4     A0005  53    Male   164884008
...     ...  ..     ...         ...
6872  A6873  80    Male   270492004
6873  A6874  62  Female   429622005
6874  A6875  78    Male   164909002
6875  A6876   0  Female   164889003
6876  A6877  71  Female   164884008

[6877 rows x 4 columns], 'cpsc_2018_extra_df':          ID Age  Gender          Abnormality
0     Q0001  53    Male  164867002,427084000
1     Q0002  70  Female            164861001
2     Q0003  55    Male  164867002,428750005
3     Q0004  57    Male  164861001,428750005
4     Q0005  51  Female            428750005
...     ...  ..     ...                  ...
3448  Q3577  61    Male   54329005,428750005
3449  Q3578  73    Male            164867002
3450  Q3579  41  Female            164865005
3451  Q3580  73  Female            164867002
3452  Q3