In [2]:
import os
import pathlib as pl

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io
import seaborn as sns

In [3]:
def read_heafile(file_name):
    """Return the lines of a .hea header file.

    Parameters
    ----------
    file_name : str or path-like
        Location of the header file; opened in text mode for reading.

    Returns
    -------
    list of str
        One entry per line, in file order, line endings included.
    """
    with open(file_name, 'r') as header:
        # Iterating a text file yields exactly the lines readlines() returns
        return list(header)

In [4]:
def create_array(hea_content):
    """Extract ID, age, gender and abnormality from the lines of a .hea file.

    The record ID is the first whitespace-separated token of the first line.
    Age, gender and abnormality are read as the third token of lines 13, 14
    and 15 (0-based) respectively.
    NOTE(review): this presumably matches a 12-lead header followed by
    '# Age: ..', '# Sex: ..', '# Dx: ..' comment lines - confirm against the
    dataset, since the positions are fixed rather than looked up by label.

    Parameters
    ----------
    hea_content : list of str
        Header lines, e.g. as returned by ``read_heafile``.

    Returns
    -------
    list
        ``[ID, age, gender, abnormality]``. ``age`` is an int, or 0 when the
        value is missing or not made of digits. ``gender`` and
        ``abnormality`` are strings, or ``'Unknown'`` when the line or its
        value token is missing.

    Raises
    ------
    IndexError
        If ``hea_content`` is empty or its first line is blank.
    """
    def third_token(line_index):
        # Value token of the given line, or None if the line or token is absent
        if len(hea_content) <= line_index:
            return None
        tokens = hea_content[line_index].strip().split()
        return tokens[2] if len(tokens) > 2 else None

    ID = hea_content[0].strip().split()[0]

    # Age falls back to 0 (a placeholder) for missing or non-numeric values
    age_token = third_token(13)
    age = int(age_token) if age_token is not None and age_token.isdigit() else 0

    gender_token = third_token(14)
    gender = gender_token if gender_token is not None else 'Unknown'

    abnormality_token = third_token(15)
    abnormality = abnormality_token if abnormality_token is not None else 'Unknown'

    return [ID, age, gender, abnormality]

In [5]:
def create_dataframes(training_directory):
    """Build one patient-metadata DataFrame per data-source folder.

    The directory is walked two levels deep:
    ``training_directory/<source>/<group>/*.hea``. Every header file is read
    with ``read_heafile`` and parsed with ``create_array``.

    Parameters
    ----------
    training_directory : str or path-like
        Folder whose immediate sub-folders are the data sources.

    Returns
    -------
    dict of str -> pandas.DataFrame
        Keyed by ``'<source folder name>_df'``. Each frame has the columns
        ``ID``, ``Age``, ``Gender`` and ``Abnormality`` with one row per
        distinct patient ID. If an ID occurs more than once, the values from
        the last header read are kept.
    """
    columns = ['ID', 'Age', 'Gender', 'Abnormality']
    dataframes = {}

    # sorted() keeps folder and file order - and therefore row order -
    # independent of the order in which the OS lists directory entries.
    source_folders = sorted(
        entry for entry in pl.Path(training_directory).iterdir() if entry.is_dir()
    )

    for source_folder_path in source_folders:
        # Keyed by patient ID: re-assigning an existing key overwrites its
        # values while keeping the position of the first occurrence.
        patient_rows = {}

        for group_dir in sorted(source_folder_path.iterdir()):
            if not group_dir.is_dir():
                continue

            # glob() already yields full paths, so no re-joining is needed
            for header_path in sorted(group_dir.glob('*.hea')):
                patient_info = create_array(read_heafile(header_path))
                patient_rows[patient_info[0]] = dict(zip(columns, patient_info))

        # Build the frame in one step instead of concatenating onto an empty
        # template frame. dtype=object matches the dtype of that empty
        # template and keeps the result independent of pandas' concat dtype
        # inference; numeric conversion of 'Age' is left to the caller.
        dataframes[f'{source_folder_path.name}_df'] = pd.DataFrame(
            list(patient_rows.values()), columns=columns, dtype=object
        )

    return dataframes

# Root folder that holds one sub-folder per data source. Set the
# CARDIO_TRAINING_DIR environment variable to point at a different copy of
# the data; the original local path is kept as the default.
directory_path = os.environ.get(
    'CARDIO_TRAINING_DIR',
    r'C:\Users\ASUS\Desktop\CardioData\CardioData\training',
)
result_dataframes = create_dataframes(directory_path)
print(result_dataframes)


{'cpsc_2018_df':          ID Age  Gender Abnormality
0     A0001  74    Male    59118001
1     A0002  49  Female   426783006
2     A0003  81  Female   164889003
3     A0004  45    Male   164889003
4     A0005  53    Male   164884008
...     ...  ..     ...         ...
6872  A6873  80    Male   270492004
6873  A6874  62  Female   429622005
6874  A6875  78    Male   164909002
6875  A6876   0  Female   164889003
6876  A6877  71  Female   164884008

[6877 rows x 4 columns], 'cpsc_2018_extra_df':          ID Age  Gender          Abnormality
0     Q0001  53    Male  164867002,427084000
1     Q0002  70  Female            164861001
2     Q0003  55    Male  164867002,428750005
3     Q0004  57    Male  164861001,428750005
4     Q0005  51  Female            428750005
...     ...  ..     ...                  ...
3448  Q3577  61    Male   54329005,428750005
3449  Q3578  73    Male            164867002
3450  Q3579  41  Female            164865005
3451  Q3580  73  Female            164867002
3452  Q3

In [6]:
# Convenience names for the per-source frames. They refer to the same objects
# that are stored in result_dataframes, so the in-place edit below is also
# visible through the dictionary.
cpsc_2018_df = result_dataframes['cpsc_2018_df']
cpsc_2018_extra_df = result_dataframes['cpsc_2018_extra_df']
georgia_df = result_dataframes['georgia_df']
ptb_df = result_dataframes['ptb_df']
ptb_xl_df = result_dataframes['ptb-xl_df']
st_petersburg_incart_df = result_dataframes['st_petersburg_incart_df']

dataframes = [cpsc_2018_df, cpsc_2018_extra_df, georgia_df, ptb_df, ptb_xl_df, st_petersburg_incart_df]

# Author's note: age outliers were pre-identified to occur only in ptb_xl_df,
# always as the value 300. Only that exact value is treated as invalid here;
# a previously computed "< 0 or > 100" mask was overwritten before use and
# has been removed.
INVALID_AGE_SENTINEL = 300
invalid_age_indices = ptb_xl_df['Age'] == INVALID_AGE_SENTINEL

# Mark the invalid ages as missing so they are handled with the other NaNs
ptb_xl_df.loc[invalid_age_indices, 'Age'] = np.nan

In [7]:
# # Option 1 - remove the outliers, as they make up less than 1% of the data
# # ptb_xl_df = ptb_xl_df[ptb_xl_df['Age'] != 300]

# # Option 2 - replace the outliers with NaN and preprocess them like any other missing value

# invalid_age_indices = ptb_xl_df['Age'] == 300
# # Replace invalid age values with NaN
# ptb_xl_df.loc[invalid_age_indices, 'Age'] = np.nan

In [8]:
# Placeholder value per column that stands for "missing"; each is swapped
# for NaN in every frame (the frames are modified in place).
placeholders = {'Age': 0, 'Gender': 'Unknown', 'Abnormality': 'Unknown'}
for df in result_dataframes.values():
    for column, placeholder in placeholders.items():
        df[column] = df[column].replace(placeholder, np.nan)

# Report the number of missing values per column for each frame
for df_name, df in result_dataframes.items():
    print(f"Null value sum for DataFrame '{df_name}':")
    null_sum = df.isna().sum()
    print(null_sum)

Null value sum for DataFrame 'cpsc_2018_df':
ID             0
Age            9
Gender         0
Abnormality    0
dtype: int64
Null value sum for DataFrame 'cpsc_2018_extra_df':
ID             0
Age            3
Gender         0
Abnormality    0
dtype: int64
Null value sum for DataFrame 'georgia_df':
ID              0
Age            77
Gender          0
Abnormality     0
dtype: int64
Null value sum for DataFrame 'ptb_df':
ID             0
Age            9
Gender         0
Abnormality    0
dtype: int64
Null value sum for DataFrame 'ptb-xl_df':
ID               0
Age            293
Gender           0
Abnormality      0
dtype: int64
Null value sum for DataFrame 'st_petersburg_incart_df':
ID             0
Age            0
Gender         0
Abnormality    0
dtype: int64


In [24]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Impute missing ages separately for each source frame: fit a linear
# regression of Age on the one-hot encoded 'Abnormality' value using the rows
# with a known age, then predict the age for the rows where it is missing.
# The frames in result_dataframes are modified in place.
for df_name, df in result_dataframes.items():
    # Coerce Age to a numeric dtype; non-numeric entries become NaN
    df['Age'] = pd.to_numeric(df['Age'], errors='coerce')

    y = df['Age']
    missing_age_indices = y.isna()

    if not missing_age_indices.any():
        continue  # Skip if there are no missing 'Age' values

    known_age_mask = (~missing_age_indices).to_numpy()
    if not known_age_mask.any():
        # No row to train on; fitting on zero samples would raise
        print(f"Skipping age imputation for '{df_name}': no known ages to fit on")
        continue

    # One-hot encode 'Abnormality' once for the whole frame; the same matrix
    # supplies both the training rows and the rows to predict.
    encoder = OneHotEncoder()
    X = encoder.fit_transform(df['Abnormality'].values.reshape(-1, 1)).toarray()

    regressor = LinearRegression()
    regressor.fit(X[known_age_mask], y[known_age_mask])

    # Predict all missing ages in one call; rows of X[~known_age_mask] are in
    # the same order as the rows selected by the boolean mask in .loc.
    df.loc[missing_age_indices, 'Age'] = regressor.predict(X[~known_age_mask])

# Now 'Age' values in each dataframe have been updated using regression


In [10]:
# import matplotlib.pyplot as plt

# # Replace this with the actual DataFrames from your result_dataframes dictionary
# cpsc_2018_df = result_dataframes['cpsc_2018_df']
# cpsc_2018_extra_df = result_dataframes['cpsc_2018_extra_df']
# georgia_df = result_dataframes['georgia_df']
# ptb_df = result_dataframes['ptb_df']
# ptb_xl_df = result_dataframes['ptb-xl_df']
# st_petersburg_incart_df = result_dataframes['st_petersburg_incart_df']

# # List of DataFrame objects
# dataframes = [cpsc_2018_df, cpsc_2018_extra_df, georgia_df, ptb_df, ptb_xl_df, st_petersburg_incart_df]

# # Create histograms for the "Age" column in each DataFrame
# for i, df in enumerate(dataframes):
#     plt.figure(figsize=(8, 6))
#     plt.hist(df['Age'], bins=20, color='skyblue', edgecolor='black')
    
#     # Add labels and title
#     plt.xlabel('Age')
#     plt.ylabel('Frequency')
#     plt.title(f'Age Distribution Histogram - {df}')
#     plt.grid(True)
    
#     # Display the histogram
#     plt.show()


In [11]:
# for df in dataframes:
#     plt.figure(figsize=(10, 6))
    
#     # Calculate the average age for each category of abnormality
#     average_age_by_abnormality = df.groupby('Abnormality')['Age'].mean()

#     average_age_by_abnormality.plot(kind='bar', color='skyblue')

#     # Add labels and title
#     plt.xlabel('Abnormality')
#     plt.ylabel('Average Age')
#     plt.title(f'Average Age by Abnormality - {df}')
#     plt.xticks(rotation=45, ha='right')

#     # Display the chart
#     plt.tight_layout()
#     plt.show()

In [12]:
# #According to the histogram representation for df-1 and df-2
# import numpy as np

# # Calculate the median age
# median_age = np.median(cpsc_2018_df['Age'].dropna())
# median_age = np.median(cpsc_2018_extra_df['Age'].dropna())

# # Fill missing values with the median age
# cpsc_2018_df['Age'].fillna(median_age, inplace=True)
# cpsc_2018_extra_df['Age'].fillna(median_age, inplace=True)