 <div style="background-color: #00008B; padding: 10px; color: white;">
<h2 style="color: #00BFFF; font-weight: bold;">optimising the minimum viable product in the heart_disease_ETL_exploratory notebook</h2>
<h3>

- wrap the code into functions as we want to keep it organised into tasks and it will be repetitive to read the data from the four data sources to form a larger dataset for modelling(the more the data the better) thus a function is appropriate
- add source identifier before merging for tracability and docstrings for function documentation
- clean master data set and save to csv and then continue with the machine learning life cycle

In [57]:
import pandas as pd
import numpy as np
import kagglehub
import os
from pathlib import Path

In [58]:
path = kagglehub.dataset_download("abdelazizsami/heart-disease")
print(os.listdir(path))



In [59]:
error_rows = []
def parse_heart_data(data_dir, filename):
    """
    Parses the UCI Heart Disease 'stream' data format into a structured DataFrame.
    
    Identifies records using the 'name' anchor, validates for 76 attributes

    Args:
        file_path (str): The path to the raw .data file.

    Returns:
        pd.DataFrame: A validated DataFrame with 76 columns.
    """
    #  Load raw trxt
    file_path = f"{data_dir}/{filename}"
    with open(f'{file_path}') as file:
        raw_text = file.read().replace('\n', ' ')#new line characters are interrupting so i'll strip them or use replace this
    raw_rows= raw_text.split('name')

    clean_rows = []
        
    # handle each row  text block(eaah patient record)
    for i, record in enumerate(raw_rows):
        values = record.strip().split()
        
        if not values:# this code prevent potential crashes when the anchor 'name' finishes at the end
            continue 
        
        # add 'name' back as the 76th column to maintain original data
        values.append('name')
        
        # validation: Check colunm count
        if len(values) == 76:
            clean_rows.append(values)
        else:
            error_rows.append({
            "patient_index": i,
            "found_length": len(values)
            })
        
    # create DataFrame
    df = pd.DataFrame(clean_rows) 
    return df

In [60]:
my_files = [
    'new.data', #from the kaggle documentation, the cleveland data was corupted and replaced with new.data
    'hungarian.data', 
    'switzerland.data', 
    'long-beach-va.data'
]

In [61]:
all_dfs = []
for file in my_files:
    # We pass 'path' (the KaggleHub directory) into the function
    temp_df = parse_heart_data(path, file)

    print(f"data source: {file} with shape: {temp_df.shape}")

    # add source identifier for the master dataset
    temp_df['source_dataset'] = file
    all_dfs.append(temp_df)

data source: new.data with shape: (0, 0)
data source: hungarian.data with shape: (294, 76)
data source: switzerland.data with shape: (123, 76)
data source: long-beach-va.data with shape: (200, 76)


In [62]:
print(len(error_rows))

308


In [63]:
error_rows[0:6]

[{'patient_index': 0, 'found_length': 90},
 {'patient_index': 1, 'found_length': 90},
 {'patient_index': 2, 'found_length': 90},
 {'patient_index': 3, 'found_length': 90},
 {'patient_index': 4, 'found_length': 90},
 {'patient_index': 5, 'found_length': 90}]

In [64]:
non_new_data_errors = [r for r in error_rows if r.get('found_length') != 90]
print(f"Found {len(non_new_data_errors)} errors that aren't length 90:")
print(non_new_data_errors)


Found 5 errors that aren't length 90:
[{'patient_index': 303, 'found_length': 103500}, {'patient_index': 304, 'found_length': 451}, {'patient_index': 305, 'found_length': 721}, {'patient_index': 306, 'found_length': 91}, {'patient_index': 307, 'found_length': 6662}]


### the new.data file fails the 76 column chech like we see here then it fails; without any info on how this file's columns are structured then there is no safe way forward, truncating s

In [65]:
print(len(all_dfs))

4


In [70]:
# concatenate and Export
processed_dfs=[]
for i, df in enumerate(all_dfs):
    if not df.empty:
        df['source_dataset'] = my_files[i].replace('.data', '')
        processed_dfs.append(df)
master_df = pd.concat(all_dfs, ignore_index=True)# each dataframe will have indexes from 0 to n, and will overlap which defeats the point of being unique thus we set ignore_index to true
master_df.to_csv('heart_disease_master.csv', index=False)# index is set to false to avoid point mutation where the indexes are added as data values

In [71]:
master_df.shape

(617, 77)

### as espected


In [73]:
master_df.head()

Unnamed: 0,source_dataset,0,1,2,3,4,5,6,7,8,...,66,67,68,69,70,71,72,73,74,75
0,hungarian,1254,0,40,1,1,0,0,-9,2,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
1,hungarian,1255,0,49,0,1,0,0,-9,3,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
2,hungarian,1256,0,37,1,1,0,0,-9,2,...,-9,-9,1,1,1,1,1,-9.0,-9.0,name
3,hungarian,1257,0,48,0,1,1,1,-9,4,...,2,-9,1,1,1,1,1,-9.0,-9.0,name
4,hungarian,1258,0,54,1,1,0,1,-9,3,...,1,-9,1,1,1,1,1,-9.0,-9.0,name


In [75]:
df.columns[-1]

'source_dataset'