In [108]:
import os
from pathlib import Path
import wfdb
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [109]:
# Database path
# Directory holding the MIT-BIH Arrhythmia database files. Set the
# MITBIH_DB_DIR environment variable to point at a local copy of the dataset;
# when it is unset, the original machine-specific location is used as the
# fallback so existing setups keep working unchanged.
database_path = Path(
    os.environ.get(
        'MITBIH_DB_DIR',
        'D:/IIT_SE/Learning Resources/4th Year/FYP/Project/ECG Based CAD Prediction System/src/datasets/mit-bih-arrhythmia-database-1.0.0',
    )
)

In [110]:
def _labels_per_sample(n_samples, ann_samples, ann_symbols):
    """Expand sparse annotations into one label per signal sample.

    Parameters
    ----------
    n_samples : int
        Number of rows in the signal array.
    ann_samples : array-like of int
        Sample index of each annotation.
    ann_symbols : sequence
        Symbol of each annotation, aligned with ``ann_samples``.

    Returns
    -------
    list
        ``n_samples`` entries: the symbol of the first annotation located at
        that sample index, or ``None`` where no annotation exists.
    """
    labels = np.full(n_samples, None, dtype=object)
    positions = np.asarray(ann_samples, dtype=np.int64)
    symbols = np.asarray(ann_symbols, dtype=object)

    # Only indices that address an existing row can be labelled; this also
    # keeps negative values from wrapping around under NumPy indexing.
    in_range = (positions >= 0) & (positions < n_samples)

    # return_index yields the first occurrence of each distinct sample index,
    # so when several annotations share a sample the earliest one is kept.
    unique_positions, first_seen = np.unique(positions[in_range], return_index=True)
    labels[unique_positions] = symbols[in_range][first_seen]
    return labels.tolist()


# List of records in the MIT-BIH Arrhythmia dataset
records = ['100']

# One dataframe per successfully loaded record
record_frames = []

# Loop through each record
for record_name in records:
    record_path = database_path / record_name

    # Reading the signal and its annotations are both inside the try so that a
    # record that cannot be loaded is reported and skipped instead of aborting
    # the whole loop.
    try:
        record = wfdb.rdrecord(str(record_path))
        annotation = wfdb.rdann(str(record_path), 'atr')

        # Convert sample indices to labels (None where a sample has no annotation)
        labels = _labels_per_sample(
            len(record.p_signal), annotation.sample, annotation.symbol
        )

        # Create dataframe with waveform data and labels
        combined_data = pd.DataFrame(data=record.p_signal, columns=record.sig_name)
        combined_data['label'] = labels

        record_frames.append(combined_data)
    except Exception as e:
        print(f"Error processing record {record_name}: {e}")

# Concatenate data from all valid records
if record_frames:
    data = pd.concat(record_frames, ignore_index=True)

    # Shuffle the dataset
    data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True)

    # Split the dataset into features and labels
    X = data_shuffled.drop(columns=["label"])
    y = data_shuffled["label"]

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Write training data to CSV file.
    # Only the training split is written here; X_test / y_test stay in memory.
    train_data = pd.concat([X_train, y_train], axis=1)
    train_data.to_csv("100.csv", index=False)
else:
    print("No valid data found to process.")