## Fetch 2009 PhysioNet challenge data

https://archive.physionet.org/challenge/2009/

In [None]:
from bs4 import BeautifulSoup
import os
import pandas as pd
import requests
import tarfile
import wfdb

from src.data.parsing import parse_txt

# Local directory that holds the downloaded archive and its extracted contents.
target_dir = 'data'
# Remote location of the 2009 challenge clinical-data archive.
source_url = 'https://archive.physionet.org/challenge/2009/training-set-clinical-data.tar.gz'
# Where the downloaded archive is stored on disk (inside target_dir).
target_path = f'{target_dir}/training-set-clinical-data.tar.gz'




In [None]:
def fetch_settings():
    """Return the notebook's run-time switches as a freshly built dict.

    Keys:
        'fetch_clinical_data': when true, the clinical-data archive is
            downloaded and extracted; otherwise the download step is skipped.
        'verbose': when true, the data-set helpers print diagnostic output.
    """
    return dict(fetch_clinical_data=False, verbose=False)

## Fetch archive of PhysioNet challenge records

In [None]:
# Download and unpack the clinical-data archive only when explicitly enabled;
# otherwise rely on whatever has already been extracted under target_dir.
if fetch_settings()['fetch_clinical_data']:
    # The archive is written inside target_dir, which may not exist yet.
    os.makedirs(target_dir, exist_ok=True)

    # The context manager releases the connection; the timeout keeps a stalled
    # server from hanging the notebook indefinitely.
    with requests.get(source_url, stream=True, timeout=60) as response:
        # Fail loudly on an HTTP error instead of silently skipping the write
        # and then extracting a missing or stale archive.
        response.raise_for_status()
        with open(target_path, 'wb') as f:
            # Copy the undecoded byte stream in 1 MiB chunks so the whole
            # archive is never held in memory at once.
            for chunk in iter(lambda: response.raw.read(1024 * 1024), b''):
                f.write(chunk)

    with tarfile.open(target_path, "r:gz") as tar_file:
        # Prefer the "data" extraction filter where this Python provides it, so
        # archive members cannot be written outside target_dir.
        if hasattr(tarfile, 'data_filter'):
            tar_file.extractall(target_dir, filter='data')
        else:
            tar_file.extractall(target_dir)
else:
    print("Use cached clinical data")

## Handle individual records

In [None]:
# Read the tab-separated record map, ignoring the file's first two lines and
# supplying the column names explicitly.
record_map = pd.read_csv(
    filepath_or_buffer='data/mimic2cdb/MAP',
    skiprows=[0, 1],
    index_col=False,
    names=['Clinical', 'Wave', 'Sex', 'Age', 'Birthdate', 'Waveform'],
    sep="\t",
)
# Preview the first rows as the cell's output.
record_map.head()


## Functions to generate waveform data set

In [None]:
def generate_record_map():
    """Load the record map and list the distinct IDs it contains.

    Reads the tab-separated file ``data/mimic2cdb/MAP``, skipping its first
    two lines and applying fixed column names. When the ``verbose`` setting is
    enabled, the table's shape and the number of distinct clinical and
    waveform IDs are printed.

    Returns:
        A dict with three entries: ``'data'`` (the loaded DataFrame),
        ``'clinical_entities'`` (unique values of the ``Clinical`` column as a
        list) and ``'waveform_entities'`` (unique values of the ``Wave``
        column as a list).
    """
    verbose = fetch_settings()['verbose']
    column_names = ['Clinical', 'Wave', 'Sex', 'Age', 'Birthdate', 'Waveform']
    record_table = pd.read_csv(
        'data/mimic2cdb/MAP',
        sep="\t",
        names=column_names,
        index_col=False,
        skiprows=[0, 1],
    )

    clinical_ids = record_table['Clinical'].unique().tolist()
    waveform_ids = record_table['Wave'].unique().tolist()

    if verbose:
        print(f"Dimensions of data set: {record_table.shape}")
        print(f"Data set reflects data for {len(clinical_ids)} clinical IDs")
        print(f"Data set reflects data for {len(waveform_ids)} waveform IDs")

    return {
        'data': record_table,
        'clinical_entities': clinical_ids,
        'waveform_entities': waveform_ids,
    }


def filter_data_to_entity(df, entity_colname, entity):
    """Return the rows of ``df`` whose ``entity_colname`` value equals ``entity``.

    Args:
        df: Table to filter.
        entity_colname: Name of the column to compare against.
        entity: Value a row must hold in that column to be kept.

    Returns:
        The matching rows, with their original index labels and columns.
    """
    is_match = df[entity_colname] == entity
    return df[is_match]


def generate_waveform_dataset(e, df):
    """Bundle one waveform ID's record-map row with its WFDB record.

    Args:
        e: Waveform ID to look up in the ``Wave`` column of ``df``.
        df: Record-map table; it must contain a ``Wave`` column.

    Returns:
        A dict with ``'raw_data'`` (the matching row as a column-to-value
        dict) and ``'waveform_data'`` (the object returned by
        ``wfdb.rdrecord`` for ``data/train_wave/<Wave>``).

    Raises:
        ValueError: If ``df`` does not contain exactly one row for ``e``.
    """
    settings = fetch_settings()
    matches = filter_data_to_entity(df, 'Wave', e)
    # Only a single matching row converts to a flat {column: value} dict. With
    # zero or several rows the record path below would be built from a nested
    # dict, so reject those cases with a clear message instead.
    if len(matches) != 1:
        raise ValueError(
            f"Expected exactly one row with Wave == {e!r}, found {len(matches)}"
        )
    data = matches.iloc[0].to_dict()
    if settings['verbose']:
        print(data)
    record = wfdb.rdrecord(f"data/train_wave/{data['Wave']}")
    return {
        'raw_data': data,
        'waveform_data': record
    }
    

## Generate the data set

In [None]:
# Load the record map, then build a waveform data set for each of the first
# ten waveform IDs, keyed by that ID.
record_map = generate_record_map()
x = {
    wave_id: generate_waveform_dataset(wave_id, record_map['data'])
    for wave_id in record_map['waveform_entities'][:10]
}