In [19]:
import json
data = json.load(open("categorical-data.json"))
data


[{'patientId': 'P001',
  'gender': 'female',
  'birthYear': 1982,
  'disease': 'disease_A',
  'therapies': [{'therapyId': 'T1',
    'startDate': '2025-08-20',
    'endDate': '2025-10-07',
    'stopReason': 'adverse_event'},
   {'therapyId': 'T2',
    'startDate': '2025-08-22',
    'endDate': '2025-11-20',
    'stopReason': 'adverse_event'}],
  'sideEffects': [],
  'diagnoses': [{'diagnosisOptionsId': 'D1',
    'startDate': '2024-07-28',
    'endDate': None}],
  'events': []},
 {'patientId': 'P002',
  'gender': 'male',
  'birthYear': 1975,
  'disease': 'disease_B',
  'therapies': [{'therapyId': 'T1',
    'startDate': '2025-08-05',
    'endDate': '2025-09-21',
    'stopReason': 'completed'}],
  'sideEffects': [{'sideEffectId': 'SE1',
    'intensity': 4,
    'startDate': '2025-07-28',
    'endDate': '2025-07-29'}],
  'diagnoses': [{'diagnosisOptionsId': 'D1',
    'startDate': '2024-07-28',
    'endDate': None}],
  'events': [{'event': 'RELAPSE', 'startDate': '2025-10-20'},
   {'event': 'H

In [20]:
import json
import pprint
pp = pprint.PrettyPrinter(indent=2)
pp.pprint(json.load(open("categorical-data.json")))


[ { 'birthYear': 1982,
    'diagnoses': [ { 'diagnosisOptionsId': 'D1',
                     'endDate': None,
                     'startDate': '2024-07-28'}],
    'disease': 'disease_A',
    'events': [],
    'gender': 'female',
    'patientId': 'P001',
    'sideEffects': [],
    'therapies': [ { 'endDate': '2025-10-07',
                     'startDate': '2025-08-20',
                     'stopReason': 'adverse_event',
                     'therapyId': 'T1'},
                   { 'endDate': '2025-11-20',
                     'startDate': '2025-08-22',
                     'stopReason': 'adverse_event',
                     'therapyId': 'T2'}]},
  { 'birthYear': 1975,
    'diagnoses': [ { 'diagnosisOptionsId': 'D1',
                     'endDate': None,
                     'startDate': '2024-07-28'}],
    'disease': 'disease_B',
    'events': [ {'event': 'RELAPSE', 'startDate': '2025-10-20'},
                {'event': 'HOSPITALIZATION', 'startDate': '2025-10-01'}],
    'gender': 'male

In [21]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import json

ts = pd.read_json("timeseries-data.json")
clinical = json.load(open("categorical-data.json"))

ts['start'] = pd.to_datetime(ts['start'])
ts['end'] = pd.to_datetime(ts['end'])
ts['date'] = ts['start'].dt.date

daily_steps = ts.groupby('date')['count'].sum().reset_index()
daily_steps.rename({'count': 'Daily_Step_Count'}, axis=1, inplace=True)

full_range = pd.date_range(daily_steps['date'].min(),
                           daily_steps['date'].max(),
                           freq='D')
daily_steps = daily_steps.set_index('date').reindex(full_range).fillna(0)
daily_steps.index.name = 'date'


df = daily_steps.copy()

# Extract the single patient record (most datasets of this type store 1 patient per file)
clinical_record = clinical[0]

# Demographics
gender = clinical_record['demographics']['gender']
birthyear = clinical_record['demographics']['birthYear']
disease = clinical_record['demographics']['disease']

df['age'] = datetime.now().year - birthyear
df[f'gender_{gender}'] = 1
df[f'disease_{disease}'] = 1

# Therapies
for t in clinical_record['therapies']:
    sid = t['therapyId']
    s = pd.to_datetime(t['startDate']).date()
    e = pd.to_datetime(t['endDate']).date()
    add_interval_feature(df, s, e, f"therapy_{sid}")

# Side effects
df['active_side_effect_count'] = 0
df['max_side_effect_intensity'] = 0

for se in clinical_record['sideEffects']:
    s = pd.to_datetime(se['startDate']).date()
    e = pd.to_datetime(se['endDate']).date()
    intensity = se['intensity']
    mask = (df.index >= s) & (df.index <= e)
    df.loc[mask, 'active_side_effect_count'] += 1
    df.loc[mask, 'max_side_effect_intensity'] = df.loc[mask, 'max_side_effect_intensity'].clip(lower=intensity)

# Diagnoses
for d in clinical_record['diagnoses']:
    s = pd.to_datetime(d['startDate']).date()
    e = pd.to_datetime(d['endDate']).date()
    diag = d['diagnosisOptionsId']
    add_interval_feature(df, s, e, f"diagnosis_{diag}")

# Events
df['days_since_event'] = np.nan
last_event = None
event_dates = {pd.to_datetime(ev['startDate']).date() for ev in clinical_record['events']}

for day in df.index:
    if day in event_dates:
        last_event = day
        df.loc[day, 'days_since_event'] = 0
    elif last_event:
        df.loc[day, 'days_since_event'] = (day - last_event).days

df['days_since_event'] = df['days_since_event'].fillna(df['days_since_event'].max())



KeyError: 'demographics'