In [None]:
import os
import pathlib

import numpy as np
import pandas as pd
import ray
from tqdm.auto import tqdm

In [None]:
# Root directory under which all input and output files of this notebook live.
# The original location stays the default; set BIHEALTH_BASE_PATH to run the
# notebook on another machine without editing the code.
base_path = os.environ.get("BIHEALTH_BASE_PATH", "/home/jakobs")

In [None]:
# Baseline table; later cells take "eid" and "recruitment_date" from it.
data_baseline = pd.read_feather(
    f"{base_path}/data/data_baseline_230321.feather"
)

In [None]:
# Display the loaded baseline table for a quick visual check.
data_baseline

## Map records to phecodes

In [None]:
# Endpoint metadata indexed by endpoint name. The CSV's "Unnamed: 0" column
# (presumably a saved row index) is discarded.
endpoints_md = (
    pd.read_csv(f"{base_path}/BiHealth/onnx/endpoints.csv", dtype={"phecode": str})
    .drop(columns="Unnamed: 0")
    .set_index("endpoint")
)
endpoints_md

In [None]:
# Phecode description table, ordered by phecode. Codes are read as strings so
# that leading zeros and the decimal part are preserved.
phecodes_md = (
    pd.read_csv(
        f"{base_path}/mapping/phecode_strings_V2.csv",
        dtype={"phecode": str, "icd10": str},
    )
    .sort_values("phecode")
    .reset_index(drop=True)
)
# Convert "123.4" to "123-4". regex=False makes "." a literal dot on every
# pandas version; the default for `regex` changed in pandas 2.0, and as a
# regular expression "." would match every character.
phecodes_md["phecode"] = phecodes_md["phecode"].str.replace(".", "-", regex=False)
phecodes_md

In [None]:
# ICD-10 -> phecode lookup table reduced to the two columns used for joining;
# "icd10" is renamed to "icd" to match the record tables.
icd10_phecode_mapping = (
    pd.read_csv(
        f"{base_path}/mapping/ICD10_to_phecode_V2.csv",
        dtype={"phecode": str, "icd10": str},
    )
    .rename(columns={"icd10": "icd"})[["icd", "phecode"]]
)
# Convert "123.4" to "123-4". regex=False makes "." a literal dot on every
# pandas version; the default for `regex` changed in pandas 2.0, and as a
# regular expression "." would match every character.
icd10_phecode_mapping["phecode"] = icd10_phecode_mapping["phecode"].str.replace(".", "-", regex=False)
icd10_phecode_mapping

## Create metadata and load records

In [None]:
# HES records in long format, harmonised to (eid, icd, date) and tagged with
# their source.
hes = (
    pd.read_stata(f"{base_path}/BiHealth/Data/StudyData/HESLong2019HC3.dta")
    .rename(columns={"bihid": "eid", "admidate": "date", "Diag": "icd"})
    .loc[:, ["eid", "icd", "date"]]
    .assign(source="HES")
)

In [None]:
# Death records in long format, harmonised to (eid, icd, date) and tagged with
# their source.
deaths = (
    pd.read_stata(f"{base_path}/BiHealth/Data/StudyData/DeathsLong2019HC3.dta")
    .rename(columns={"bihid": "eid", "dod": "date", "diag": "icd"})
    .loc[:, ["eid", "icd", "date"]]
    .assign(source="ONS")
)

In [None]:
# Per-participant follow-up metadata: recruitment date, death date (NaT when no
# death record exists), exit date and follow-up time in years.
# NOTE(review): assumes at most one distinct death date per eid; otherwise the
# left merge yields several rows for that participant.
metadata_md = (
    data_baseline[["eid", "recruitment_date"]]
    .merge(
        deaths[["eid", "date"]].drop_duplicates().rename(columns={"date": "death_date"}),
        on="eid",
        how="left",
    )
    # pd.Timestamp replaces pd.to_datetime(..., infer_datetime_format=True):
    # that argument is deprecated since pandas 2.0 and has no effect there.
    .assign(censoring_date=pd.Timestamp("2019-06-17"))
)
# Exit is the death date when one is recorded, otherwise the censoring date.
metadata_md["exit_date"] = metadata_md["death_date"].combine_first(metadata_md["censoring_date"])
metadata_md = metadata_md[["eid", "recruitment_date", "death_date", "exit_date"]].assign(
    exit_time=lambda x: (x.exit_date - x.recruitment_date).dt.days / 365.25
)

In [None]:
# Stack both record sources, order by participant and date, and drop rows that
# are exact duplicates.
icd_long = (
    pd.concat([hes, deaths], axis=0)
    .sort_values(["eid", "date"])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Attach a phecode to every record via its ICD code; records without a match
# keep a missing phecode and are handled in the next step.
phecodes_long = pd.merge(icd_long, icd10_phecode_mapping, how="left", on="icd")
phecodes_long

In [None]:
# Records whose full ICD code already has a phecode ("phecode==phecode" is
# false only for missing values).
phecodes_long_mapped_native = phecodes_long.query("phecode==phecode")

# Fallback for the remainder: retry the lookup with the ICD code truncated to
# its first three characters.
phecodes_long_unmapped_hack = (
    phecodes_long.query("phecode!=phecode")
    .drop(columns="phecode")
    .assign(icd=lambda unmapped: unmapped.icd.str.slice(0, 3))
    .merge(icd10_phecode_mapping, how="left", on="icd")
)

# Combine both sets and keep only rows that ended up with a phecode.
phecodes_long_mapped = (
    pd.concat([phecodes_long_mapped_native, phecodes_long_unmapped_hack], axis=0)
    .drop_duplicates()
    .sort_values(["eid", "date"])
    .query("phecode==phecode")
    .reset_index(drop=True)
)
phecodes_long_mapped["endpoint"] = "phecode_" + phecodes_long_mapped["phecode"]
phecodes_long_mapped

In [None]:
# One row per distinct (eid, death date), labelled with the fixed endpoint
# "OMOP_4306655" and empty icd/phecode fields, in the same column layout as the
# phecode records.
deaths_long_mapped = (
    deaths[["eid", "date"]]
    .drop_duplicates()
    .assign(source="ONS", icd="", phecode="", endpoint="OMOP_4306655")
    .loc[:, ["eid", "icd", "date", "source", "phecode", "endpoint"]]
)
deaths_long_mapped

In [None]:
# Phecode and death records combined, ordered by participant and date, enriched
# with the per-participant dates and reduced to the columns used downstream.
endpoints_long_mapped = (
    pd.concat([phecodes_long_mapped, deaths_long_mapped], axis=0)
    .sort_values(["eid", "date"])
    .reset_index(drop=True)
    .merge(metadata_md, on="eid", how="left")
    .loc[
        :,
        ["eid", "recruitment_date", "death_date", "exit_date", "source", "date", "icd", "phecode", "endpoint"],
    ]
)
endpoints_long_mapped

In [None]:
# Persist the combined long-format record table (phecode and death rows).
endpoints_long_mapped.to_feather(f"{base_path}/data/records_long_230320.feather")

In [None]:
# Split records at recruitment: on or before the recruitment date counts as
# prevalent, strictly after as incident. Rows with a missing date satisfy
# neither comparison and fall into neither set.
records_prev = endpoints_long_mapped.loc[lambda rec: rec["date"] <= rec["recruitment_date"]]
records_inc = endpoints_long_mapped.loc[lambda rec: rec["date"] > rec["recruitment_date"]]

In [None]:
# One row per (participant, endpoint) that has at least one prevalent record,
# flagged with prev=1.
data_outcomes_prev = (
    records_prev.loc[:, ["eid", "endpoint"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(prev=1)
)
data_outcomes_prev

In [None]:
# First incident event per (participant, endpoint) with its time since
# recruitment in years.
# The rows are sorted by date, so keeping the first row per (eid, endpoint)
# selects the earliest occurrence. The duplicate key must not include "date":
# otherwise every repeat occurrence survives and the later pivot_table (default
# aggfunc="mean") averages their times instead of using the first event.
data_outcomes_event_time = (
    records_inc.sort_values(["eid", "endpoint", "date"])
    .drop_duplicates(["eid", "endpoint"], keep="first")
    .assign(event=1, time=lambda x: (x.date - x.recruitment_date).dt.days / 365.25)
    .reset_index(drop=True)[["eid", "endpoint", "event", "time"]]
)
data_outcomes_event_time

In [None]:
# Separate long tables for the event indicator and the event time, each keyed
# by (eid, endpoint), so that each can be pivoted on its own.
data_outcomes_event = data_outcomes_event_time.loc[:, ["eid", "endpoint", "event"]]
data_outcomes_time = data_outcomes_event_time.loc[:, ["eid", "endpoint", "time"]]

In [None]:
# Wide prevalence matrix: one row per baseline participant, one "<endpoint>_prev"
# column per endpoint, 0 where no prevalent record exists.
temp_wide = data_outcomes_prev.pivot_table(index="eid", columns="endpoint", values="prev", fill_value=0)
data_outcomes_prev_wide = (
    data_baseline[["eid"]]
    .merge(temp_wide, on="eid", how="left")  # keeps participants without any record
    .fillna(0)
    .reset_index(drop=True)
    .set_index("eid")
)
data_outcomes_prev_wide.columns = [f"{name}_prev" for name in data_outcomes_prev_wide.columns]

data_outcomes_prev_wide

In [None]:
# Wide event matrix: one row per baseline participant, one "<endpoint>_event"
# column per endpoint, 0 where no incident record exists.
temp_wide = data_outcomes_event.pivot_table(index="eid", columns="endpoint", values="event", fill_value=0)
data_outcomes_event_wide = (
    data_baseline[["eid"]]
    .merge(temp_wide, on="eid", how="left")  # keeps participants without any record
    .fillna(0)
    .reset_index(drop=True)
    .set_index("eid")
)
data_outcomes_event_wide.columns = [f"{name}_event" for name in data_outcomes_event_wide.columns]

data_outcomes_event_wide

In [None]:
# Wide time matrix: one row per baseline participant, one "<endpoint>_time"
# column per endpoint. Cells without an incident event are filled with the
# participant's exit_time.
temp_wide = data_outcomes_time.pivot_table(index="eid", columns="endpoint", values="time", fill_value=np.nan)
data_outcomes_time_wide = (
    data_baseline[["eid"]]
    .merge(temp_wide, on="eid", how="left")
    .reset_index(drop=True)
    .set_index("eid")
)

# Follow-up time per participant, looked up by eid. Built as a separate Series
# so metadata_md itself is never re-indexed and the cell can be re-run safely.
# NOTE(review): assumes eid is unique in metadata_md.
exit_time_by_eid = metadata_md.set_index("eid")["exit_time"]

# Column-wise fill: Series.fillna aligns the fill values on the eid index, so
# each missing cell gets that row's exit_time. This replaces a row-by-row
# apply that did one metadata lookup per participant.
data_outcomes_time_wide = data_outcomes_time_wide.apply(
    lambda endpoint_times: endpoint_times.fillna(exit_time_by_eid)
)

data_outcomes_time_wide.columns = [f"{c}_time" for c in data_outcomes_time_wide.columns]

data_outcomes_time_wide

In [None]:
# Combine the prev / event / time matrices on their shared eid index and order
# the columns alphabetically, which groups each endpoint's columns together.
data_outcomes_wide = pd.merge(
    pd.merge(
        data_outcomes_prev_wide,
        data_outcomes_event_wide,
        how="left",
        left_index=True,
        right_index=True,
    ),
    data_outcomes_time_wide,
    how="left",
    left_index=True,
    right_index=True,
).sort_index(axis=1)
data_outcomes_wide

In [None]:
# Write the wide outcome table; reset_index() turns the "eid" index back into a
# regular column before saving.
data_outcomes_wide.reset_index().to_feather(f"{base_path}/data/data_outcomes_wide_230320.feather")

In [None]:
# Recover the sorted, unique endpoint names by stripping the _prev / _event /
# _time markers from the wide column names.
endpoints = sorted(
    {
        column.replace("_prev", "").replace("_event", "").replace("_time", "")
        for column in data_outcomes_wide.columns
    }
)

In [None]:
# Display the derived list of endpoint names.
endpoints

In [None]:
# Build one frame per endpoint with the generic columns "endpoint" plus
# whichever of prev / event / time exist for it, ready to be stacked.
dfs_long = []
for endpoint_name in tqdm(endpoints):
    wanted = {f"{endpoint_name}_prev", f"{endpoint_name}_event", f"{endpoint_name}_time"}
    # Iterate over the wide columns so the original column order is kept.
    selected = [column for column in data_outcomes_wide.columns if column in wanted]
    short_names = [column.replace(f"{endpoint_name}_", "") for column in selected]
    endpoint_frame = (
        data_outcomes_wide[selected]
        .rename(columns=dict(zip(selected, short_names)))
        .assign(endpoint=endpoint_name)[["endpoint"] + short_names]
    )
    dfs_long.append(endpoint_frame)

In [None]:
# Stack the per-endpoint frames; endpoints without a prev column get prev=0.
data_outcomes_long_pre = (
    pd.concat(dfs_long, axis=0)
    .reset_index()
    .assign(prev=lambda stacked: stacked["prev"].fillna(0.0))
)

# Per-participant (event, time) values taken from rows without an event; they
# are used below to fill endpoints that had no event/time columns at all.
metadata_long = (
    data_outcomes_long_pre.loc[lambda stacked: stacked["event"] == 0, ["eid", "event", "time"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

merged_df = (
    data_outcomes_long_pre.merge(metadata_long, on="eid", how="left", suffixes=("_outcomes", "_metadata"))
    .assign(
        event=lambda m: m["event_outcomes"].fillna(m["event_metadata"]),
        time=lambda m: m["time_outcomes"].fillna(m["time_metadata"]),
    )
    .drop(columns=["event_metadata", "time_metadata", "event_outcomes", "time_outcomes"])
)
data_outcomes_long = merged_df[["eid", "endpoint", "prev", "event", "time"]]

In [None]:
# Persist the long-format outcome table (eid, endpoint, prev, event, time).
data_outcomes_long.to_feather(f"{base_path}/data/data_outcomes_long_230320.feather")

In [None]:
# Spot-check ten random rows; no random_state is set, so the rows differ
# between runs.
data_outcomes_long.sample(10)

In [None]:
# Sanity check: rows per participant. The long table stacks one block of
# participants per endpoint, so this should equal the number of endpoints.
# The participant count is derived from the data instead of a hard-coded
# cohort size, so the check stays valid if the baseline file changes.
len(data_outcomes_long) / data_outcomes_long["eid"].nunique()