In [None]:
import pathlib
import socket

import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import hydra
from omegaconf import DictConfig, OmegaConf

import torch
from torch_geometric import seed_everything

import ray

In [None]:
# Resolve the storage root from the machine this notebook runs on: hosts whose
# name contains "sc" use the /sc-projects mount, every other host uses the
# shared analysis mount.
# socket.gethostname() replaces the IPython-only `!hostname` shell escape, so
# the cell also runs as plain Python and cannot fail on empty command output.
# `node` stays a one-element list so that `node[0]` keeps working.
node = [socket.gethostname()]
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else:
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

# Project-level locations, all derived from base_path.
project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

# Experiment-specific output folder inside the project's data directory.
experiment = 220531
experiment_path = f"{output_path}/{experiment}"

# Create every output directory up front; existing directories are left as-is.
for directory in (figure_path, output_path, experiment_path):
    pathlib.Path(directory).mkdir(parents=True, exist_ok=True)

In [None]:
# Source table with one row per record; the cells below read the columns
# eid, concept_id, date, recruitment_date and exit_date from it.
# NOTE(review): this absolute path bypasses base_path, so the load only works
# where /sc-projects is mounted — confirm whether that is intended.
records_long_file = "/sc-projects/sc-proj-ukb-cvd/data/2_datasets_pre/211110_anewbeginning/artifacts/final_records_omop_220531.feather"
records_long = pd.read_feather(records_long_file)

## Check Records

In [None]:
# Input-side records: distinct rows (over the selected columns) dated on or
# before the recruitment date, flagged with record_in = 1.0.
record_columns = ["eid", "concept_id", "date", "recruitment_date"]
records_in = (
    records_long[record_columns]
    .query("date<=recruitment_date")
    .drop_duplicates()
    .assign(record_in=1.)
    .rename(columns={"concept_id": "concept"})
)

In [None]:
# Load the exported long-format records table that is checked against
# records_in, then display it.
records_out_file = f"{output_path}/baseline_records_long_220627.feather"
records_out = pd.read_feather(records_out_file)
records_out

In [None]:
# Outer merge keeps (eid, concept) pairs found on either side; a flag missing
# after the merge means "not present on that side" and becomes False.
records_comparison = (
    records_out
    .merge(
        records_in[["eid", "concept", "record_in"]],
        on=["eid", "concept"],
        how="outer",
    )
    .assign(
        record_in=lambda frame: frame["record_in"].fillna(0).astype(bool),
        record=lambda frame: frame["record"].fillna(0).astype(bool),
    )
)

In [None]:
# Keep only the key and flag columns, one row per distinct combination.
records_check_columns = ["eid", "concept", "record_in", "record"]
records_comparison_clean = (
    records_comparison[records_check_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Pairs whose input-side and output-side flags agree, ordered by eid.
records_matching = records_comparison_clean.query("record_in==record")
records_matching.sort_values("eid")

In [None]:
# Pairs whose input-side and output-side flags disagree, ordered by eid.
records_comparison_clean_mismatch = (
    records_comparison_clean
    .query("record_in!=record")
    .sort_values("eid")
)

In [None]:
# Tag each mismatch with the part of the concept string before the first "_"
# and show only the mismatches whose tag is not 'phecode'.
(
    records_comparison_clean_mismatch
    .assign(vocab=lambda frame: frame["concept"].str.split(pat="_").str.get(0))
    .query("vocab!='phecode'")
)

In [None]:
# Persist the record comparison table in the experiment folder.
records_check_file = f"{experiment_path}/sanity_check_records_220627.feather"
records_comparison_clean.reset_index(drop=True).to_feather(records_check_file)

## Check Outcomes

In [None]:
# Input-side events: distinct rows (over the selected columns) dated strictly
# after recruitment and no later than exit_date, flagged with event_in = 1.0.
endpoint_columns = ["eid", "concept_id", "date", "recruitment_date", "exit_date"]
endpoints_in = (
    records_long[endpoint_columns]
    .query("date>recruitment_date")
    .query("date<=exit_date")
    .drop_duplicates()
    .assign(event_in=1.)
    .rename(columns={"concept_id": "endpoint"})
    # regex=False pins a literal "." -> "-" replacement. Without it the result
    # depends on the pandas version's default for `regex`, and as a regular
    # expression "." would match every character.
    .assign(endpoint=lambda frame: frame["endpoint"].str.replace(".", "-", regex=False))
)

In [None]:
# Load the exported long-format outcomes table that is checked against
# endpoints_in, then display it.
endpoints_out_file = f"{output_path}/baseline_outcomes_long_220627.feather"
endpoints_out = pd.read_feather(endpoints_out_file)
endpoints_out

In [None]:
# Left merge: every row of endpoints_out is kept and annotated with event_in;
# pairs without an input-side event get event_in = False.
endpoints_comparison = (
    endpoints_out
    .merge(
        endpoints_in[["eid", "endpoint", "event_in"]],
        on=["eid", "endpoint"],
        how="left",
    )
    .assign(event_in=lambda frame: frame["event_in"].fillna(0).astype(bool))
)

In [None]:
# Keep only the key and flag columns, one row per distinct combination.
endpoints_check_columns = ["eid", "endpoint", "event_in", "event"]
endpoints_comparison_clean = (
    endpoints_comparison[endpoints_check_columns]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Rows where the output event flag equals the input-side flag, ordered by eid.
endpoints_matching = endpoints_comparison_clean.query("event==event_in")
endpoints_matching.sort_values("eid")

In [None]:
# Rows where the output event flag differs from the input-side flag, ordered by eid.
endpoints_mismatching = endpoints_comparison_clean.query("event!=event_in")
endpoints_mismatching.sort_values("eid")

In [None]:
# Mismatching rows for which the input side has no event (event_in is False),
# ordered by eid.
(
    endpoints_comparison_clean
    .query("event!=event_in")
    .query("event_in==False")
    .sort_values("eid")
)

In [None]:
# Keep the mismatching rows whose input-side flag is False for the counts below.
sanity_check_false = (
    endpoints_comparison_clean
    .query("event!=event_in")
    .query("event_in==False")
    .sort_values("eid")
)

In [None]:
# Number of distinct eid values among the rows in sanity_check_false.
sanity_check_false["eid"].nunique()

In [None]:
# Number of distinct eid values among the rows in sanity_check_false.
# NOTE(review): this cell repeats the preceding one — confirm whether a
# different count was intended here.
sanity_check_false["eid"].nunique()

In [None]:
# Persist the outcome comparison table in the experiment folder.
endpoints_check_file = f"{experiment_path}/sanity_check_220627.feather"
endpoints_comparison_clean.reset_index(drop=True).to_feather(endpoints_check_file)

In [None]:
# Display the path of the outcome sanity-check file (the same path string the
# to_feather call in this notebook writes to).
f"{experiment_path}/sanity_check_220627.feather"