In [None]:
import pandas as pd
import numpy as np
import pathlib
from tqdm.auto import tqdm

import hydra
from omegaconf import DictConfig, OmegaConf

import ray

In [None]:
node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: 
    base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

project_label = "22_medical_records"
project_path = f"{base_path}/results/projects/{project_label}"
figure_path = f"{project_path}/figures"
output_path = f"{project_path}/data"

pathlib.Path(figure_path).mkdir(parents=True, exist_ok=True)
pathlib.Path(output_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Read the baseline records snapshot from the project data directory and index it by "eid".
records_file = f"{output_path}/baseline_records_220627.feather"
records = pd.read_feather(records_file).set_index("eid")

In [None]:
# Display the loaded records table (indexed by "eid").
records

In [None]:
# Sum across all columns for every row of `records`, then turn the result into a
# two-column frame: the former index plus the total in "n_records".
records_per_individual = (
    records.sum(axis="columns")
    .rename("n_records")
    .reset_index()
)

In [None]:
# Display the per-row totals computed from `records`.
records_per_individual

In [None]:
# Write the per-individual totals to the dataset artifacts directory.
# The location is built from `base_path` (chosen per host earlier in the notebook)
# instead of a hard-coded "/sc-projects/..." root; on the "sc" host the resulting
# path is identical to the previous literal.
records_per_individual.to_feather(
    f"{base_path}/data/2_datasets_pre/211110_anewbeginning/artifacts/records_per_individual_220627.feather"
)

In [None]:
# Print pandas' summary of the records table (index, column dtypes, memory usage).
records.info()

In [None]:
# Column-wise totals of `records`, largest first, as a one-column frame:
# index "record" holds the column labels of `records`, column "n" holds the totals.
records_freq = (
    records.sum(axis=0)
    .sort_values(ascending=False)
    .rename("n")
    .rename_axis("record")
    .to_frame()
)
records_freq

In [None]:
# Load the Athena CONCEPT table (tab-separated). `concept_id` is read as a string so
# it can be concatenated with the "OMOP_" prefix below.
# The file is located via `base_path` (chosen per host earlier in the notebook) rather
# than a hard-coded "/sc-projects/..." root; on the "sc" host the path is unchanged.
# NOTE(review): pandas' default quote handling is active here. If concept names contain
# literal double quotes, rows could be merged silently — confirm against the file and
# consider quoting=csv.QUOTE_NONE.
concepts_raw = pd.read_csv(
    f"{base_path}/data/mapping/athena/CONCEPT.csv",
    sep="\t",
    engine="c",
    dtype={"concept_id": str},
)

# Build the "OMOP_<concept_id>" key and use it as the index for the later join
# with the record frequencies.
concepts_raw["record"] = "OMOP_" + concepts_raw["concept_id"]
concept_raw = concepts_raw.set_index("record")

In [None]:
# Attach concept metadata to every record frequency by matching the two indexes.
# A left join keeps all rows of `records_freq`, including records without a concept match.
records_freq_md = pd.merge(
    records_freq,
    concept_raw,
    how="left",
    left_index=True,
    right_index=True,
)

In [None]:
# Shape of the frequency table after keeping only records with n strictly greater
# than each cut-off. Replaces six copy-pasted cells that differed only in the cut-off;
# the query strings ("n>1000", "n>100", ...) are the same as before.
count_thresholds = [1000, 100, 50, 25, 10, 5]
pd.DataFrame(
    [records_freq_md.query(f"n>{threshold}").shape for threshold in count_thresholds],
    index=pd.Index(count_thresholds, name="n_greater_than"),
    columns=["n_rows", "n_columns"],
)

In [None]:
# Destination of the record-frequency artifact. Built from `base_path` (chosen per host
# earlier in the notebook) instead of a hard-coded "/sc-projects/..." root; on the "sc"
# host the resulting path is identical to the previous literal.
artifact_path = f"{base_path}/data/2_datasets_pre/211110_anewbeginning/artifacts/record_frequencies_220627.feather"

In [None]:
# Columns written to the artifact: the record key, its count, and a subset of the
# concept metadata columns.
artifact_columns = [
    "record",
    "n",
    "concept_id",
    "concept_name",
    "domain_id",
    "vocabulary_id",
    "concept_class_id",
    "standard_concept",
]

# reset_index() turns the "record" index back into a column so it can be selected
# and stored in the feather file.
records_freq_md.reset_index()[artifact_columns].to_feather(artifact_path)

In [None]:
import wandb

# Log the frequency table to Weights & Biases as a reference to the local file
# (added via a file:// URI with checksum=True).
# The run is used as a context manager so it is always finished, even when creating
# or logging the artifact raises; previously run.finish() was skipped on error.
with wandb.init(project="RecordGraphs", entity="cardiors", tags=["artifacts"]) as run:
    artifact = wandb.Artifact("RecordFrequencies", type="prepare_records")
    artifact.add_reference(f"file://{artifact_path}", "RecordsMetadata", checksum=True)
    run.log_artifact(artifact)

In [None]:
# Display the frequency table with "record" as a regular column instead of the index.
records_freq_md.reset_index()