# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.auto import tqdm

node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

dataset_name = "211110_anewbeginning"
mapping_path = f"{base_path}/data/mapping"
dataset_path = f"{base_path}/data/2_datasets_pre/{dataset_name}"

In [None]:
from pathlib import Path
# Create the dataset output directory, including missing parents; no error if it already exists.
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [None]:
import wandb

# Weights & Biases destination (project / entity) and the date stamp that is
# appended to the artifact file names written by this notebook.
project = "recordgraphs"
entity = "cardiors"
artifact_date = "220621"

In [None]:
def create_artifact(project, entity, artifact_name, type_name, description, artifact_path, df):
    """Log a local file as a reference artifact in a dedicated W&B run.

    Args:
        project: W&B project the run is created in.
        entity: W&B entity that owns the project.
        artifact_name: Name of the artifact; also used as the name of the
            single reference entry added to it.
        type_name: Artifact type passed to ``wandb.Artifact``.
        description: Free-text description stored with the artifact.
        artifact_path: Path of the file on disk; it is registered through
            ``add_reference`` as ``file://<artifact_path>`` with checksum.
        df: DataFrame belonging to the file; only its number of rows and its
            column names are read, to fill the artifact metadata.
    """
    # The run is used as a context manager so that it is always finished,
    # even when building or logging the artifact raises. Previously an
    # exception left the run open.
    with wandb.init(
        project=project, job_type="log_artifact", entity=entity, tags=["artifact"]
    ) as run:
        artifact = wandb.Artifact(
            artifact_name,
            type=type_name,
            description=description,
            metadata={
                "1_shape": f"{len(df)}x{len(df.columns)}",
                "2_cols": str(df.columns.to_list()),
            },
        )
        artifact.add_reference(f"file://{artifact_path}", artifact_name, checksum=True)
        run.log_artifact(artifact)

# Prepare Patient Records

## Mappings + Vocabulary

In [None]:
# Load the vocabulary tables from the "athena" mapping directory into a dict
# keyed by a short table name. The files carry a .csv extension but are read
# as tab-separated.
vocab_dir = f"{mapping_path}/athena"
vocab = {
    table: pd.read_csv(f"{vocab_dir}/{file_stem}.csv", sep="\t")
    for table, file_stem in (
        ("concept", "CONCEPT"),
        ("domain", "DOMAIN"),
        ("class", "CONCEPT_CLASS"),
        ("relationship", "RELATIONSHIP"),
        ("drug_strength", "DRUG_STRENGTH"),
        ("vocabulary", "VOCABULARY"),
        ("concept_synonym", "CONCEPT_SYNONYM"),
        ("concept_ancestor", "CONCEPT_ANCESTOR"),
        ("concept_relationship", "CONCEPT_RELATIONSHIP"),
    )
}

In [None]:
def get_fields(fields, data, data_field):
    """Select the field-dictionary rows for ``fields``, ordered like ``fields``.

    Only rows whose ``field.tab`` matches ``f.<digits>.0.<digit>`` are kept
    (presumably the first instance of each field; confirm against the
    field dictionary).

    Args:
        fields: Showcase field ids (strings) in the desired output order.
        data: Unused; kept so all ``get_*`` helpers share one signature.
        data_field: Field dictionary with at least the columns
            ``field.showcase`` and ``field.tab``.

    Returns:
        The matching rows of ``data_field`` sorted by the position of their
        ``field.showcase`` in ``fields``. The previous index is kept as an
        ``index`` column.
    """
    is_requested = data_field["field.showcase"].isin(fields)
    matches_pattern = data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")
    selected = data_field[is_requested & matches_pattern].copy()

    # A temporary ordered categorical makes the sort follow the order of `fields`.
    selected["field"] = pd.Categorical(selected["field.showcase"], categories=fields, ordered=True)
    selected = selected.sort_values("field")
    selected = selected.reset_index()
    return selected.drop("field", axis=1)

def get_fields_all(fields, data, data_field):
    """Select every field-dictionary row for ``fields``, ordered like ``fields``.

    Unlike ``get_fields`` no filter on ``field.tab`` is applied, so all rows
    of a requested field are returned.

    Args:
        fields: Showcase field ids (strings) in the desired output order.
        data: Unused; kept so all ``get_*`` helpers share one signature.
        data_field: Field dictionary with at least the column ``field.showcase``.

    Returns:
        The matching rows of ``data_field`` sorted by the position of their
        ``field.showcase`` in ``fields``. The previous index is kept as an
        ``index`` column.
    """
    matches = data_field.loc[data_field["field.showcase"].isin(fields)].copy()
    # Helper column used only to sort in the order given by `fields`.
    matches["field"] = pd.Categorical(matches["field.showcase"], categories=fields, ordered=True)
    ordered = matches.sort_values("field").reset_index()
    return ordered.drop(columns="field")

def get_data_fields(fields, data, data_field):
    """Return a copy of ``data`` limited to ``eid`` plus the columns of ``fields``.

    The column names are taken from the ``col.name`` entries that
    ``get_fields`` returns for ``fields``, in that order.
    """
    selected_columns = get_fields(fields, data, data_field)["col.name"].to_list()
    return data[["eid", *selected_columns]].copy()

def get_data_fields_all(fields, data, data_field):
    """Return a copy of ``data`` limited to ``eid`` plus all columns of ``fields``.

    The column names are taken from the ``col.name`` entries that
    ``get_fields_all`` returns for ``fields``, in that order.
    """
    column_names = ["eid"]
    column_names.extend(get_fields_all(fields, data, data_field)["col.name"].to_list())
    return data[column_names].copy()

In [None]:
# Lookup table coding10: integer code -> "meaning". The code column is renamed
# to the assessment-centre column so the table can be joined on it.
coding10 = pd.read_csv(f"{mapping_path}/codings/coding10.tsv", sep="\t")
coding10["coding"] = coding10["coding"].astype("int")
coding10 = coding10.rename(columns={"coding": "uk_biobank_assessment_centre_f54_0_0"})

In [None]:
# Load the decoded main table and the field dictionary that describes its columns.
data = pd.read_feather(f"{base_path}/data/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{base_path}/data/1_decoded/ukb_data_field.feather")

# Plain list of all column names of the main table.
data_columns = list(data.columns)

In [None]:
# Drop rows with obviously missing data (no value for sex); print the row count before and after.
print(len(data))
data = data[data["sex_f31_0_0"].notna()]
print(len(data))

In [None]:
# Build the baseline covariate table `basics` from the decoded main table.
fields_basics = [
    "21022",  # age at recruitment
    "31",  # sex
    "21000",  # ethnicity
    # "189",  # Townsend index
    "53",  # date of baseline assessment
    "54",  # assessment center
]

temp = get_data_fields(fields_basics, data, data_field)

temp["sex_f31_0_0"] = temp["sex_f31_0_0"].cat.set_categories(["Female", "Male"], ordered=False)

# Only the non-informative answers are recoded (to missing); every other
# ethnic-background value is kept as is. A coarser grouping was drafted but
# is disabled (target categories: White, Black, Asian, Mixed, Chinese):
#   "White":   ["White", "British", "Irish", "Any other white background"]
#   "Mixed":   ["Mixed", "White and Black Caribbean", "White and Black African",
#               "White and Asian", "Any other mixed background"]
#   "Asian":   ["Asian or Asian British", "Indian", "Pakistani", "Bangladeshi",
#               "Any other Asian background"]
#   "Black":   ["Black or Black British", "Caribbean", "African",
#               "Any other Black background"]
#   "Chinese": ["Chinese"]
ethn_bg_def = {
    np.nan: ["Other ethnic group", "Do not know", "Prefer not to answer"],
}

# Invert {target: [answers]} into {answer: target}, the form Series.replace expects.
ethn_bg_dict = {value: key for key, values in ethn_bg_def.items() for value in values}

# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: the in-place call works on an intermediate object and is
# not guaranteed to modify `temp` (it does not under pandas copy-on-write).
temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].replace(ethn_bg_dict)
temp["ethnic_background_f21000_0_0"] = temp["ethnic_background_f21000_0_0"].astype("category")

basics = temp
print(len(temp))

from dateutil.relativedelta import relativedelta

# Estimated birth date: assessment date minus the age at recruitment in years.
calc_birth_date = [
    assessment_date - relativedelta(years=age)
    for assessment_date, age in zip(
        basics["date_of_attending_assessment_centre_f53_0_0"],
        basics["age_at_recruitment_f21022_0_0"],
    )
]

basics = basics.assign(birth_date=calc_birth_date)

# Translate the integer assessment-centre code into its "meaning" from coding10.
# Series.map keeps every looked-up value on its own row. The earlier merge-based
# assignment produced a result with a fresh 0..n-1 index that was then aligned
# by index label, so names ended up on the wrong rows whenever `basics` did not
# have a default 0..n-1 index (e.g. after rows were dropped from `data`) or the
# inner join discarded rows without a matching code.
centre_col = "uk_biobank_assessment_centre_f54_0_0"
centre_names = coding10.set_index(centre_col)["meaning"]
basics[centre_col] = basics[centre_col].astype("int").map(centre_names)


display(basics.head())

In [None]:
# Write the baseline covariates to a feather file and register that file as a W&B artifact.
# NOTE(review): `basics` still carries the row index inherited from `data`, where rows
# were dropped without resetting the index. Confirm that to_feather accepts this with
# the pandas version in use.
artifact_name = "baseline_covariates"
type_name = "prepare_covariates"
description = "Dataframe of covariates at recruitment"
artifact_path = f"{dataset_path}/{artifact_name}_{artifact_date}.feather"
basics.to_feather(artifact_path)
create_artifact(project, entity, artifact_name, type_name, description, artifact_path, basics)

# Load complete data from GP and HES and ONS

In [None]:
## The format should be similar to the example below

In [None]:
# Directory holding the decoded input files.
# NOTE(review): hard-coded to the "/sc-projects" root instead of being derived from
# `base_path` like the other data locations. Confirm this is intended when the
# notebook runs on a non-"sc" host.
out_path = '/sc-projects/sc-proj-ukb-cvd/data/1_decoded'

In [None]:
# Load the data-portal patient records (long format) from the decoded directory.
patient_records = pd.read_feather(f"{out_path}/dataportal_records_omop_220407.feather")

In [None]:
# Preview the first rows of the records table.
patient_records.head()

In [None]:
# Inspect 10 randomly sampled rows (the selection differs between runs).
patient_records.sample(10)

In [None]:
# Write the complete records into the dataset directory and register the file as a W&B artifact.
artifact_name = "complete_records"
type_name = "prepare_records"
description = "Complete records from dataportal in long format"
artifact_path = f"{dataset_path}/{artifact_name}_{artifact_date}.feather"
patient_records.to_feather(artifact_path)
create_artifact(project, entity, artifact_name, type_name, description, artifact_path, patient_records)

## Prepare Long Records Matrix 

In [None]:
import pathlib
import wandb

def get_path_from_wandb(reference: str):
    """Convert a ``file://`` artifact reference into an existing local path.

    Args:
        reference: Reference URI of the form ``file://<path>``.

    Returns:
        ``pathlib.Path`` built from the part of ``reference`` that follows
        ``file://``.

    Raises:
        ValueError: If ``reference`` does not contain ``file://``.
        FileNotFoundError: If the referenced path does not exist.
    """
    _, scheme, local_part = reference.partition("file://")
    if not scheme:
        raise ValueError(f"Not a file:// reference: {reference!r}")

    path = pathlib.Path(local_part)
    # Explicit check instead of `assert`, which is skipped when Python runs with -O.
    if not path.exists():
        raise FileNotFoundError(f"Referenced artifact file does not exist: {path}")
    return path

def read_artifact(run, type_name, artifact_name, *, entity="cardiors", project="recordgraphs"):
    """Resolve the latest version of a reference artifact to its local file path.

    Args:
        run: Active W&B run; the artifact is marked as used by this run.
        type_name: Expected artifact type, passed to ``run.use_artifact``.
        artifact_name: Name of the artifact; also the name of the manifest
            entry whose reference is resolved.
        entity: W&B entity that owns the artifact. Defaults to the value that
            was previously hard-coded.
        project: W&B project that holds the artifact. Defaults to the value
            that was previously hard-coded.

    Returns:
        The local path returned by ``get_path_from_wandb`` for the entry's
        reference. The path is also printed.
    """
    artifact = run.use_artifact(f"{entity}/{project}/{artifact_name}:latest", type=type_name)
    file_path = get_path_from_wandb(artifact.manifest.entries[artifact_name].ref)
    print(file_path)

    return file_path


In [None]:
# Open one W&B run that the following cells use to read and log artifacts.
# NOTE(review): tagged "artifacts" here but "artifact" in create_artifact; confirm which is intended.
run = wandb.init(project=project, job_type="log_artifact", entity=entity, tags=["artifacts"])

In [None]:
# Reload the baseline covariates from the file referenced by the latest "baseline_covariates" artifact.
basics = pd.read_feather(read_artifact(run, "prepare_covariates", "baseline_covariates"))

In [None]:
# Reload the complete records through their logged artifact and store concept_id as integers.
complete_records = pd.read_feather(read_artifact(run, "prepare_records", "complete_records"))
complete_records["concept_id"] = complete_records["concept_id"].astype(int)

In [None]:
# Preview the first rows of the reloaded records.
complete_records.head()

In [None]:
# Show the latest record date per origin, as a guide for choosing a sensible exit date.
complete_records.groupby("origin")["date"].max()

In [None]:
import datetime

# Unique (eid, date) pairs among the death records, with the date renamed to death_date.
# NOTE(review): a participant with several distinct death dates would appear more than once
# here and would therefore be duplicated by the merge below. Confirm this cannot happen.
deaths = complete_records.query("origin=='death_records'")[["eid", "date"]].drop_duplicates().rename(columns={"date":"death_date"})
# Left-join the death dates onto the baseline covariates (participants without a death record
# are kept), keep only the date columns, index by eid and rename the assessment date.
extended = basics.merge(deaths, on="eid", how="left")[["eid", "birth_date", "date_of_attending_assessment_centre_f53_0_0", "death_date"]].set_index("eid").rename(columns={"date_of_attending_assessment_centre_f53_0_0":"recruitment_date"})
# Replace missing values by NaT and add the same fixed censoring date (2021-09-24) to every row.
extended = extended.where(extended.notnull(), pd.NaT).assign(cens_date=datetime.date(2021, 9, 24))
# Exit date = element-wise minimum of death date and censoring date.
# NOTE(review): the result for rows with a missing death_date depends on how np.minimum
# treats NaT for the dtypes involved. Confirm that those rows end up with cens_date.
extended["exit_date"] = np.minimum(extended["death_date"].values, extended["cens_date"].values)
#extended.reset_index().to_feather(os.path.join(dataset_path, 'temp_extended.feather'))

In [None]:
# Move eid from the index back into a regular column.
extended = extended.reset_index()

In [None]:
# Preview the per-individual metadata.
extended.head()

In [None]:
# Check the dtypes of the date columns.
extended.dtypes

In [None]:
def add_artifact(run, artifact_name, type_name, description, artifact_path, df):
    """Log a local file as a reference artifact on an already open W&B run.

    Args:
        run: Active W&B run the artifact is logged to; it is not finished here.
        artifact_name: Name of the artifact and of its single reference entry.
        type_name: Artifact type passed to ``wandb.Artifact``.
        description: Free-text description stored with the artifact.
        artifact_path: Path of the file on disk, registered as
            ``file://<artifact_path>`` with checksum.
        df: DataFrame belonging to the file; only its number of rows and its
            column names are read, to fill the artifact metadata.
    """
    n_rows, n_cols = len(df), len(df.columns)
    metadata = {
        "1_shape": f"{n_rows}x{n_cols}",
        "2_cols": str(df.columns.to_list()),
    }
    artifact = wandb.Artifact(
        artifact_name, type=type_name, description=description, metadata=metadata
    )
    artifact.add_reference(f"file://{artifact_path}", artifact_name, checksum=True)
    run.log_artifact(artifact)

In [None]:
# Write the per-individual metadata to disk and log the file on the open W&B run.
artifact_name = "metadata_individuals"
type_name = "prepare_records"
description = "Metadata for individuals"
artifact_path = f"{dataset_path}/artifacts/{artifact_name}_{artifact_date}.feather"
# Unlike the other outputs, this file goes into an "artifacts" sub-directory that
# none of the visible cells creates, so make sure it exists before writing.
os.makedirs(os.path.dirname(artifact_path), exist_ok=True)
extended.to_feather(artifact_path)
add_artifact(run, artifact_name, type_name, description, artifact_path, extended)

In [None]:
# Show the path the metadata file was written to.
artifact_path

In [None]:
# Show the current artifact path (same value as in the previous cell).
artifact_path

In [None]:
# Sort the records by participant, date and code columns, then index them by eid.
records_matrix = (
    complete_records
    .sort_values(by=["eid", "date", "origin", "domain_id", "code", "concept_id"])
    .reset_index(drop=True)
    .set_index("eid")
)

In [None]:
# Attach the per-individual metadata to every record: left join on eid, so
# individuals without any record are kept as a single row of missing values.
records_matrix_long = pd.merge(
    extended.set_index("eid"),
    records_matrix,
    how="left",
    left_index=True,
    right_index=True,
).reset_index()

In [None]:
# Keep only rows that have a concept_id, i.e. drop individuals without any record.
has_concept = records_matrix_long["concept_id"].notna()
records_matrix_long = records_matrix_long[has_concept].reset_index(drop=True)

In [None]:
# keep origin column
# Print a summary (columns, dtypes, memory use) of the long records table.
records_matrix_long.info()

In [None]:
def add_artifact(run, artifact_name, type_name, description, artifact_path, df):
    """Log a local file as a reference artifact on an already open W&B run.

    This cell re-defines the helper of the same name and signature.

    Args:
        run: Active W&B run the artifact is logged to; it is not finished here.
        artifact_name: Name of the artifact and of its single reference entry.
        type_name: Artifact type passed to ``wandb.Artifact``.
        description: Free-text description stored with the artifact.
        artifact_path: Path of the file on disk, registered as
            ``file://<artifact_path>`` with checksum.
        df: DataFrame belonging to the file; only its number of rows and its
            column names are read, to fill the artifact metadata.
    """
    shape_text = f"{len(df)}x{len(df.columns)}"
    column_text = str(df.columns.to_list())
    artifact = wandb.Artifact(
        artifact_name,
        type=type_name,
        description=description,
        metadata={"1_shape": shape_text, "2_cols": column_text},
    )
    artifact.add_reference(f"file://{artifact_path}", artifact_name, checksum=True)
    run.log_artifact(artifact)

In [None]:
# Write the extended long-format records to disk and log the file on the open W&B run.
artifact_name = "complete_records_extended"
type_name = "prepare_records"
description = "First patient records from the data portal in long format"
artifact_path = f"{dataset_path}/{artifact_name}_{artifact_date}.feather"
records_matrix_long.to_feather(artifact_path)
add_artifact(run, artifact_name, type_name, description, artifact_path, records_matrix_long)

In [None]:
# Close the W&B run that was used for reading and logging the artifacts.
run.finish()

In [None]:
# Show the path of the last file that was written.
artifact_path