# Preprocessing

In [None]:
import pandas as pd
import numpy as np
import os
import yaml
from tqdm.auto import tqdm

node = !hostname
if "sc" in node[0]:
    base_path = "/sc-projects/sc-proj-ukb-cvd"
else: base_path = "/data/analysis/ag-reils/ag-reils-shared/cardioRS"
print(base_path)

dataset_name = "230424_medicalhistory"
mapping_path = f"{base_path}/data/mapping"
dataset_path = f"{base_path}/data/2_datasets_pre/{dataset_name}"

In [None]:
from pathlib import Path
# Create the dataset output directory, including any missing parents;
# exist_ok=True makes re-running this cell a no-op.
Path(dataset_path).mkdir(parents=True, exist_ok=True)

In [None]:
# Load the two feather files from the 1_decoded directory.
# NOTE(review): data_field is presumably the per-column field dictionary
# (it is queried for "field.showcase", "field.tab" and "col.name" below) -- confirm.
data = pd.read_feather(f"{base_path}/data/1_decoded/ukb_data.feather")
data_field = pd.read_feather(f"{base_path}/data/1_decoded/ukb_data_field.feather")
# Plain Python list of all column names in `data`.
data_columns = data.columns.to_list()

## Mappings + Vocabulary

In [None]:
# Drop obviously missing data: keep only rows with a recorded sex (f31),
# printing the row count before and after the filter.
print(len(data))
has_sex = data["sex_f31_0_0"].notna()
data = data[has_sex]
print(len(data))

# Starting information

In [None]:
# Name of the column used as the starting time point ("time 0").
# The assessment-centre attendance date is active; birth date is kept
# as a commented-out alternative.
#time0_col="birth_date"
time0_col="date_of_attending_assessment_centre_f53_0_0"

# Baseline covariates

In [None]:
def get_fields(fields, data, data_field):
    """Return the field-dictionary rows for ``fields``, restricted by a tab pattern.

    A row of ``data_field`` is kept when its "field.showcase" value is listed in
    ``fields`` and its "field.tab" value matches the pattern
    ``f.<digits>.0.<digit>`` (presumably the instance-0 columns -- TODO confirm).
    Rows are sorted in the order in which the ids appear in ``fields``.

    Args:
        fields: Showcase field ids, in the desired output order.
        data: Unused; accepted so all helpers share one signature.
        data_field: DataFrame with at least "field.showcase" and "field.tab".

    Returns:
        A new DataFrame with a fresh RangeIndex; the previous index is kept as
        a regular column (``reset_index`` without ``drop``).
    """
    is_requested = data_field["field.showcase"].isin(fields)
    matches_tab = data_field["field.tab"].str.contains("f\\.\\d+\\.0\\.\\d")
    selected = data_field[is_requested & matches_tab].copy()

    # Temporary ordered categorical so sorting follows the order of `fields`.
    selected["field"] = pd.Categorical(
        selected["field.showcase"], categories=fields, ordered=True
    )
    selected = selected.sort_values("field").reset_index()
    return selected.drop(columns="field")

def get_fields_all(fields, data, data_field):
    """Return every field-dictionary row whose showcase id is in ``fields``.

    Unlike ``get_fields`` there is no filter on "field.tab": all rows of
    ``data_field`` with a matching "field.showcase" value are kept. Rows are
    sorted in the order in which the ids appear in ``fields``.

    Args:
        fields: Showcase field ids, in the desired output order.
        data: Unused; accepted so all helpers share one signature.
        data_field: DataFrame with at least a "field.showcase" column.

    Returns:
        A new DataFrame with a fresh RangeIndex; the previous index is kept as
        a regular column (``reset_index`` without ``drop``).
    """
    is_requested = data_field["field.showcase"].isin(fields)
    selected = data_field[is_requested].copy()

    # Temporary ordered categorical so sorting follows the order of `fields`.
    selected["field"] = pd.Categorical(
        selected["field.showcase"], categories=fields, ordered=True
    )
    selected = selected.sort_values("field").reset_index()
    return selected.drop(columns="field")

def get_data_fields(fields, data, data_field):
    """Return a copy of ``data`` limited to "eid" and the columns for ``fields``.

    Column names are taken from the "col.name" column of the rows selected by
    ``get_fields``, so they follow the order of ``fields``.

    Args:
        fields: Showcase field ids to extract.
        data: DataFrame holding an "eid" column and the referenced columns.
        data_field: Field dictionary passed through to ``get_fields``.

    Returns:
        An independent DataFrame copy with "eid" as the first column.
    """
    field_rows = get_fields(fields, data, data_field)
    wanted_columns = ["eid", *field_rows["col.name"].to_list()]
    return data[wanted_columns].copy()

def get_data_fields_all(fields, data, data_field):
    """Return a copy of ``data`` limited to "eid" and all columns for ``fields``.

    Column names are taken from the "col.name" column of the rows selected by
    ``get_fields_all`` (no filter on "field.tab"), in the order of ``fields``.

    Args:
        fields: Showcase field ids to extract.
        data: DataFrame holding an "eid" column and the referenced columns.
        data_field: Field dictionary passed through to ``get_fields_all``.

    Returns:
        An independent DataFrame copy with "eid" as the first column.
    """
    field_rows = get_fields_all(fields, data, data_field)
    wanted_columns = ["eid", *field_rows["col.name"].to_list()]
    return data[wanted_columns].copy()

### Basics

In [None]:
# Coding table 10: read the TSV, rename its "coding" column to the
# assessment-centre column name used for merging, and store it as integers.
coding10 = pd.read_csv(f"{mapping_path}/codings/coding10.tsv", sep="\t")
coding10 = coding10.rename(columns={"coding": "uk_biobank_assessment_centre_f54_0_0"})
coding10["uk_biobank_assessment_centre_f54_0_0"] = (
    coding10["uk_biobank_assessment_centre_f54_0_0"].astype("int")
)

In [None]:
fields_basics = [
    "21022", # age at recruitment
    "31", # sex
    "21000", # ethnicity
    "189", # Townsend index
    "53", # date of baseline assessment
    "54", # assessment center
]

# "eid" plus the columns selected by get_fields for the basic covariates.
temp = get_data_fields(fields_basics, data, data_field)

temp["sex_f31_0_0"] = temp["sex_f31_0_0"].cat.set_categories(["Female", 'Male'], ordered=False)

# Ethnicity: only the non-informative answers are recoded (to missing); every
# other answer is kept as delivered. A coarser grouping (White / Mixed / Asian /
# Black / Chinese) was drafted here earlier but is not applied.
ethn_bg_def = {
    np.nan: ["Other ethnic group", "Do not know", "Prefer not to answer"],
}

# Invert {target: [answers]} into a flat {answer: target} lookup for replace().
ethn_bg_dict = {value: key for key, values in ethn_bg_def.items() for value in values}

# Assign the result back instead of calling replace(..., inplace=True) on
# temp[...]: an in-place call on a selected column is a chained update that
# only modifies a temporary under pandas Copy-on-Write, leaving `temp` untouched.
temp["ethnic_background_f21000_0_0"] = (
    temp["ethnic_background_f21000_0_0"].replace(ethn_bg_dict).astype("category")
)

basics = temp
print(len(temp))

from dateutil.relativedelta import relativedelta

# Approximate birth date: assessment date minus the age at recruitment in whole years.
calc_birth_date = [
    assessment_date - relativedelta(years=age_at_recruitment)
    for assessment_date, age_at_recruitment in zip(
        basics["date_of_attending_assessment_centre_f53_0_0"],
        basics["age_at_recruitment_f21022_0_0"],
    )
]

basics = basics.assign(birth_date = calc_birth_date)

# Translate the assessment-centre code into its "meaning" label from coding10:
# build an eid -> meaning table, then swap it in for the coded column.
centre_temp = basics.assign(uk_biobank_assessment_centre_f54_0_0 = lambda x: x.uk_biobank_assessment_centre_f54_0_0.astype("int")).merge(coding10, on="uk_biobank_assessment_centre_f54_0_0")[["eid", "meaning"]].sort_values("eid")
basics = basics.merge(centre_temp, how="left", on="eid").drop(columns="uk_biobank_assessment_centre_f54_0_0").rename(columns={"meaning":"uk_biobank_assessment_centre_f54_0_0"})

display(basics.head())
basics.to_feather(os.path.join(dataset_path, 'temp_basics.feather'))

### Questionnaire

In [None]:
# UK Biobank showcase field ids for the questionnaire block, each with its
# field title. Ids must be exact strings (no surrounding whitespace) because
# they are matched against "field.showcase" with isin().
fields_questionnaire = [
    "1647",  # Country of birth (UK/elsewhere)
    "1677",  # Breastfed as a baby
    "1687",  # Comparative body size at age 10
    "1697",  # Comparative height size at age 10
    "1707",  # Handedness (chirality/laterality)
    "1767",  # Adopted as a child
    "1777",  # Part of a multiple birth
    "1787",  # Maternal smoking around birth
    "20121",  # Cascot confidence score
    "20277",  # Job code at visit
    "20001",  # Cancer code, self-reported
    "84",  # Cancer year/age first occurred
    "20007",  # Interpolated Age of participant when cancer first diagnosed
    "20009",  # Interpolated Age of participant when non-cancer illness first diagnosed
    "20006",  # Interpolated Year when cancer first diagnosed
    "20008",  # Interpolated Year when non-cancer illness first diagnosed
    "20012",  # Method of recording time when cancer first diagnosed
    "20013",  # Method of recording time when non-cancer illness first diagnosed
    "20002",  # Non-cancer illness code, self-reported
    "87",  # Non-cancer illness year/age first occurred
    "134",  # Number of self-reported cancers
    "135",  # Number of self-reported non-cancer illnesses
    "3140",  # Pregnant
    "137",  # Number of treatments/medications taken
    "20076",  # Treatment/medication READ-code
    "20003",  # Treatment/medication code
    "6671",  # Number of antibiotics taken in last 3 months
    "20199",  # Antibiotic codes for last 3 months
    "20011",  # Interpolated Age of participant when operation took place
    "20010",  # Interpolated Year when operation took place
    "20014",  # Method of recording time when operation occurred
    "136",  # Number of operations, self-reported
    "20004",  # Operation code
    "92",  # Operation year/age first occurred
    "3079",  # Pace-maker
    "1031",  # Frequency of friend/family visits
    "10740",  # Frequency of friend/family visits (pilot)
    "6160",  # Leisure/social activities
    "2110",  # Able to confide
    "20126",  # Bipolar and major depression status
    "20122",  # Bipolar disorder status
    "20127",  # Neuroticism score
    "20124",  # Probable recurrent major depression (moderate)
    "20125",  # Probable recurrent major depression (severe)
    "20123",  # Single episode of probable major depression
    "1920",  # Mood swings
    "1930",  # Miserableness
    "1940",  # Irritability
    "1950",  # Sensitivity / hurt feelings
    "1960",  # Fed-up feelings
    "1970",  # Nervous feelings
    "1980",  # Worrier / anxious feelings
    "1990",  # Tense / 'highly strung'
    "2000",  # Worry too long after embarrassment
    "2010",  # Suffer from 'nerves'
    "2020",  # Loneliness, isolation
    "2030",  # Guilty feelings
    "2040",  # Risk taking
    "4526",  # Happiness
    "4537",  # Work/job satisfaction
    "4548",  # Health satisfaction
    "4559",  # Family relationship satisfaction
    "4570",  # Friendships satisfaction
    "4581",  # Financial situation satisfaction
    "2050",  # Frequency of depressed mood in last 2 weeks
    "2060",  # Frequency of unenthusiasm / disinterest in last 2 weeks
    "2070",  # Frequency of tenseness / restlessness in last 2 weeks
    "2080",  # Frequency of tiredness / lethargy in last 2 weeks
    "2090",  # Seen doctor (GP) for nerves, anxiety, tension or depression
    "2100",  # Seen a psychiatrist for nerves, anxiety, tension or depression
    "4598",  # Ever depressed for a whole week
    "4609",  # Longest period of depression
    "4620",  # Number of depression episodes
    "4631",  # Ever unenthusiastic/disinterested for a whole week
    "5375",  # Longest period of unenthusiasm / disinterest
    "5386",  # Number of unenthusiastic/disinterested episodes
    "4642",  # Ever manic/hyper for 2 days
    "4653",  # Ever highly irritable/argumentative for 2 days
    "6156",  # Manic/hyper symptoms
    "5663",  # Length of longest manic/irritable episode
    "5674",  # Severity of manic/irritable episodes
    "6145",  # Illness, injury, bereavement, stress in last 2 years
    "2207",  # Wears glasses or contact lenses
    "2217",  # Age started wearing glasses or contact lenses
    "6147",  # Reason for glasses/contact lenses
    "5843",  # Which eye(s) affected by myopia (short sight)
    "5832",  # Which eye(s) affected by hypermetropia (long sight)
    "5610",  # Which eye(s) affected by presbyopia
    "5855",  # Which eye(s) affected by astigmatism
    "6205",  # Which eye(s) affected by strabismus (squint)
    "5408",  # Which eye(s) affected by amblyopia (lazy eye)
    "5877",  # Which eye(s) affected by other eye condition
    "5934",  # Which eye(s) affected by other serious eye condition
    "2227",  # Other eye problems
    "6148",  # Eye problems/disorders
    "5890",  # Which eye(s) affected by diabetes-related eye disease
    "6119",  # Which eye(s) affected by glaucoma
    "5419",  # Which eye(s) affected by injury or trauma resulting in loss of vision
    "5441",  # Which eye(s) are affected by cataract
    "5912",  # Which eye(s) affected by macular degeneration
    "5901",  # Age when diabetes-related eye disease diagnosed
    "4689",  # Age glaucoma diagnosed
    "5430",  # Age when loss of vision due to injury or trauma diagnosed
    "4700",  # Age cataract diagnosed
    "5923",  # Age macular degeneration diagnosed
    "5945",  # Age other serious eye condition diagnosed
    "6149",  # Mouth/teeth dental problems
    "2178",  # Overall health rating
    "2188",  # Long-standing illness, disability or infirmity
    "2296",  # Falls in the last year
    "2306",  # Weight change compared with 1 year ago
    "2316",  # Wheeze or whistling in the chest in last year
    "4717",  # Shortness of breath walking on level ground
    "4728",  # Leg pain on walking
    "5452",  # Leg pain when standing still or sitting
    "5463",  # Leg pain in calf/calves
    "5474",  # Leg pain when walking uphill or hurrying
    "5485",  # Leg pain when walking normally
    "5496",  # Leg pain when walking ever disappears while walking
    "5507",  # Leg pain on walking : action taken
    "5518",  # Leg pain on walking : effect of standing still
    "5529",  # Surgery on leg arteries (other than for varicose veins)
    "5540",  # Surgery/amputation of toe or leg
    "6159",  # Pain type(s) experienced in last month
    "3799",  # Headaches for 3+ months
    "4067",  # Facial pains for 3+ months
    "3404",  # Neck/shoulder pain for 3+ months
    "3571",  # Back pain for 3+ months
    "3741",  # Stomach/abdominal pain for 3+ months
    "3414",  # Hip pain for 3+ months
    "3773",  # Knee pain for 3+ months
    "2956",  # General pain for 3+ months
    "2335",  # Chest pain or discomfort
    "3606",  # Chest pain or discomfort walking normally
    "3616",  # Chest pain due to walking ceases when standing still
    "3751",  # Chest pain or discomfort when walking uphill or hurrying
    "2345",  # Ever had bowel cancer screening
    "2355",  # Most recent bowel cancer screening
    "2365",  # Ever had prostate specific antigen (PSA) test
    "3809",  # Time since last prostate specific antigen (PSA) test
    "2415",  # Had major operations
    "2844",  # Had other major operations
    "6150",  # Vascular/heart problems diagnosed by doctor
    "3894",  # Age heart attack diagnosed
    "3627",  # Age angina diagnosed
    "4056",  # Age stroke diagnosed
    "2966",  # Age high blood pressure diagnosed
    "6152",  # Blood clot, DVT, bronchitis, emphysema, asthma, rhinitis, eczema, allergy diagnosed by doctor
    "4012",  # Age deep-vein thrombosis (DVT, blood clot in leg) diagnosed
    "4022",  # Age pulmonary embolism (blood clot in lung) diagnosed
    "3992",  # Age emphysema/chronic bronchitis diagnosed
    "3786",  # Age asthma diagnosed
    "3761",  # Age hay fever, rhinitis or eczema diagnosed
    "2443",  # Diabetes diagnosed by doctor
    "4041",  # Gestational diabetes only
    "10844",  # Gestational diabetes only (pilot)
    "2976",  # Age diabetes diagnosed
    "2986",  # Started insulin within one year diagnosis of diabetes
    "2453",  # Cancer diagnosed by doctor
    "2463",  # Fractured/broken bones in last 5 years
    "6151",  # Fractured bone site(s)
    "3005",  # Fracture resulting from simple fall
    "2473",  # Other serious medical condition/disability diagnosed by doctor
    "6177",  # Medication for cholesterol, blood pressure or diabetes
    "6153",  # Medication for cholesterol, blood pressure, diabetes, or take exogenous hormones
    "2492",  # Taking other prescription medications
    "6154",  # Medication for pain relief, constipation, heartburn
    "10004",  # Medication for pain relief, constipation, heartburn (pilot)
    "10005",  # Medication for smoking cessation, constipation, heartburn, allergies (pilot)
    "6155",  # Vitamin and mineral supplements
    "10007",  # Vitamin and mineral supplements (pilot)
    "6179",  # Mineral and other dietary supplements
    "10723",  # Vitamin supplements (pilot)
    "10854",  # Other dietary supplements (pilot)
    "2247",  # Hearing difficulty/problems
    "10793",  # Hearing difficulty/problems (pilot)
    "2257",  # Hearing difficulty/problems with background noise
    "3393",  # Hearing aid user
    "4792",  # Cochlear implant
    "4803",  # Tinnitus
    "4814",  # Tinnitus severity/nuisance
    "4825",  # Noisy workplace
    "4836",  # Loud music exposure frequency
    "2375",  # Relative age of first facial hair
    "2385",  # Relative age voice broke
    "2395",  # Hair/balding pattern
    "2405",  # Number of children fathered
    "2674",  # Ever had breast cancer screening / mammogram
    "2684",  # Years since last breast cancer screening / mammogram
    "2694",  # Ever had cervical smear test
    "2704",  # Years since last cervical smear test
    "2714",  # Age when periods started (menarche)
    "2724",  # Had menopause
    "3581",  # Age at menopause (last menstrual period)
    "3700",  # Time since last menstrual period
    "3710",  # Length of menstrual cycle
    "3720",  # Menstruating today
    "2734",  # Number of live births
    "2744",  # Birth weight of first child
    "3872",  # Age of primiparous women at birth of child
    "2754",  # Age at first live birth
    "2764",  # Age at last live birth
    "2774",  # Ever had stillbirth, spontaneous miscarriage or termination
    "3829",  # Number of stillbirths
    "3839",  # Number of spontaneous miscarriages
    "3849",  # Number of pregnancy terminations
    "2784",  # Ever taken oral contraceptive pill
    "10132",  # Type of progestan-only oral contraceptive used (pilot)
    "2794",  # Age started oral contraceptive pill
    "2804",  # Age when last used oral contraceptive pill
    "2814",  # Ever used hormone-replacement therapy (HRT)
    "3536",  # Age started hormone-replacement therapy (HRT)
    "3546",  # Age last used hormone-replacement therapy (HRT)
    "3591",  # Ever had hysterectomy (womb removed)
    "2824",  # Age at hysterectomy
    "2834",  # Bilateral oophorectomy (both ovaries removed)
    "3882",  # Age at bilateral oophorectomy (both ovaries removed)
    "1110",  # Length of mobile phone use
    "1120",  # Weekly usage of mobile phone in last 3 months
    "10749",  # Time using mobile phone in last 3 months (pilot)
    "10016",  # Regular use of hands-free device/speakerphone with mobile phone (pilot)
    "1130",  # Hands-free device/speakerphone use with mobile phone in last 3 month
    "1140",  # Difference in mobile phone use compared to two years previously
    "10886",  # Difference in mobile phone use compared to one year previously (pilot)
    "1150",  # Usual side of head for mobile phone use
    "2237",  # Plays computer games
    "10105",  # Internet user (pilot)
    "10114",  # Willing to be contacted by email (pilot)
    "20160",  # Ever smoked
    "20162",  # Pack years adult smoking as proportion of life span exposed to smoking
    "20161",  # Pack years of smoking
    "10895",  # Light smokers, at least 100 smokes in lifetime (pilot)
    "20116",  # Smoking status
    "1239",  # Current tobacco smoking
    "1249",  # Past tobacco smoking
    "2644",  # Light smokers, at least 100 smokes in lifetime
    "3436",  # Age started smoking in current smokers
    "3446",  # Type of tobacco currently smoked
    "5959",  # Previously smoked cigarettes on most/all days
    "3456",  # Number of cigarettes currently smoked daily (current cigarette smokers)
    "6194",  # Age stopped smoking cigarettes (current cigar/pipe or previous cigarette smoker)
    "6183",  # Number of cigarettes previously smoked daily (current cigar/pipe smokers)
    "3466",  # Time from waking to first cigarette
    "3476",  # Difficulty not smoking for 1 day
    "3486",  # Ever tried to stop smoking
    "3496",  # Wants to stop smoking
    "3506",  # Smoking compared to 10 years previous
    "6158",  # Why reduced smoking
    "2867",  # Age started smoking in former smokers
    "2877",  # Type of tobacco previously smoked
    "2887",  # Number of cigarettes previously smoked daily
    "2897",  # Age stopped smoking
    "2907",  # Ever stopped smoking for 6+ months
    "10827",  # Ever stopped smoking for 6+ months (pilot)
    "6157",  # Why stopped smoking
    "10115",  # Why stopped smoking (pilot)
    "2926",  # Number of unsuccessful stop-smoking attempts
    "2936",  # Likelihood of resuming smoking
    "1259",  # Smoking/smokers in household
    "1269",  # Exposure to tobacco smoke at home
    "1279",  # Exposure to tobacco smoke outside home
    "20117",  # Alcohol drinker status
    "1558",  # Alcohol intake frequency.
    "3731",  # Former alcohol drinker
    "4407",  # Average monthly red wine intake
    "4418",  # Average monthly champagne plus white wine intake
    "4429",  # Average monthly beer plus cider intake
    "4440",  # Average monthly spirits intake
    "4451",  # Average monthly fortified wine intake
    "4462",  # Average monthly intake of other alcoholic drinks
    "1568",  # Average weekly red wine intake
    "1578",  # Average weekly champagne plus white wine intake
    "1588",  # Average weekly beer plus cider intake
    "1598",  # Average weekly spirits intake
    "1608",  # Average weekly fortified wine intake
    "5364",  # Average weekly intake of other alcoholic drinks
    "1618",  # Alcohol usually taken with meals
    "1628",  # Alcohol intake versus 10 years previously
    "2664",  # Reason for reducing amount of alcohol drunk
    "10818",  # Reason for reducing amount of alcohol drunk (pilot)
    "3859",  # Reason former drinker stopped drinking alcohol
    "10853",  # Reason former drinker stopped drinking alcohol (pilot)
    "1050",  # Time spend outdoors in summer
    "1060",  # Time spent outdoors in winter
    "1717",  # Skin colour
    "1727",  # Ease of skin tanning
    "1737",  # Childhood sunburn occasions
    "1747",  # Hair colour (natural, before greying)
    "1757",  # Facial ageing
    "2267",  # Use of sun/uv protection
    "2277",  # Frequency of solarium/sunlamp use
    "2129",  # Answered sexual history questions
    "2139",  # Age first had sexual intercourse
    "2149",  # Lifetime number of sexual partners
    "2159",  # Ever had same-sex intercourse
    "3669",  # Lifetime number of same-sex sexual partners
    "670",  # Type of accommodation lived in
    "680",  # Own or rent accommodation lived in
    "6139",  # Gas or solid-fuel cooking/heating
    "10860",  # Gas or solid-fuel cooking/heating (pilot)
    "6140",  # Heating type(s) in home
    "699",  # Length of time at current address
    "709",  # Number in household
    "6141",  # How are people in household related to participant
    "728",  # Number of vehicles in household
    "738",  # Average total household income before tax
    "20119",  # Current employment status - corrected
    "796",  # Distance between home and job workplace
    "6142",  # Current employment status
    "757",  # Time employed in main current job
    "767",  # Length of working week for main job
    "777",  # Frequency of travelling from home to job workplace
    "6143",  # Transport type for commuting to job workplace
    "806",  # Job involves mainly walking or standing
    "816",  # Job involves heavy manual or physical work
    "826",  # Job involves shift work
    "3426",  # Job involves night shift work
    "6138",  # Qualifications
    "845",  # Age completed full time education
    "6146",  # Attendance/disability/mobility allowance
    "4674",  # Private healthcare
    "1289",  # Cooked vegetable intake
    "1299",  # Salad / raw vegetable intake
    "1309",  # Fresh fruit intake
    "1319",  # Dried fruit intake
    "1329",  # Oily fish intake
    "1339",  # Non-oily fish intake
    "1349",  # Processed meat intake
    "1359",  # Poultry intake
    "1369",  # Beef intake
    "1379",  # Lamb/mutton intake
    "1389",  # Pork intake
    "3680",  # Age when last ate meat
    "6144",  # Never eat eggs, dairy, wheat, sugar
    "10855",  # Never eat eggs, dairy, wheat, sugar (pilot)
    "1408",  # Cheese intake
    "1418",  # Milk type used
    "1428",  # Spread type
    "2654",  # Non-butter spread type details
    "10767",  # Spread type (pilot)
    "1438",  # Bread intake
    "1448",  # Bread type
    "10776",  # Bread type/intake (pilot)
    "1458",  # Cereal intake
    "1468",  # Cereal type
    "1478",  # Salt added to food
    "1488",  # Tea intake
    "1498",  # Coffee intake
    "1508",  # Coffee type
    "1518",  # Hot drink temperature
    "1528",  # Water intake
    "1538",  # Major dietary changes in the last 5 years
    "1548",  # Variation in diet
    "10912",  # Variation in diet (pilot)
]

# "eid" plus the columns selected by get_fields for the questionnaire fields.
temp = get_data_fields(fields_questionnaire, data, data_field)

# Ordinal recodes: column -> (answers treated as missing, ordered category levels).
ordinal_recodes = {
    "overall_health_rating_f2178_0_0": (
        ["Do not know", "Prefer not to answer"],
        ['Poor', 'Fair', 'Good', 'Excellent'],
    ),
    "smoking_status_f20116_0_0": (
        ["Prefer not to answer"],
        ['Current', 'Previous', 'Never'],
    ),
    "alcohol_intake_frequency_f1558_0_0": (
        ["Prefer not to answer"],
        [
            'Daily or almost daily',
            'Three or four times a week',
            'Once or twice a week',
            'One to three times a month',
            'Special occasions only',
            'Never',
        ],
    ),
}

for column, (missing_answers, levels) in ordinal_recodes.items():
    # Blank out the non-informative answers, then fix the ordered category set.
    cleaned = temp[column].replace(dict.fromkeys(missing_answers, np.nan))
    temp[column] = cleaned.astype("category").cat.set_categories(levels, ordered=True)

questionnaire = temp
print(len(temp))
display(temp.head())

questionnaire.to_feather(os.path.join(dataset_path, 'temp_questionnaire.feather'))

In [None]:
# Print the distinct values present in temp's alcohol-intake-frequency column.
print(temp["alcohol_intake_frequency_f1558_0_0"].unique())

### Physical measurements

In [None]:
from statistics import mean

fields_measurements = [
"36"	, # Blood pressure device ID
"37"	, # Blood pressure manual sphygmomanometer device ID
"4079"	, # Diastolic blood pressure, automated reading
"94"	, # Diastolic blood pressure, manual reading
"4081"	, # Method of measuring blood pressure
"95"	, # Pulse rate (during blood-pressure measurement)
"102"	, # Pulse rate, automated reading
"4080"	, # Systolic blood pressure, automated reading
"93"	, # Systolic blood pressure, manual reading
"96"	, # Time since interview start at which blood pressure screen(s) shown
"12292"	, # Carotid ultrasound measurement completed
"12291"	, # Carotid ultrasound measuring method
"22672"	, # Maximum carotid IMT (intima-medial thickness) at 120 degrees
"22675"	, # Maximum carotid IMT (intima-medial thickness) at 150 degrees
"22678"	, # Maximum carotid IMT (intima-medial thickness) at 210 degrees
"22681"	, # Maximum carotid IMT (intima-medial thickness) at 240 degrees
"22671"	, # Mean carotid IMT (intima-medial thickness) at 120 degrees
"22674"	, # Mean carotid IMT (intima-medial thickness) at 150 degrees
"22677"	, # Mean carotid IMT (intima-medial thickness) at 210 degrees
"22680"	, # Mean carotid IMT (intima-medial thickness) at 240 degrees
"22670"	, # Minimum carotid IMT (intima-medial thickness) at 120 degrees
"22673"	, # Minimum carotid IMT (intima-medial thickness) at 150 degrees
"22676"	, # Minimum carotid IMT (intima-medial thickness) at 210 degrees
"22679"	, # Minimum carotid IMT (intima-medial thickness) at 240 degrees
"22682"	, # Quality control indicator for IMT at 120 degrees
"22683"	, # Quality control indicator for IMT at 150 degrees
"22684"	, # Quality control indicator for IMT at 210 degrees
"22685"	, # Quality control indicator for IMT at 240 degrees
"20051"	, # Reason for skipping arterial stiffness
"4204"	, # Absence of notch position in the pulse waveform
"4136"	, # Arterial pulse-wave stiffness device ID
"4206"	, # Arterial stiffness device ID
"4199"	, # Position of pulse wave notch
"4198"	, # Position of the pulse wave peak
"4200"	, # Position of the shoulder on the pulse waveform
"4194"	, # Pulse rate
"21021"	, # Pulse wave Arterial Stiffness index
"4196"	, # Pulse wave peak to peak time
"4205"	, # Pulse wave pressure versus time response curve	†
"4195"	, # Pulse wave reflection index
"4207"	, # Pulse wave velocity (manual entry)
"4186"	, # Stiffness method
"4849"	, # Hearing test done
"20019"	, # Speech-reception-threshold (SRT) estimate (left)
"20021"	, # Speech-reception-threshold (SRT) estimate (right)
"4272"	, # Duration of hearing test (left)
"4279"	, # Duration of hearing test (right)
"4268"	, # Completion status (left)
"4275"	, # Completion status (right)
"4233"	, # Mean signal-to-noise ratio (SNR), (left)
"4244"	, # Mean signal-to-noise ratio (SNR), (right)
"4269"	, # Number of triplets attempted (left)
"4276"	, # Number of triplets attempted (right)
"4230"	, # Signal-to-noise-ratio (SNR) of triplet (left)
"4241"	, # Signal-to-noise-ratio (SNR) of triplet (right)
"4232"	, # Triplet correct (left)
"4243"	, # Triplet correct (right)
"4229"	, # Triplet played (left)
"4240"	, # Triplet played (right)
"4236"	, # Triplet entered (left)
"4242"	, # Triplet entered (right)
"4239"	, # Number of times 'clear' was pressed (left)
"4249"	, # Number of times 'clear' was pressed (right)
"4237"	, # Time to press 'next' (left)
"4247"	, # Time to press 'next' (right)
"4235"	, # Time to press last digit (left)
"4246"	, # Time to press last digit (right)
"4234"	, # Time to press first digit (left)
"4245"	, # Time to press first digit (right)
"4270"	, # Volume level set by participant (left)
"4277"	, # Volume level set by participant (right)
"4238"	, # Keystroke history (left)
"4248"	, # Keystroke history (right)
"5181"	, # Ever had eye surgery
"5324"	, # Ever had cataract surgery
"5325"	, # Ever had refractive laser eye surgery
"5326"	, # Ever had surgery for glaucoma or high eye pressure
"5327"	, # Ever had laser treatment for glaucoma or high eye pressure
"5328"	, # Ever had corneal graft surgery
"5182"	, # Both eyes present
"5183"	, # Current eye infection
"20057"	, # Reason for skipping visual acuity (left)
"20262"	, # Myopia diagnosis
"20261"	, # avMSE
"20056"	, # Reason for skipping visual acuity (right)
"6075"	, # Glasses worn/required (left)
"6074"	, # Glasses worn/required (right)
"5187"	, # Visual acuity measured (left)
"5185"	, # Visual acuity measured (right)
"5211"	, # Distance of viewer to screen (left)
"5204"	, # Distance of viewer to screen (right)
"5212"	, # Direct or mirror view (left)
"5205"	, # Direct or mirror view (right)
"5074"	, # Number of letters shown in round (left)
"5075"	, # Number of letters shown in round (right)
"5081"	, # Displayed letters in round (left)
"5080"	, # Displayed letters in round (right)
"5077"	, # Number of letters correct in round (left)
"5076"	, # Number of letters correct in round (right)
"5207"	, # Final number of letters displayed (left)
"5200"	, # Final number of letters displayed (right)
"5209"	, # Number of rounds to result (left)
"5202"	, # Number of rounds to result (right)
"5206"	, # logMAR, initial (left)
"5199"	, # logMAR, initial (right)
"5078"	, # logMAR in round (left)
"5079"	, # logMAR in round (right)
"5208"	, # logMAR, final (left)
"5201"	, # logMAR, final (right)
"5082"	, # Visual acuity result in round (left)
"5083"	, # Visual acuity result in round (right)
"5188"	, # Duration visual-acuity screen displayed (left)
"5186"	, # Duration visual-acuity screen displayed (right)
"20052"	, # Reason for skipping refractometry (left)
"5193"	, # Duration at which refractometer first shown (left)
"20055"	, # Reason for skipping refractometry (right)
"5190"	, # Duration at which refractometer first shown (right)
"5191"	, # Auto-refraction method (left)
"5189"	, # Auto-refraction method (right)
"5111"	, # 3mm asymmetry angle (left)
"5108"	, # 3mm asymmetry angle (right)
"5141"	, # 3mm asymmetry index unreliable (left)
"5144"	, # 3mm asymmetry index unreliable (right)
"5156"	, # 3mm asymmetry index (left)
"5159"	, # 3mm asymmetry index (right)
"5155"	, # 3mm asymmetry index for irregular astigmatism level (left)
"5152"	, # 3mm asymmetry index for irregular astigmatism level (right)
"5112"	, # 3mm cylindrical power angle (left)
"5115"	, # 3mm cylindrical power angle (right)
"5119"	, # 3mm cylindrical power (left)
"5116"	, # 3mm cylindrical power (right)
"5292"	, # 3mm index of best keratometry results (left)
"5237"	, # 3mm index of best keratometry results (right)
"5136"	, # 3mm keratometry result unreliable (left)
"5140"	, # 3mm keratometry result unreliable (right)
"5163"	, # 3mm regularity index (left)
"5160"	, # 3mm regularity index (right)
"5148"	, # 3mm regularity index unreliable (left)
"5145"	, # 3mm regularity index unreliable (right)
"5149"	, # 3mm regularity index for irregular astigmatism level (left)
"5164"	, # 3mm regularity index for irregular astigmatism level (right)
"5104"	, # 3mm strong meridian angle (left)
"5107"	, # 3mm strong meridian angle (right)
"5135"	, # 3mm strong meridian (left)
"5132"	, # 3mm strong meridian (right)
"5103"	, # 3mm weak meridian angle (left)
"5100"	, # 3mm weak meridian angle (right)
"5096"	, # 3mm weak meridian (left)
"5099"	, # 3mm weak meridian (right)
"5306"	, # 6mm index of best keratometry results (left)
"5251"	, # 6mm index of best keratometry results (right)
"5138"	, # 6mm keratometry result unreliable (left)
"5139"	, # 6mm keratometry result unreliable (right)
"5110"	, # 6mm asymmetry angle (left)
"5109"	, # 6mm asymmetry angle (right)
"5157"	, # 6mm asymmetry index (left)
"5158"	, # 6mm asymmetry index (right)
"5142"	, # 6mm asymmetry index unreliable (left)
"5143"	, # 6mm asymmetry index unreliable (right)
"5113"	, # 6mm cylindrical power angle (left)
"5114"	, # 6mm cylindrical power angle (right)
"5118"	, # 6mm cylindrical power (left)
"5117"	, # 6mm cylindrical power (right)
"5162"	, # 6mm regularity index (left)
"5161"	, # 6mm regularity index (right)
"5147"	, # 6mm regularity index unreliable (left)
"5146"	, # 6mm regularity index unreliable (right)
"5105"	, # 6mm strong meridian angle (left)
"5106"	, # 6mm strong meridian angle (right)
"5134"	, # 6mm strong meridian (left)
"5133"	, # 6mm strong meridian (right)
"5102"	, # 6mm weak meridian angle (left)
"5101"	, # 6mm weak meridian angle (right)
"5097"	, # 6mm weak meridian (left)
"5098"	, # 6mm weak meridian (right)
"5089"	, # Astigmatism angle (left)
"5088"	, # Astigmatism angle (right)
"5086"	, # Cylindrical power (left)
"5087"	, # Cylindrical power (right)
"5085"	, # Spherical power (left)
"5084"	, # Spherical power (right)
"5274"	, # Vertex distance (left)
"5215"	, # Vertex distance (right)
"5276"	, # Index of best refractometry result (left)
"5221"	, # Index of best refractometry result (right)
"5090"	, # Refractometry result unreliable (left)
"5091"	, # Refractometry result unreliable (right)
"5273"	, # Auto-refractor device ID (left)
"5214"	, # Auto-refractor device ID (right)
"20054"	, # Reason for skipping IOP (left)
"20053"	, # Reason for skipping IOP (right)
"5196"	, # Intra-ocular pressure (IOP) method (left)
"5194"	, # Intra-ocular pressure (IOP) method (right)
"5264"	, # Corneal hysteresis (left)
"5256"	, # Corneal hysteresis (right)
"5265"	, # Corneal resistance factor (left)
"5257"	, # Corneal resistance factor (right)
"5262"	, # Intra-ocular pressure, corneal-compensated (left)
"5254"	, # Intra-ocular pressure, corneal-compensated (right)
"5263"	, # Intra-ocular pressure, Goldmann-correlated (left)
"5255"	, # Intra-ocular pressure, Goldmann-correlated (right)
"5266"	, # Applanation curve (left)	†
"5258"	, # Applanation curve (right)	†
"5267"	, # Pressure curve (left)	†
"5259"	, # Pressure curve (right)	†
"5261"	, # Intra-ocular pressure device ID (left)
"5253"	, # Intra-ocular pressure device ID (right)
"28520"	, # Average ELM-ISOS thickness (left)
"28521"	, # Average ELM-ISOS thickness (right)
"28512"	, # Average INL-ELM thickness (left)
"28513"	, # Average INL-ELM thickness (right)
"28536"	, # Average INL-RPE thickness (left)
"28537"	, # Average INL-RPE thickness (right)
"28528"	, # Average ISOS-RPE thickness (left)
"28529"	, # Average ISOS-RPE thickness (right)
"28504"	, # Average ganglion cell-inner plexiform layer thickness (left)
"28505"	, # Average ganglion cell-inner plexiform layer thickness (right)
"28502"	, # Average inner nuclear layer thickness (left)
"28503"	, # Average inner nuclear layer thickness (right)
"28500"	, # Average retinal nerve fibre layer thickness (left)
"28501"	, # Average retinal nerve fibre layer thickness (right)
"27851"	, # Disc diameter after inverse rank normal transformation (left)
"27852"	, # Disc diameter after inverse rank normal transformation (right)
"28514"	, # ELM-ISOS thickness of central subfield (left)
"28515"	, # ELM-ISOS thickness of central subfield (right)
"28516"	, # ELM-ISOS thickness of inner subfield (left)
"28517"	, # ELM-ISOS thickness of inner subfield (right)
"28518"	, # ELM-ISOS thickness of outer subfield (left)
"28519"	, # ELM-ISOS thickness of outer subfield (right)
"28506"	, # INL-ELM thickness of the central subfield (left)
"28507"	, # INL-ELM thickness of the central subfield (right)
"28508"	, # INL-ELM thickness of the inner subfield (left)
"28509"	, # INL-ELM thickness of the inner subfield (right)
"28510"	, # INL-ELM thickness of the outer subfield (left)
"28511"	, # INL-ELM thickness of the outer subfield (right)
"28530"	, # INL-RPE thickness of central subfield (left)
"28531"	, # INL-RPE thickness of central subfield (right)
"28532"	, # INL-RPE thickness of inner subfield (left)
"28533"	, # INL-RPE thickness of inner subfield (right)
"28534"	, # INL-RPE thickness of outer subfield (left)
"28535"	, # INL-RPE thickness of outer subfield (right)
"28522"	, # ISOS-RPE thickness of central subfield (left)
"28523"	, # ISOS-RPE thickness of central subfield (right)
"28524"	, # ISOS-RPE thickness of inner subfield (left)
"28525"	, # ISOS-RPE thickness of inner subfield (right)
"28526"	, # ISOS-RPE thickness of outer subfield (left)
"28527"	, # ISOS-RPE thickness of outer subfield (right)
"27802"	, # Macular thickness at the central subfield (left)
"27803"	, # Macular thickness at the central subfield (right)
"27804"	, # Macular thickness at the inner inferior subfield (left)
"27805"	, # Macular thickness at the inner inferior subfield (right)
"27806"	, # Macular thickness at the inner nasal subfield (left)
"27807"	, # Macular thickness at the inner nasal subfield (right)
"27808"	, # Macular thickness at the inner superior subfield (left)
"27809"	, # Macular thickness at the inner superior subfield (right)
"27810"	, # Macular thickness at the inner temporal subfield (left)
"27811"	, # Macular thickness at the inner temporal subfield (right)
"27812"	, # Macular thickness at the outer inferior subfield (left)
"27813"	, # Macular thickness at the outer inferior subfield (right)
"27814"	, # Macular thickness at the outer nasal subfield (left)
"27815"	, # Macular thickness at the outer nasal subfield (right)
"27816"	, # Macular thickness at the outer superior subfield (left)
"27817"	, # Macular thickness at the outer superior subfield (right)
"27818"	, # Macular thickness at the outer temporal subfield (left)
"27819"	, # Macular thickness at the outer temporal subfield (right)
"27853"	, # Mean of vertical disc diameter (left)
"27854"	, # Mean of vertical disc diameter (right)
"27822"	, # Overall average retinal pigment epithelium thickness (left)
"27823"	, # Overall average retinal pigment epithelium thickness (right)
"27800"	, # Overall macular thickness (left)
"27801"	, # Overall macular thickness (right)
"27824"	, # Retinal pigment epithelium thickness at central subfield (left)
"27825"	, # Retinal pigment epithelium thickness at central subfield (right)
"27826"	, # Retinal pigment epithelium thickness at inner inferior subfield (left)
"27827"	, # Retinal pigment epithelium thickness at inner inferior subfield (right)
"27828"	, # Retinal pigment epithelium thickness at inner nasal subfield (left)
"27829"	, # Retinal pigment epithelium thickness at inner nasal subfield (right)
"27830"	, # Retinal pigment epithelium thickness at inner superior subfield (left)
"27831"	, # Retinal pigment epithelium thickness at inner superior subfield (right)
"27832"	, # Retinal pigment epithelium thickness at inner temporal subfield (left)
"27833"	, # Retinal pigment epithelium thickness at inner temporal subfield (right)
"27834"	, # Retinal pigment epithelium thickness at outer inferior subfield (left)
"27835"	, # Retinal pigment epithelium thickness at outer inferior subfield (right)
"27836"	, # Retinal pigment epithelium thickness at outer nasal subfield (left)
"27837"	, # Retinal pigment epithelium thickness at outer nasal subfield (right)
"27838"	, # Retinal pigment epithelium thickness at outer superior subfield (left)
"27839"	, # Retinal pigment epithelium thickness at outer superior subfield (right)
"27840"	, # Retinal pigment epithelium thickness at outer temporal subfield (left)
"27841"	, # Retinal pigment epithelium thickness at outer temporal subfield (right)
"27820"	, # Total macular volume (left)
"27821"	, # Total macular volume (right)
"27857"	, # Vertical cup to disc ratio (VCDR) (left)
"27858"	, # Vertical cup to disc ratio (VCDR) (right)
"27855"	, # Vertical cup to disc ratio (VCDR) regressed and transformed (left)
"27856"	, # Vertical cup to disc ratio (VCDR) regressed and transformed (right)
"28542"	, # QC - ILM indicator (left)
"28543"	, # QC - ILM indicator (right)
"28552"	, # QC - Image quality (left)
"28553"	, # QC - Image quality (right)
"28540"	, # QC - Macula center aline (left)
"28541"	, # QC - Macula center aline (right)
"28538"	, # QC - Macula center frame (left)
"28539"	, # QC - Macula center frame (right)
"28548"	, # QC - Max motion delta (left)
"28549"	, # QC - Max motion delta (right)
"28550"	, # QC - Max motion factor (left)
"28551"	, # QC - Max motion factor (right)
"28546"	, # QC - Min motion correlation (left)
"28547"	, # QC - Min motion correlation (right)
"28544"	, # QC - Valid count (left)
"28545"	, # QC - Valid count (right)
"20044"	, # Reason for skipping grip strength (left)
"20043"	, # Reason for skipping grip strength (right)
"46"	, # Hand grip strength (left)
"47"	, # Hand grip strength (right)
"38"	, # Hand grip dynamometer device ID
"20046"	, # Reason for skipping hip measurement
"20048"	, # Reason for skipping sitting height
"20047"	, # Reason for skipping standing height
"20045"	, # Reason for skipping waist
"20041"	, # Reason for skipping weight
"12144"	, # Height
"12143"	, # Weight (pre-imaging)
"48"	, # Waist circumference
"21002"	, # Weight
"21001"	, # Body mass index (BMI)
"49"	, # Hip circumference
"44"	, # Tape measure device ID
"50"	, # Standing height
"39"	, # Height measure device ID
"51"	, # Seated height
"20015"	, # Sitting height
"3077"	, # Seating box height
"41"	, # Seating box device ID
"3160"	, # Weight, manual entry
"21"	, # Weight method
"40"	, # Manual scales device ID
"23098"	, # Weight
"23104"	, # Body mass index (BMI)
"23113"	, # Leg fat-free mass (right)
"23118"	, # Leg predicted mass (left)
"23114"	, # Leg predicted mass (right)
"23123"	, # Arm fat percentage (left)
"23119"	, # Arm fat percentage (right)
"23124"	, # Arm fat mass (left)
"23120"	, # Arm fat mass (right)
"23121"	, # Arm fat-free mass (right)
"23125"	, # Arm fat-free mass (left)
"23126"	, # Arm predicted mass (left)
"23122"	, # Arm predicted mass (right)
"23127"	, # Trunk fat percentage
"23128"	, # Trunk fat mass
"23129"	, # Trunk fat-free mass
"23130"	, # Trunk predicted mass
"23105"	, # Basal metabolic rate
"23099"	, # Body fat percentage
"23100"	, # Whole body fat mass
"23101"	, # Whole body fat-free mass
"23102"	, # Whole body water mass
"23115"	, # Leg fat percentage (left)
"23111"	, # Leg fat percentage (right)
"23116"	, # Leg fat mass (left)
"23112"	, # Leg fat mass (right)
"23117"	, # Leg fat-free mass (left)
"43"	, # Impedance device ID
"23106"	, # Impedance of whole body
"23110"	, # Impedance of arm (left)
"23109"	, # Impedance of arm (right)
"23108"	, # Impedance of leg (left)
"23107"	, # Impedance of leg (right)
"6218"	, # Impedance of whole body, manual entry
"6222"	, # Impedance of arm, manual entry (left)
"6221"	, # Impedance of arm, manual entry (right)
"6220"	, # Impedance of leg, manual entry (left)
"6219"	, # Impedance of leg, manual entry (right)
"3082"	, # Fractured heel
"3081"	, # Foot measured for bone density
"19"	, # Heel ultrasound method
"3146"	, # Speed of sound through heel
"3143"	, # Ankle spacing width
"3144"	, # Heel Broadband ultrasound attenuation, direct entry
"3147"	, # Heel quantitative ultrasound index (QUI), direct entry
"3148"	, # Heel bone mineral density (BMD)
"78"	, # Heel bone mineral density (BMD) T-score, automated
"3086"	, # Speed of sound through heel, manual entry
"3085"	, # Heel Broadband ultrasound attenuation (BUA), manual entry
"3083"	, # Heel quantitative ultrasound index (QUI), manual entry
"3084"	, # Heel bone mineral density (BMD), manual entry
"77"	, # Heel bone ultrasound T-score, manual entry
"4093"	, # Fractured heel (left)
"4096"	, # Fractured heel (right)
"4092"	, # Heel ultrasound method (left)
"4095"	, # Heel ultrasound method (right)
"4100"	, # Ankle spacing width (left)
"4119"	, # Ankle spacing width (right)
"4103"	, # Speed of sound through heel (left)
"4122"	, # Speed of sound through heel (right)
"4101"	, # Heel broadband ultrasound attenuation (left)
"4120"	, # Heel broadband ultrasound attenuation (right)
"4104"	, # Heel quantitative ultrasound index (QUI), direct entry (left)
"4123"	, # Heel quantitative ultrasound index (QUI), direct entry (right)
"4105"	, # Heel bone mineral density (BMD) (left)
"4124"	, # Heel bone mineral density (BMD) (right)
"4106"	, # Heel bone mineral density (BMD) T-score, automated (left)
"4125"	, # Heel bone mineral density (BMD) T-score, automated (right)
"4142"	, # Speed of sound through heel, manual entry (left)
"4147"	, # Speed of sound through heel, manual entry (right)
"4141"	, # Heel broadband ultrasound attenuation (BUA), manual entry (left)
"4146"	, # Heel broadband ultrasound attenuation (BUA), manual entry (right)
"4139"	, # Heel quantitative ultrasound index (QUI), manual entry (left)
"4144"	, # Heel quantitative ultrasound index (QUI), manual entry (right)
"4140"	, # Heel bone mineral density (BMD), manual entry (left)
"4145"	, # Heel bone mineral density (BMD), manual entry (right)
"4138"	, # Heel bone mineral density (BMD) T-score, manual entry (left)
"4143"	, # Heel bone mineral density (BMD) T-score, manual entry (right)
"45"	, # Heel ultrasound device id
"20031"	, # Acceptability of each blow result (text)
"20042"	, # Reason for skipping spirometry
"20032"	, # Acceptability of each blow result (text) (pilot)
"20152"	, # Reproduciblity of spirometry measurement using ERS/ATS criteria
"3088"	, # Contra-indications for spirometry
"3089"	, # Caffeine drink within last hour
"3090"	, # Used an inhaler for chest within last hour
"3159"	, # Smoked cigarette or pipe within last hour
"23"	, # Spirometry method
"3137"	, # Number of measurements made
"10717"	, # Number of measurements made (pilot)
"20255"	, # Spirometry QC measure
"3065"	, # Ordering of blows
"3059"	, # Result ranking
"10691"	, # Result ranking (pilot)
"3066"	, # Data points for blow	†
"10711"	, # Spirometry method (pilot)
"10697"	, # Data points for blow (pilot)	†
"3061"	, # Acceptability of each blow result
"10693"	, # Acceptability of each blow result (pilot)
"3062"	, # Forced vital capacity (FVC)
"10694"	, # Forced vital capacity (FVC) (pilot)
"20151"	, # Forced vital capacity (FVC), Best measure
"20257"	, # Forced vital capacity (FVC) Z-score
"3063"	, # Forced expiratory volume in 1-second (FEV1)
"10695"	, # Forced expiratory volume in 1-second (FEV1) (pilot)
"20150"	, # Forced expiratory volume in 1-second (FEV1), Best measure
"20153"	, # Forced expiratory volume in 1-second (FEV1), predicted
"20154"	, # Forced expiratory volume in 1-second (FEV1), predicted percentage
"20256"	, # Forced expiratory volume in 1-second (FEV1) Z-score
"20258"	, # FEV1/ FVC ratio Z-score
"3064"	, # Peak expiratory flow (PEF)
"10696"	, # Peak expiratory flow (PEF) (pilot)
"3060"	, # Time of blow measurement
"42"	, # Spirometer device ID
"3132"	, # Spirometry device serial number
"10714"	, # Spirometry device serial number (pilot)
"12323"	, # 12-lead ECG measuring method
"12657"	, # Suspicious flag for 12-lead ECG
"12653"	, # ECG automated diagnoses
"20205"	, # ECG datasets	‡
"12654"	, # Number of automated diagnostic comments recorded during 12-lead ECG
"12336"	, # Ventricular rate
"12338"	, # P duration
"22334"	, # PP interval
"22330"	, # PQ interval
"22338"	, # QRS num
"12340"	, # QRS duration
"22331"	, # QT interval
"22332"	, # QTC interval
"22333"	, # RR interval
"22335"	, # P axis
"22336"	, # R axis
"22337"	, # T axis
"12658"	, # Identifier for 12-lead ECG device
"20059"	, # Reason ECG not completed
"20060"	, # Reason at-rest ECG performed without bicycle
"20058"	, # Reason for skipping ECG
"6014"	, # Doctor restricts physical activity due to heart condition
"6015"	, # Chest pain felt during physical activity
"6016"	, # Chest pain felt outside physical activity
"6017"	, # Able to walk or cycle unaided for 10 minutes
"6023"	, # Description of exercise protocol recommended
"6019"	, # ECG/bike method for fitness test
"6024"	, # Program category
"6038"	, # Number of trend entries
"5985"	, # Bicycle speed
"5991"	, # ECG, phase name
"5987"	, # ECG, trend phase name
"5992"	, # ECG, phase duration
"5986"	, # ECG, phase time
"5988"	, # ECG, stage name
"5993"	, # ECG, number of stages in a phase
"5990"	, # ECG, stage duration
"5984"	, # ECG, load
"5983"	, # ECG, heart rate
"6033"	, # Maximum heart rate during fitness test
"6032"	, # Maximum workload during fitness test
"6034"	, # Target heart rate achieved
"6025"	, # Fitness test results, including ECG data	‡
"6039"	, # Duration of fitness test
"6020"	, # Completion status of test
"20023"	, # Mean time to correctly identify matches

    
]
# Pull the measurement columns (plus `eid`) through get_data_fields, report
# the number of rows, preview the frame and persist it as a feather file.
measurements = get_data_fields(fields_measurements, data, data_field)
temp = measurements  # `temp` remains bound to the very same frame

print(len(measurements))
display(measurements.head())

measurements.to_feather(
    os.path.join(dataset_path, "temp_measurements.feather")
)

In [None]:
# Cognitive function: showcase field IDs to extract, one per line, with the
# field label as a trailing comment.
# NOTE(review): "20023" is also the last entry of the measurements field list
# that precedes this cell -- confirm the overlap is intended.
fields = [
    "20023",  # Mean time to correctly identify matches
    "401",  # Index for card A in round
    "10139",  # Index for card A in round (pilot)
    "402",  # Index for card B in round
    "10140",  # Index for card B in round (pilot)
    "403",  # Number of times snap-button pressed
    "10141",  # Number of times snap-button pressed (pilot)
    "404",  # Duration to first press of snap-button in each round
    "10147",  # Duration to first press of snap-button in each round (pilot)
    "4281",  # Completion status of numeric memory test
    "4283",  # Number of rounds of numeric memory test performed
    "4260",  # Round of numeric memory test
    "4250",  # Number of digits to be memorised/recalled
    "4251",  # Target number to be memorised
    "4252",  # Target number to be entered
    "4258",  # Number entered by participant
    "4253",  # Time number displayed for
    "4259",  # Digits entered correctly
    "4282",  # Maximum digits remembered correctly
    "4254",  # Time first key touched
    "4255",  # Time last key touched
    "4256",  # Time elapsed
    "4257",  # Keystroke history
    "4285",  # Time to complete test
    "20016",  # Fluid intelligence score
    "20128",  # Number of fluid intelligence questions attempted within time limit
    "4924",  # Attempted fluid intelligence (FI) test.
    "4935",  # FI1 : numeric addition test
    "4946",  # FI2 : identify largest number
    "4957",  # FI3 : word interpolation
    "4968",  # FI4 : positional arithmetic
    "4979",  # FI5 : family relationship calculation
    "4990",  # FI6 : conditional arithmetic
    "5001",  # FI7 : synonym
    "5012",  # FI8 : chained arithmetic
    "5556",  # FI9 : concept interpolation
    "5699",  # FI10 : arithmetic sequence recognition
    "5779",  # FI11 : antonym
    "5790",  # FI12 : square sequence recognition
    "5866",  # FI13 : subset inclusion logic
    "6348",  # Duration to complete numeric path (trail #1)
    "6350",  # Duration to complete alphanumeric path (trail #2)
    "6349",  # Total errors traversing numeric path (trail #1)
    "6351",  # Total errors traversing alphanumeric path (trail #2)
    "6772",  # Interval between previous point and current one in numeric path (trail #1)
    "6773",  # Interval between previous point and current one in alphanumeric path (trail #2)
    "6770",  # Errors before selecting correct item in numeric path (trail #1)
    "6771",  # Errors before selecting correct item in alphanumeric path (trail #2)
    "6373",  # Number of puzzles correctly solved
    "6374",  # Number of puzzles viewed
    "6332",  # Item selected for each puzzle
    "6333",  # Duration spent answering each puzzle
    "21004",  # Number of puzzles correct
    "6383",  # Number of puzzles attempted
    "6312",  # Value entered
    "6313",  # Duration to entering selection
    "6364",  # Vocabulary level *
    "6365",  # Uncertainty in vocabulary level *
    "6314",  # Word/picture group used for each round *
    "6315",  # Picture selected for each round *
    "6317",  # Duration of selection in each round *
    "6325",  # Duration to entering symbol choice
    "23323",  # Number of symbol digit matches attempted
    "23324",  # Number of symbol digit matches made correctly
    "6362",  # First code array presented
    "23321",  # Values wanted
    "23322",  # Values entered
    "20197",  # Number of word pairs correctly associated
    "6448",  # Word associated with "huge"
    "6459",  # Word associated with "happy"
    "6470",  # Word associated with "tattered"
    "6481",  # Word associated with "old"
    "6492",  # Word associated with "long"
    "6503",  # Word associated with "red"
    "6514",  # Word associated with "sulking"
    "6525",  # Word associated with "pretty"
    "6536",  # Word associated with "tiny"
    "6547",  # Word associated with "new"
    "20018",  # Prospective memory result
    "4287",  # Test completion status
    "4292",  # PM: initial answer
    "4293",  # PM: final answer
    "4294",  # Final attempt correct
    "4291",  # Number of attempts
    "4295",  # History of attempts
    "4290",  # Duration screen displayed
    "4286",  # Time when initial screen shown
    "4288",  # Time to answer
    "4289",  # Time screen exited
    "399",  # Number of incorrect matches in round
    "400",  # Time to complete round
    "6334",  # Screen layout
    "396",  # Number of columns displayed in round
    "397",  # Number of rows displayed in round
    "398",  # Number of correct matches in round
]

# Extract the cognitive-function columns (plus `eid`).
temp = get_data_fields(fields, data, data_field)

# Turn the listed codes (-1, -3, -10) and the two non-answer labels into NaN.
temp = temp.replace(
    dict.fromkeys(
        [-1, -3, -10, "Do not know", "Prefer not to answer"],
        np.nan,
    )
)

# Keep only columns that have at least 400000 non-missing values.
temp = temp.dropna(axis="columns", thresh=400000)
display(temp.head())

temp.to_feather(
    os.path.join(dataset_path, "temp_cognitive_function.feather")
)

In [None]:
# Imaging-derived fields: showcase field IDs to extract, one per line, with
# the field label as a trailing comment.
fields_imaging = [
    "22402",  # Proton density fat fraction (PDFF)
    "22401",  # Liver inflammation factor (LIF)
    "22417",  # Liver iron corrected T1 (ct1)
    "22400",  # Liver iron (Fe)
    "20202",  # Pancreatic fat - DICOM
    "22436",  # 10P Liver PDFF (proton density fat fraction)
    "23363",  # 10P liver PDFF mean error indicator
    "23358",  # ASAT error indicator
    "22434",  # Abdominal fat ratio
    "23359",  # Anterior thigh error indicator (left)
    "23361",  # Anterior thigh error indicator (right)
    "24353",  # Anterior thigh muscle fat infiltration (MFI) (left)
    "24354",  # Anterior thigh muscle fat infiltration (MFI) (right)
    "24352",  # FR liver PDFF mean
    "23364",  # FR liver PDFF mean error indicator
    "22435",  # Muscle fat infiltration
    "23360",  # Posterior thigh error indicator (left)
    "23362",  # Posterior thigh error indicator (right)
    "23355",  # Posterior thigh muscle fat infiltration (MFI) (left)
    "23356",  # Posterior thigh muscle fat infiltration (MFI) (right)
    "22432",  # Total abdominal adipose tissue index
    "23357",  # VAT error indicator
    "22433",  # Weight-to-muscle ratio
    "22410",  # Total trunk fat volume
    "22415",  # Total adipose tissue volume
    "22416",  # Total lean tissue volume
    "22408",  # Abdominal subcutaneous adipose tissue volume (ASAT)
    "22407",  # Visceral adipose tissue volume (VAT)
    "22409",  # Total thigh fat-free muscle volume
    "22405",  # Anterior thigh fat-free muscle volume (left)
    "22403",  # Anterior thigh fat-free muscle volume (right)
    "22406",  # Posterior thigh fat-free muscle volume (left)
    "22404",  # Posterior thigh fat-free muscle volume (right)
    "21084",  # Lung volume
    "21080",  # Liver volume
    "21088",  # Liver PDFF (fat fraction)
    "21089",  # Liver iron
    "21087",  # Pancreas volume
    "21090",  # Pancreas PDFF (fat fraction)
    "21091",  # Pancreas iron
    "21083",  # Spleen volume
    "21081",  # Left kidney volume
    "21082",  # Right kidney volume
    "21086",  # Subcutaneous fat volume
    "21085",  # Visceral fat volume
    "24120",  # Ascending aorta distensibility
    "24118",  # Ascending aorta maximum area
    "24119",  # Ascending aorta minimum area
    "24123",  # Descending aorta distensibility
    "24121",  # Descending aorta maximum area
    "24122",  # Descending aorta minimum area
    "24113",  # LA ejection fraction
    "24110",  # LA maximum volume
    "24111",  # LA minimum volume
    "24112",  # LA stroke volume
    "24104",  # LV cardiac output
    "24141",  # LV circumferential strain AHA 1
    "24150",  # LV circumferential strain AHA 10
    "24151",  # LV circumferential strain AHA 11
    "24152",  # LV circumferential strain AHA 12
    "24153",  # LV circumferential strain AHA 13
    "24154",  # LV circumferential strain AHA 14
    "24155",  # LV circumferential strain AHA 15
    "24156",  # LV circumferential strain AHA 16
    "24142",  # LV circumferential strain AHA 2
    "24143",  # LV circumferential strain AHA 3
    "24144",  # LV circumferential strain AHA 4
    "24145",  # LV circumferential strain AHA 5
    "24146",  # LV circumferential strain AHA 6
    "24147",  # LV circumferential strain AHA 7
    "24148",  # LV circumferential strain AHA 8
    "24149",  # LV circumferential strain AHA 9
    "24157",  # LV circumferential strain global
    "24103",  # LV ejection fraction
    "24100",  # LV end diastolic volume
    "24101",  # LV end systolic volume
    "24175",  # LV longitudinal strain Segment 1
    "24176",  # LV longitudinal strain Segment 2
    "24177",  # LV longitudinal strain Segment 3
    "24178",  # LV longitudinal strain Segment 4
    "24179",  # LV longitudinal strain Segment 5
    "24180",  # LV longitudinal strain Segment 6
    "24181",  # LV longitudinal strain global
    "24124",  # LV mean myocardial wall thickness AHA 1
    "24133",  # LV mean myocardial wall thickness AHA 10
    "24134",  # LV mean myocardial wall thickness AHA 11
    "24135",  # LV mean myocardial wall thickness AHA 12
    "24136",  # LV mean myocardial wall thickness AHA 13
    "24137",  # LV mean myocardial wall thickness AHA 14
    "24138",  # LV mean myocardial wall thickness AHA 15
    "24139",  # LV mean myocardial wall thickness AHA 16
    "24125",  # LV mean myocardial wall thickness AHA 2
    "24126",  # LV mean myocardial wall thickness AHA 3
    "24127",  # LV mean myocardial wall thickness AHA 4
    "24128",  # LV mean myocardial wall thickness AHA 5
    "24129",  # LV mean myocardial wall thickness AHA 6
    "24130",  # LV mean myocardial wall thickness AHA 7
    "24131",  # LV mean myocardial wall thickness AHA 8
    "24132",  # LV mean myocardial wall thickness AHA 9
    "24140",  # LV mean myocardial wall thickness global
    "24105",  # LV myocardial mass
    "24158",  # LV radial strain AHA 1
    "24167",  # LV radial strain AHA 10
    "24168",  # LV radial strain AHA 11
    "24169",  # LV radial strain AHA 12
    "24170",  # LV radial strain AHA 13
    "24171",  # LV radial strain AHA 14
    "24172",  # LV radial strain AHA 15
    "24173",  # LV radial strain AHA 16
    "24159",  # LV radial strain AHA 2
    "24160",  # LV radial strain AHA 3
    "24161",  # LV radial strain AHA 4
    "24162",  # LV radial strain AHA 5
    "24163",  # LV radial strain AHA 6
    "24164",  # LV radial strain AHA 7
    "24165",  # LV radial strain AHA 8
    "24166",  # LV radial strain AHA 9
    "24174",  # LV radial strain global
    "24102",  # LV stroke volume
    "24117",  # RA ejection fraction
    "24114",  # RA maximum volume
    "24115",  # RA minimum volume
    "24116",  # RA stroke volume
    "24109",  # RV ejection fraction
    "24106",  # RV end diastolic volume
    "24107",  # RV end systolic volume
    "24108",  # RV stroke volume
    "22426",  # Average heart rate
    "22427",  # Body surface area
    "22425",  # Cardiac index
    "22424",  # Cardiac output
    "22420",  # LV ejection fraction
    "22421",  # LV end diastolic volume
    "22422",  # LV end systolic volume
    "22423",  # LV stroke volume
    "12681",  # Augmentation index for PWA
    "12695",  # Blood pressure test start time
    "12702",  # Cardiac index during PWA
    "12682",  # Cardiac output during PWA
    "12680",  # Central augmentation pressure during PWA
    "12678",  # Central pulse pressure during PWA
    "12677",  # Central systolic blood pressure during PWA
    "12698",  # Diastolic brachial blood pressure
    "12675",  # Diastolic brachial blood pressure during PWA
    "12683",  # End systolic pressure during PWA
    "12684",  # End systolic pressure index during PWA
    "12673",  # Heart rate during PWA
    "12687",  # Mean arterial pressure during PWA
    "12679",  # Number of beats in waveform average for PWA
    "12676",  # Peripheral pulse pressure during PWA
    "12686",  # Stroke volume during PWA
    "12697",  # Systolic brachial blood pressure
    "12674",  # Systolic brachial blood pressure during PWA
    "12685",  # Total peripheral resistance during PWA
    "12671",  # PWA start time
    "12699",  # Number of PWA tests performed
    "12700",  # Vicorder results plausible
    "23305",  # Head bone area
    "23220",  # Arm BMC (bone mineral content) (left)
    "23222",  # Arm BMC (bone mineral content) (right)
    "23317",  # Arms combined bone area
    "23221",  # Arm BMD (bone mineral density) (left)
    "23223",  # Arm BMD (bone mineral density) (right)
    "23313",  # Arm bone area (left)
    "23314",  # Arm bone area (right)
    "23309",  # Ribs bone area
    "23224",  # Arms BMC (bone mineral content)
    "23225",  # Arms BMD (bone mineral density)
    "23311",  # Spine bone area
    "23302",  # Femur lower neck BMD (bone mineral density) (left)
    "23206",  # Femur lower neck BMD (bone mineral density) (right)
    "23304",  # Trunk bone area
    "23307",  # Pelvis bone area
    "23318",  # Legs combined bone area
    "23315",  # Leg bone area (left)
    "23207",  # Femur lower neck BMD (bone mineral density) T-score (right)
    "23294",  # Femur lower neck BMD (bone mineral density) T-score (left)
    "23299",  # Femur neck BMD (bone mineral density) (left)
    "23208",  # Femur neck BMD (bone mineral density) (right)
    "23316",  # Leg bone area (right)
    "23327",  # Femur neck BMC (bone mineral content) (left)
    "23328",  # Femur neck BMC (bone mineral content) (right)
    "23325",  # Femur neck bone area (left)
    "23326",  # Femur neck bone area (right)
    "23331",  # Femur shaft BMC (bone mineral content) (left)
    "23332",  # Femur shaft BMC (bone mineral content) (right)
    "23329",  # Femur shaft bone area (left)
    "23330",  # Femur shaft bone area (right)
    "23335",  # Femur total BMC (bone mineral content) (left)
    "23336",  # Femur total BMC (bone mineral content) (right)
    "23333",  # Femur total area (left)
    "23334",  # Femur total area (right)
    "23339",  # Femur troch BMC (bone mineral content) (left)
    "23340",  # Femur troch BMC (bone mineral content) (right)
    "23337",  # Femur troch bone area (left)
    "23338",  # Femur troch bone area (right)
    "23343",  # Femur wards BMC (bone mineral content) (left)
    "23344",  # Femur wards BMC (bone mineral content) (right)
    "23341",  # Femur wards bone area (left)
    "23342",  # Femur wards bone area (right)
    "23300",  # Femur neck BMD (bone mineral density) T-score (left)
    "23209",  # Femur neck BMD (bone mineral density) T-score (right)
    "23290",  # Femur shaft BMD (bone mineral density) (left)
    "23210",  # Femur shaft BMD (bone mineral density) (right)
    "23303",  # Femur shaft BMD (bone mineral density) T-score (left)
    "23211",  # Femur shaft BMD (bone mineral density) T-score (right)
    "23291",  # Femur total BMD (bone mineral density) (left)
    "23212",  # Femur total BMD (bone mineral density) (right)
    "23293",  # Femur total BMD (bone mineral density) T-score (left)
    "23213",  # Femur total BMD (bone mineral density) T-score (right)
    "23295",  # Femur troch BMD (bone mineral density) (left)
    "23214",  # Femur troch BMD (bone mineral density) (right)
    "23298",  # Femur troch BMD (bone mineral density) T-score (left)
    "23215",  # Femur troch BMD (bone mineral density) T-score (right)
    "23200",  # L1-L4 area
    "23201",  # L1-L4 average height
    "23202",  # L1-L4 average width
    "21005",  # L1-L4 TBS (trabecular bone score)
    "23292",  # Femur upper neck BMD (bone mineral density) (left)
    "23216",  # Femur upper neck BMD (bone mineral density) (right)
    "23296",  # Femur upper neck BMD (bone mineral density) T-score (left)
    "23217",  # Femur upper neck BMD (bone mineral density) T-score (right)
    "23297",  # Femur wards BMD (bone mineral density) (left)
    "23218",  # Femur wards BMD (bone mineral density) (right)
    "23301",  # Femur wards BMD (bone mineral density) T-score (left)
    "23219",  # Femur wards BMD (bone mineral density) T-score (right)
    "23306",  # Head BMC (bone mineral content)
    "23226",  # Head BMD (bone mineral density)
    "23203",  # L1-L4 BMC (bone mineral content)
    "23204",  # L1-L4 BMD (bone mineral density)
    "23205",  # L1-L4 BMD (bone mineral density) T-score
    "23320",  # Leg BMC (bone mineral content) (left)
    "23228",  # Leg BMC (bone mineral content) (right)
    "23227",  # Leg BMD (bone mineral density) (left)
    "23229",  # Leg BMD (bone mineral density) (right)
    "23230",  # Legs BMC (bone mineral content)
    "23231",  # Legs BMD (bone mineral density)
    "23308",  # Pelvis BMC (bone mineral content)
    "23232",  # Pelvis BMD (bone mineral density)
    "23310",  # Ribs BMC (bone mineral content)
    "23233",  # Ribs BMD (bone mineral density)
    "23312",  # Spine BMC (bone mineral content)
    "23234",  # Spine BMD (bone mineral density)
    "23235",  # Total BMC (bone mineral content)
    "23236",  # Total BMD (bone mineral density)
    "23237",  # Total BMD (bone mineral density) (left)
    "23238",  # Total BMD (bone mineral density) (right)
    "23239",  # Total BMD (bone mineral density) T-score
    "23240",  # Trunk BMC (bone mineral content)
    "23241",  # Trunk BMD (bone mineral density)
    "23242",  # Trunk BMD (bone mineral density) (left)
    "23243",  # Trunk BMD (bone mineral density) (right)
    "23244",  # Android bone mass
    "23245",  # Android fat mass
    "23246",  # Android lean mass
    "23247",  # Android tissue fat percentage
    "23248",  # Android total mass
    "23249",  # Arm fat mass (left)
    "23253",  # Arm fat mass (right)
    "23250",  # Arm lean mass (left)
    "23254",  # Arm lean mass (right)
    "23251",  # Arm tissue fat percentage (left)
    "23255",  # Arm tissue fat percentage (right)
    "23252",  # Arm total mass (left)
    "23256",  # Arm total mass (right)
    "23257",  # Arms fat mass
    "23258",  # Arms lean mass
    "23259",  # Arms tissue fat percentage
    "23260",  # Arms total mass
    "23261",  # Gynoid bone mass
    "23262",  # Gynoid fat mass
    "23263",  # Gynoid lean mass
    "23264",  # Gynoid tissue fat percentage
    "23265",  # Gynoid total mass
    "23266",  # Leg fat mass (left)
    "23270",  # Leg fat mass (right)
    "23267",  # Leg lean mass (left)
    "23271",  # Leg lean mass (right)
    "23268",  # Leg tissue fat percentage (left)
    "23272",  # Leg tissue fat percentage (right)
    "23269",  # Leg total mass (left)
    "23273",  # Leg total mass (right)
    "23274",  # Legs fat mass
    "23275",  # Legs lean mass
    "23276",  # Legs tissue fat percentage
    "23277",  # Legs total mass
    "23278",  # Total fat mass
    "23279",  # Total fat-free mass
    "23280",  # Total lean mass
    "23281",  # Total tissue fat percentage
    "23282",  # Total tissue mass
    "23283",  # Total mass
    "23284",  # Trunk fat mass
    "23285",  # Trunk lean mass
    "23286",  # Trunk tissue fat percentage
    "23287",  # Trunk total mass
    "23288",  # VAT (visceral adipose tissue) mass
    "23289",  # VAT (visceral adipose tissue) volume
]

# Select `eid` plus the imaging columns picked out by `get_fields` (those whose
# `field.tab` matches the `f.<id>.0.<n>` pattern) and persist them.
imaging = get_data_fields(fields_imaging, data, data_field)
temp = imaging  # scratch alias used throughout the notebook; same object
print(len(imaging))
display(imaging.head())

imaging_path = os.path.join(dataset_path, 'temp_imaging.feather')
imaging.to_feather(imaging_path)

### Lab measurements

In [None]:
# UK Biobank showcase field IDs for the laboratory covariates, one list per
# group. IDs are kept as strings; the order of each list is preserved.

# Blood-sampling context
fields_context = [
    "20049",  # Blood sample #, note contents
    "20050",  # Reason blood sampling not attempted
    "35",  # Was blood sampling attempted
    "68",  # Number of blood samples taken
    "3166",  # Time blood sample collected
    "74",  # Fasting time
]

# Blood count
fields_blood_count = [
    "30160",  # Basophill count
    "30220",  # Basophill percentage
    "30150",  # Eosinophill count
    "30210",  # Eosinophill percentage
    "30030",  # Haematocrit percentage
    "30020",  # Haemoglobin concentration
    "30300",  # High light scatter reticulocyte count
    "30290",  # High light scatter reticulocyte percentage
    "30280",  # Immature reticulocyte fraction
    "30120",  # Lymphocyte count
    "30180",  # Lymphocyte percentage
    "30050",  # Mean corpuscular haemoglobin
    "30060",  # Mean corpuscular haemoglobin concentration
    "30040",  # Mean corpuscular volume
    "30100",  # Mean platelet (thrombocyte) volume
    "30260",  # Mean reticulocyte volume
    "30270",  # Mean sphered cell volume
    "30130",  # Monocyte count
    "30190",  # Monocyte percentage
    "30140",  # Neutrophill count
    "30200",  # Neutrophill percentage
    "30170",  # Nucleated red blood cell count
    "30230",  # Nucleated red blood cell percentage
    "30080",  # Platelet count
    "30090",  # Platelet crit
    "30110",  # Platelet distribution width
    "30010",  # Red blood cell (erythrocyte) count
    "30070",  # Red blood cell (erythrocyte) distribution width
    "30250",  # Reticulocyte count
    "30240",  # Reticulocyte percentage
    "30000",  # White blood cell (leukocyte) count
]

# Blood biochemistry
fields_blood_biochemistry = [
    "30620",  # Alanine aminotransferase
    "30600",  # Albumin
    "30610",  # Alkaline phosphatase
    "30630",  # Apolipoprotein A
    "30640",  # Apolipoprotein B
    "30650",  # Aspartate aminotransferase
    "30710",  # C-reactive protein
    "30680",  # Calcium
    "30690",  # Cholesterol
    "30700",  # Creatinine
    "30720",  # Cystatin C
    "30660",  # Direct bilirubin
    "30730",  # Gamma glutamyltransferase
    "30740",  # Glucose
    "30750",  # Glycated haemoglobin (HbA1c)
    "30760",  # HDL cholesterol
    "30770",  # IGF-1
    "30780",  # LDL direct
    "30790",  # Lipoprotein A
    "30800",  # Oestradiol
    "30810",  # Phosphate
    "30820",  # Rheumatoid factor
    "30830",  # SHBG
    "30850",  # Testosterone
    "30840",  # Total bilirubin
    "30860",  # Total protein
    "30870",  # Triglycerides
    "30880",  # Urate
    "30670",  # Urea
    "30890",  # Vitamin D
]

# Infectious-disease antigen assays
fields_blood_infectious = [
    "23000",  # 1gG antigen for Herpes Simplex virus-1
    "23001",  # 2mgG unique antigen for Herpes Simplex virus-2
    "23049",  # Antigen assay QC indicator
    "23048",  # Antigen assay date
    "23026",  # BK VP1 antigen for Human Polyomavirus BKV
    "23039",  # CagA antigen for Helicobacter pylori
    "23043",  # Catalase antigen for Helicobacter pylori
    "23018",  # Core antigen for Hepatitis C Virus
    "23030",  # E6 antigen for Human Papillomavirus type-16
    "23031",  # E7 antigen for Human Papillomavirus type-16
    "23006",  # EA-D antigen for Epstein-Barr Virus
    "23004",  # EBNA-1 antigen for Epstein-Barr Virus
    "23042",  # GroEL antigen for Helicobacter pylori
    "23016",  # HBc antigen for Hepatitis B Virus
    "23017",  # HBe antigen for Hepatitis B Virus
    "23025",  # HIV-1 env antigen for Human Immunodeficiency Virus
    "23024",  # HIV-1 gag antigen for Human Immunodeficiency Virus
    "23023",  # HTLV-1 env antigen for Human T-Lymphotropic Virus 1
    "23022",  # HTLV-1 gag antigen for Human T-Lymphotropic Virus 1
    "23010",  # IE1A antigen for Human Herpesvirus-6
    "23011",  # IE1B antigen for Human Herpesvirus-6
    "23027",  # JC VP1 antigen for Human Polyomavirus JCV
    "23015",  # K8.1 antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23029",  # L1 antigen for Human Papillomavirus type-16
    "23032",  # L1 antigen for Human Papillomavirus type-18
    "23014",  # LANA antigen for Kaposi's Sarcoma-Associated Herpesvirus
    "23028",  # MC VP1 antigen for Merkel Cell Polyomavirus
    "23019",  # NS3 antigen for Hepatitis C Virus
    "23041",  # OMP antigen for Helicobacter pylori
    "23037",  # PorB antigen for Chlamydia trachomatis
    "23013",  # U14 antigen for Human Herpesvirus-7
    "23044",  # UreA antigen for Helicobacter pylori
    "23003",  # VCA p18 antigen for Epstein-Barr Virus
    "23040",  # VacA antigen for Helicobacter pylori
    "23005",  # ZEBRA antigen for Epstein-Barr Virus
    "23002",  # gE / gI antigen for Varicella Zoster Virus
    "23034",  # momp A antigen for Chlamydia trachomatis
    "23033",  # momp D antigen for Chlamydia trachomatis
    "23012",  # p101 k antigen for Human Herpesvirus-6
    "23020",  # p22 antigen for Toxoplasma gondii
    "23038",  # pGP3 antigen for Chlamydia trachomatis
    "23009",  # pp 28 antigen for Human Cytomegalovirus
    "23008",  # pp 52 antigen for Human Cytomegalovirus
    "23007",  # pp150 Nter antigen for Human Cytomegalovirus
    "23021",  # sag1 antigen for Toxoplasma gondii
    "23035",  # tarp-D F1 antigen for Chlamydia trachomatis
    "23036",  # tarp-D F2 antigen for Chlamydia trachomatis
]

# Urine assays
fields_urine = [
    "30510",  # Creatinine (enzymatic) in urine
    "30515",  # Creatinine (enzymatic) in urine result flag
    "30500",  # Microalbumin in urine
    "30505",  # Microalbumin in urine result flag
    "30520",  # Potassium in urine
    "30525",  # Potassium in urine result flag
    "30530",  # Sodium in urine
    "30535",  # Sodium in urine result flag
]
# Combine the lab field groups (urine first, then sampling context, blood
# count, biochemistry and infectious-disease antigens) and extract them.
fields_labs = (
    fields_urine
    + fields_context
    + fields_blood_count
    + fields_blood_biochemistry
    + fields_blood_infectious
)
labs = get_data_fields(fields_labs, data, data_field)
temp = labs  # scratch alias used throughout the notebook; same object
print(len(labs))
display(labs.head())

labs_path = os.path.join(dataset_path, 'temp_labs.feather')
labs.to_feather(labs_path)

### Genomics

In [None]:
# UK Biobank showcase field IDs for the genomics covariates (as strings,
# original order preserved).
fields_genomics = [
    # --- "standard" PRS ---
    "26202",  # PRS for standard age at menopause (AAM)
    "26204",  # PRS for standard age-related macular degeneration (AMD)
    "26206",  # PRS for standard alzheimer's disease (AD)
    "26210",  # PRS for standard asthma (AST)
    "26212",  # PRS for standard atrial fibrillation (AF)
    "26214",  # PRS for standard bipolar disorder (BD)
    "26216",  # PRS for standard body mass index (BMI)
    "26218",  # PRS for standard bowel cancer (CRC)
    "26220",  # PRS for standard breast cancer (BC)
    "26223",  # PRS for standard cardiovascular disease (CVD)
    "26225",  # PRS for standard coeliac disease (CED)
    "26227",  # PRS for standard coronary artery disease (CAD)
    "26229",  # PRS for standard crohn's disease (CD)
    "26232",  # PRS for standard epithelial ovarian cancer (EOC)
    "26234",  # PRS for standard estimated bone mineral density t-score (EBMDT)
    "26238",  # PRS for standard glycated haemoglobin (HBA1C_DF)
    "26240",  # PRS for standard height (HEIGHT)
    "26242",  # PRS for standard high density lipoprotein cholesterol (HDL)
    "26244",  # PRS for standard hypertension (HT)
    "26246",  # PRS for standard intraocular pressure (IOP)
    "26248",  # PRS for standard ischaemic stroke (ISS)
    "26250",  # PRS for standard low density lipoprotein cholesterol (LDL_SF)
    "26252",  # PRS for standard melanoma (MEL)
    "26254",  # PRS for standard multiple sclerosis (MS)
    "26258",  # PRS for standard osteoporosis (OP)
    "26260",  # PRS for standard parkinson's disease (PD)
    "26265",  # PRS for standard primary open angle glaucoma (POAG)
    "26267",  # PRS for standard prostate cancer (PC)
    "26269",  # PRS for standard psoriasis (PSO)
    "26273",  # PRS for standard rheumatoid arthritis (RA)
    "26275",  # PRS for standard schizophrenia (SCZ)
    "26278",  # PRS for standard systemic lupus erythematosus (SLE)
    "26283",  # PRS for standard type 1 diabetes (T1D)
    "26285",  # PRS for standard type 2 diabetes (T2D)
    "26287",  # PRS for standard ulcerative colitis (UC)
    "26289",  # PRS for standard venous thromboembolic disease (VTE)
    # --- "enhanced" PRS ---
    "26203",  # PRS for enhanced age at menopause (AAM)
    "26205",  # PRS for enhanced age-related macular degeneration (AMD)
    "26207",  # PRS for enhanced alzheimer's disease (AD)
    "26208",  # PRS for enhanced apolipoprotein a1 (APOEA)
    "26209",  # PRS for enhanced apolipoprotein b (APOEB)
    "26211",  # PRS for enhanced asthma (AST)
    "26213",  # PRS for enhanced atrial fibrillation (AF)
    "26215",  # PRS for enhanced bipolar disorder (BD)
    "26217",  # PRS for enhanced body mass index (BMI)
    "26219",  # PRS for enhanced bowel cancer (CRC)
    "26221",  # PRS for enhanced breast cancer (BC)
    "26222",  # PRS for enhanced calcium (CAL)
    "26224",  # PRS for enhanced cardiovascular disease (CVD)
    "26226",  # PRS for enhanced coeliac disease (CED)
    "26228",  # PRS for enhanced coronary artery disease (CAD)
    "26230",  # PRS for enhanced crohn's disease (CD)
    "26231",  # PRS for enhanced docosahexaenoic acid (DOA)
    "26233",  # PRS for enhanced epithelial ovarian cancer (EOC)
    "26235",  # PRS for enhanced estimated bone mineral density t-score (EBMDT)
    "26236",  # PRS for enhanced estimated glomerular filtration rate (creatinine based) (EGCR)
    "26237",  # PRS for enhanced estimated glomerular filtration rate (cystatin based) (EGCY)
    "26239",  # PRS for enhanced glycated haemoglobin (HBA1C_DF)
    "26241",  # PRS for enhanced height (HEIGHT)
    "26243",  # PRS for enhanced high density lipoprotein cholesterol (HDL)
    "26245",  # PRS for enhanced hypertension (HT)
    "26247",  # PRS for enhanced intraocular pressure (IOP)
    "26249",  # PRS for enhanced ischaemic stroke (ISS)
    "26251",  # PRS for enhanced low density lipoprotein cholesterol (LDL_SF)
    "26253",  # PRS for enhanced melanoma (MEL)
    "26255",  # PRS for enhanced multiple sclerosis (MS)
    "26256",  # PRS for enhanced omega-3 fatty acids (OTFA)
    "26257",  # PRS for enhanced omega-6 fatty acids (OSFA)
    "26259",  # PRS for enhanced osteoporosis (OP)
    "26261",  # PRS for enhanced parkinson's disease (PD)
    "26262",  # PRS for enhanced phosphatidylcholines (PDCL)
    "26263",  # PRS for enhanced phosphoglycerides (PHG)
    "26264",  # PRS for enhanced polyunsaturated fatty acids (PFA)
    "26266",  # PRS for enhanced primary open angle glaucoma (POAG)
    "26268",  # PRS for enhanced prostate cancer (PC)
    "26270",  # PRS for enhanced psoriasis (PSO)
    "26271",  # PRS for enhanced remnant cholesterol (Non-HDL, Non-LDL cholesterol) (RMNC)
    "26272",  # PRS for enhanced resting heart rate (RHR)
    "26274",  # PRS for enhanced rheumatoid arthritis (RA)
    "26276",  # PRS for enhanced schizophrenia (SCZ)
    "26277",  # PRS for enhanced sphingomyelins (SGM)
    "26279",  # PRS for enhanced systemic lupus erythematosus (SLE)
    "26280",  # PRS for enhanced total cholesterol (TCH)
    "26281",  # PRS for enhanced total fatty acids (TFA)
    "26282",  # PRS for enhanced total triglycerides (TTG)
    "26284",  # PRS for enhanced type 1 diabetes (T1D)
    "26286",  # PRS for enhanced type 2 diabetes (T2D)
    "26288",  # PRS for enhanced ulcerative colitis (UC)
    "26290",  # PRS for enhanced venous thromboembolic disease (VTE)
    # --- blood type and telomere measurements ---
    "23165",  # Blood-type haplotype
    "22191",  # Adjusted T/S ratio
    "22194",  # T/S ratio for regression dilution bias
    "22193",  # Telomere measurement plate #
    "22190",  # Unadjusted T/S ratio
    "22192",  # Z-adjusted T/S log
]

# Extract `eid` plus the genomics columns and persist them.
genomics = get_data_fields(fields_genomics, data, data_field)
temp = genomics  # scratch alias used throughout the notebook; same object
print(len(genomics))
display(genomics.head())

genomics_path = os.path.join(dataset_path, 'temp_genomics.feather')
genomics.to_feather(genomics_path)

### Metabolomics

In [None]:
# UK Biobank showcase field IDs for the NMR metabolomics covariates (as
# strings, original order preserved).
fields_metabolomics = [
    # --- quality flags and sample-processing fields ---
    "23652",  # High Lactate
    "23653",  # High Pyruvate
    "23654",  # Low Glucose
    "23655",  # Low Protein
    "23651",  # Measurement Quality Flagged
    "23658",  # Sample Measured Date and Time
    "23659",  # Sample Prepared Date and Time
    "23649",  # Shipment Plate
    "23650",  # Spectrometer
    "23660",  # Well position within plate
    # --- measured quantities ---
    "23474",  # 3-Hydroxybutyrate
    "23475",  # Acetate
    "23476",  # Acetoacetate
    "23477",  # Acetone
    "23460",  # Alanine
    "23479",  # Albumin
    "23440",  # Apolipoprotein A1
    "23439",  # Apolipoprotein B
    "23441",  # Apolipoprotein B to Apolipoprotein A1 ratio
    "23433",  # Average Diameter for HDL Particles
    "23432",  # Average Diameter for LDL Particles
    "23431",  # Average Diameter for VLDL Particles
    "23484",  # Cholesterol in Chylomicrons and Extremely Large VLDL
    "23526",  # Cholesterol in IDL
    "23561",  # Cholesterol in Large HDL
    "23533",  # Cholesterol in Large LDL
    "23498",  # Cholesterol in Large VLDL
    "23568",  # Cholesterol in Medium HDL
    "23540",  # Cholesterol in Medium LDL
    "23505",  # Cholesterol in Medium VLDL
    "23575",  # Cholesterol in Small HDL
    "23547",  # Cholesterol in Small LDL
    "23512",  # Cholesterol in Small VLDL
    "23554",  # Cholesterol in Very Large HDL
    "23491",  # Cholesterol in Very Large VLDL
    "23519",  # Cholesterol in Very Small VLDL
    "23580",  # Cholesterol to Total Lipids in Chylomicrons and Extremely Large VLDL percentage
    "23610",  # Cholesterol to Total Lipids in IDL percentage
    "23635",  # Cholesterol to Total Lipids in Large HDL percentage
    "23615",  # Cholesterol to Total Lipids in Large LDL percentage
    "23590",  # Cholesterol to Total Lipids in Large VLDL percentage
    "23640",  # Cholesterol to Total Lipids in Medium HDL percentage
    "23620",  # Cholesterol to Total Lipids in Medium LDL percentage
    "23595",  # Cholesterol to Total Lipids in Medium VLDL percentage
    "23645",  # Cholesterol to Total Lipids in Small HDL percentage
    "23625",  # Cholesterol to Total Lipids in Small LDL percentage
    "23600",  # Cholesterol to Total Lipids in Small VLDL percentage
    "23630",  # Cholesterol to Total Lipids in Very Large HDL percentage
    "23585",  # Cholesterol to Total Lipids in Very Large VLDL percentage
    "23605",  # Cholesterol to Total Lipids in Very Small VLDL percentage
    "23485",  # Cholesteryl Esters in Chylomicrons and Extremely Large VLDL
    "23418",  # Cholesteryl Esters in HDL
    "23527",  # Cholesteryl Esters in IDL
    "23417",  # Cholesteryl Esters in LDL
    "23562",  # Cholesteryl Esters in Large HDL
    "23534",  # Cholesteryl Esters in Large LDL
    "23499",  # Cholesteryl Esters in Large VLDL
    "23569",  # Cholesteryl Esters in Medium HDL
    "23541",  # Cholesteryl Esters in Medium LDL
    "23506",  # Cholesteryl Esters in Medium VLDL
    "23576",  # Cholesteryl Esters in Small HDL
    "23548",  # Cholesteryl Esters in Small LDL
    "23513",  # Cholesteryl Esters in Small VLDL
    "23416",  # Cholesteryl Esters in VLDL
    "23555",  # Cholesteryl Esters in Very Large HDL
    "23492",  # Cholesteryl Esters in Very Large VLDL
    "23520",  # Cholesteryl Esters in Very Small VLDL
    "23581",  # Cholesteryl Esters to Total Lipids in Chylomicrons and Extremely Large VLDL percentage
    "23611",  # Cholesteryl Esters to Total Lipids in IDL percentage
    "23636",  # Cholesteryl Esters to Total Lipids in Large HDL percentage
    "23616",  # Cholesteryl Esters to Total Lipids in Large LDL percentage
    "23591",  # Cholesteryl Esters to Total Lipids in Large VLDL percentage
    "23641",  # Cholesteryl Esters to Total Lipids in Medium HDL percentage
    "23621",  # Cholesteryl Esters to Total Lipids in Medium LDL percentage
    "23596",  # Cholesteryl Esters to Total Lipids in Medium VLDL percentage
    "23646",  # Cholesteryl Esters to Total Lipids in Small HDL percentage
    "23626",  # Cholesteryl Esters to Total Lipids in Small LDL percentage
    "23601",  # Cholesteryl Esters to Total Lipids in Small VLDL percentage
    "23631",  # Cholesteryl Esters to Total Lipids in Very Large HDL percentage
    "23586",  # Cholesteryl Esters to Total Lipids in Very Large VLDL percentage
    "23606",  # Cholesteryl Esters to Total Lipids in Very Small VLDL percentage
    "23473",  # Citrate
    "23404",  # Clinical LDL Cholesterol
    "23481",  # Concentration of Chylomicrons and Extremely Large VLDL Particles
    "23430",  # Concentration of HDL Particles
    "23523",  # Concentration of IDL Particles
    "23429",  # Concentration of LDL Particles
    "23558",  # Concentration of Large HDL Particles
    "23530",  # Concentration of Large LDL Particles
    "23495",  # Concentration of Large VLDL Particles
    "23565",  # Concentration of Medium HDL Particles
    "23537",  # Concentration of Medium LDL Particles
    "23502",  # Concentration of Medium VLDL Particles
    "23572",  # Concentration of Small HDL Particles
    "23544",  # Concentration of Small LDL Particles
    "23509",  # Concentration of Small VLDL Particles
    "23428",  # Concentration of VLDL Particles
    "23551",  # Concentration of Very Large HDL Particles
    "23488",  # Concentration of Very Large VLDL Particles
    "23516",  # Concentration of Very Small VLDL Particles
    "23478",  # Creatinine
    "23443",  # Degree of Unsaturation
    "23450",  # Docosahexaenoic Acid
    "23457",  # Docosahexaenoic Acid to Total Fatty Acids percentage
    "23486",  # Free Cholesterol in Chylomicrons and Extremely Large VLDL
    "23422",  # Free Cholesterol in HDL
    "23528",  # Free Cholesterol in IDL
    "23421",  # Free Cholesterol in LDL
    "23563",  # Free Cholesterol in Large HDL
    "23535",  # Free Cholesterol in Large LDL
    "23500",  # Free Cholesterol in Large VLDL
    "23570",  # Free Cholesterol in Medium HDL
    "23542",  # Free Cholesterol in Medium LDL
    "23507",  # Free Cholesterol in Medium VLDL
    "23577",  # Free Cholesterol in Small HDL
    "23549",  # Free Cholesterol in Small LDL
    "23514",  # Free Cholesterol in Small VLDL
    "23420",  # Free Cholesterol in VLDL
    "23556",  # Free Cholesterol in Very Large HDL
    "23493",  # Free Cholesterol in Very Large VLDL
    "23521",  # Free Cholesterol in Very Small VLDL
    "23582",  # Free Cholesterol to Total Lipids in Chylomicrons and Extremely Large VLDL percentage
    "23612",  # Free Cholesterol to Total Lipids in IDL percentage
    "23637",  # Free Cholesterol to Total Lipids in Large HDL percentage
    "23617",  # Free Cholesterol to Total Lipids in Large LDL percentage
    "23592",  # Free Cholesterol to Total Lipids in Large VLDL percentage
    "23642",  # Free Cholesterol to Total Lipids in Medium HDL percentage
    "23622",  # Free Cholesterol to Total Lipids in Medium LDL percentage
    "23597",  # Free Cholesterol to Total Lipids in Medium VLDL percentage
    "23647",  # Free Cholesterol to Total Lipids in Small HDL percentage
    "23627",  # Free Cholesterol to Total Lipids in Small LDL percentage
    "23602",  # Free Cholesterol to Total Lipids in Small VLDL percentage
    "23632",  # Free Cholesterol to Total Lipids in Very Large HDL percentage
    "23587",  # Free Cholesterol to Total Lipids in Very Large VLDL percentage
    "23607",  # Free Cholesterol to Total Lipids in Very Small VLDL percentage
    "23470",  # Glucose
    "23461",  # Glutamine
    "23462",  # Glycine
    "23480",  # Glycoprotein Acetyls
    "23406",  # HDL Cholesterol
    "23463",  # Histidine
    "23465",  # Isoleucine
    "23405",  # LDL Cholesterol
    "23471",  # Lactate
    "23466",  # Leucine
    "23449",  # Linoleic Acid
    "23456",  # Linoleic Acid to Total Fatty Acids percentage
    "23447",  # Monounsaturated Fatty Acids
    "23454",  # Monounsaturated Fatty Acids to Total Fatty Acids percentage
    "23444",  # Omega-3 Fatty Acids
    "23451",  # Omega-3 Fatty Acids to Total Fatty Acids percentage
    "23445",  # Omega-6 Fatty Acids
    "23459",  # Omega-6 Fatty Acids to Omega-3 Fatty Acids ratio
    "23452",  # Omega-6 Fatty Acids to Total Fatty Acids percentage
    "23468",  # Phenylalanine
    "23437",  # Phosphatidylcholines
    "23434",  # Phosphoglycerides
    "23483",  # Phospholipids in Chylomicrons and Extremely Large VLDL
    "23414",  # Phospholipids in HDL
    "23525",  # Phospholipids in IDL
    "23413",  # Phospholipids in LDL
    "23560",  # Phospholipids in Large HDL
    "23532",  # Phospholipids in Large LDL
    "23497",  # Phospholipids in Large VLDL
    "23567",  # Phospholipids in Medium HDL
    "23539",  # Phospholipids in Medium LDL
    "23504",  # Phospholipids in Medium VLDL
    "23574",  # Phospholipids in Small HDL
    "23546",  # Phospholipids in Small LDL
    "23511",  # Phospholipids in Small VLDL
    "23412",  # Phospholipids in VLDL
    "23553",  # Phospholipids in Very Large HDL
    "23490",  # Phospholipids in Very Large VLDL
    "23518",  # Phospholipids in Very Small VLDL
    "23579",  # Phospholipids to Total Lipids in Chylomicrons and Extremely Large VLDL percentage
    "23609",  # Phospholipids to Total Lipids in IDL percentage
    "23634",  # Phospholipids to Total Lipids in Large HDL percentage
    "23614",  # Phospholipids to Total Lipids in Large LDL percentage
    "23589",  # Phospholipids to Total Lipids in Large VLDL percentage
    "23639",  # Phospholipids to Total Lipids in Medium HDL percentage
    "23619",  # Phospholipids to Total Lipids in Medium LDL percentage
    "23594",  # Phospholipids to Total Lipids in Medium VLDL percentage
    "23644",  # Phospholipids to Total Lipids in Small HDL percentage
    "23624",  # Phospholipids to Total Lipids in Small LDL percentage
    "23599",  # Phospholipids to Total Lipids in Small VLDL percentage
    "23629",  # Phospholipids to Total Lipids in Very Large HDL percentage
    "23584",  # Phospholipids to Total Lipids in Very Large VLDL percentage
    "23604",  # Phospholipids to Total Lipids in Very Small VLDL percentage
    "23446",  # Polyunsaturated Fatty Acids
    "23458",  # Polyunsaturated Fatty Acids to Monounsaturated Fatty Acids ratio
    "23453",  # Polyunsaturated Fatty Acids to Total Fatty Acids percentage
    "23472",  # Pyruvate
    "23402",  # Remnant Cholesterol (Non-HDL, Non-LDL -Cholesterol)
    "23448",  # Saturated Fatty Acids
    "23455",  # Saturated Fatty Acids to Total Fatty Acids percentage
    "23438",  # Sphingomyelins
    "23400",  # Total Cholesterol
    "23401",  # Total Cholesterol Minus HDL-C
    "23436",  # Total Cholines
    "23464",  # Total Concentration of Branched-Chain Amino Acids (Leucine + Isoleucine + Valine)
    "23427",  # Total Concentration of Lipoprotein Particles
    "23415",  # Total Esterified Cholesterol
    "23442",  # Total Fatty Acids
    "23419",  # Total Free Cholesterol
    "23482",  # Total Lipids in Chylomicrons and Extremely Large VLDL
    "23426",  # Total Lipids in HDL
    "23524",  # Total Lipids in IDL
    "23425",  # Total Lipids in LDL
    "23559",  # Total Lipids in Large HDL
    "23531",  # Total Lipids in Large LDL
    "23496",  # Total Lipids in Large VLDL
    "23423",  # Total Lipids in Lipoprotein Particles
    "23566",  # Total Lipids in Medium HDL
    "23538",  # Total Lipids in Medium LDL
    "23503",  # Total Lipids in Medium VLDL
    "23573",  # Total Lipids in Small HDL
    "23545",  # Total Lipids in Small LDL
    "23510",  # Total Lipids in Small VLDL
    "23424",  # Total Lipids in VLDL
    "23552",  # Total Lipids in Very Large HDL
    "23489",  # Total Lipids in Very Large VLDL
    "23517",  # Total Lipids in Very Small VLDL
    "23411",  # Total Phospholipids in Lipoprotein Particles
    "23407",  # Total Triglycerides
    "23487",  # Triglycerides in Chylomicrons and Extremely Large VLDL
    "23410",  # Triglycerides in HDL
    "23529",  # Triglycerides in IDL
    "23409",  # Triglycerides in LDL
    "23564",  # Triglycerides in Large HDL
    "23536",  # Triglycerides in Large LDL
    "23501",  # Triglycerides in Large VLDL
    "23571",  # Triglycerides in Medium HDL
    "23543",  # Triglycerides in Medium LDL
    "23508",  # Triglycerides in Medium VLDL
    "23578",  # Triglycerides in Small HDL
    "23550",  # Triglycerides in Small LDL
    "23515",  # Triglycerides in Small VLDL
    "23408",  # Triglycerides in VLDL
    "23557",  # Triglycerides in Very Large HDL
    "23494",  # Triglycerides in Very Large VLDL
    "23522",  # Triglycerides in Very Small VLDL
    "23435",  # Triglycerides to Phosphoglycerides ratio
    "23583",  # Triglycerides to Total Lipids in Chylomicrons and Extremely Large VLDL percentage
    "23613",  # Triglycerides to Total Lipids in IDL percentage
    "23638",  # Triglycerides to Total Lipids in Large HDL percentage
    "23618",  # Triglycerides to Total Lipids in Large LDL percentage
    "23593",  # Triglycerides to Total Lipids in Large VLDL percentage
    "23643",  # Triglycerides to Total Lipids in Medium HDL percentage
    "23623",  # Triglycerides to Total Lipids in Medium LDL percentage
    "23598",  # Triglycerides to Total Lipids in Medium VLDL percentage
    "23648",  # Triglycerides to Total Lipids in Small HDL percentage
    "23628",  # Triglycerides to Total Lipids in Small LDL percentage
    "23603",  # Triglycerides to Total Lipids in Small VLDL percentage
    "23633",  # Triglycerides to Total Lipids in Very Large HDL percentage
    "23588",  # Triglycerides to Total Lipids in Very Large VLDL percentage
    "23608",  # Triglycerides to Total Lipids in Very Small VLDL percentage
    "23469",  # Tyrosine
    "23403",  # VLDL Cholesterol
    "23467",  # Valine
]

# Extract the NMR metabolomics columns; `temp` is the notebook's scratch alias
# and refers to the same DataFrame object as `metabolomics`.
metabolomics = temp = get_data_fields(fields_metabolomics, data, data_field)

# Prefix every column except the leading `eid` with "NMR_".
metabolomics.columns = ["eid"] + [f"NMR_{col}" for col in metabolomics.columns[1:]]

# NMR_FLAG is True for rows whose spectrometer entry is a string and False
# otherwise (e.g. a missing value). `isinstance` replaces the former
# `True if type(x)==str else False` construct.
metabolomics["NMR_FLAG"] = [
    isinstance(spectrometer, str)
    for spectrometer in metabolomics["NMR_spectrometer_f23650_0_0"].tolist()
]

print(len(metabolomics))
display(metabolomics.sample(10))

metabolomics.to_feather(os.path.join(dataset_path, 'temp_metabolomics.feather'))

### Family History

In [None]:
# Family-history conditions that are turned into indicator columns; the list
# is also written to disk as YAML.
fh_list = [
    "Heart disease",
    "Stroke",
    "High blood pressure",
    "Diabetes",
    "Lung cancer",
    "Severe depression",
    "Parkinson's disease",
    "Alzheimer's disease/dementia",
    "Chronic bronchitis/emphysema",
    "Breast cancer",
    "Bowel cancer",
]
with open(os.path.join(dataset_path, 'fh_list.yaml'), 'w') as file:
    yaml.dump(fh_list, file, default_flow_style=False)

fields_family_history = [
    "20107",  # Family history
    "20110",  # Family history
]

raw = get_data_fields(fields_family_history, data, data_field)

# Wide -> long: one row per (eid, reported condition); the source column name
# is not needed afterwards.
temp = pd.melt(
    raw,
    id_vars=["eid"],
    value_vars=raw.drop("eid", axis=1).columns.to_list(),
    var_name="field",
    value_name="family_history",
).drop("field", axis=1)

# Keep only the conditions of interest and normalise their labels
# (lower-case, spaces replaced by underscores).
temp = temp[temp["family_history"].isin(fh_list)]
temp = temp.assign(
    family_history=temp["family_history"].str.lower().replace(" ", "_", regex=True)
)

# One indicator per (eid, condition), pivoted to one `fh_<condition>` column
# per condition.
temp = temp.drop_duplicates().sort_values("eid").reset_index(drop=True).assign(n=True)
temp = pd.pivot_table(
    temp, index="eid", columns="family_history", values="n", observed=True
).add_prefix('fh_')

# Left-join onto all participants so those without any listed condition get
# rows too. The boolean conversion is applied to the fh_ columns only, so
# `eid` is never touched by it (the previous whole-frame
# `.replace({0: False, 1: True})` also ran over `eid`).
temp = data[["eid"]].copy().merge(temp, how="left", on="eid")
fh_flags = temp.drop(columns="eid").fillna(0.0).astype(bool).add_suffix("_0_0")
family_history = temp = pd.concat([temp[["eid"]], fh_flags], axis=1)

print(len(temp))
# A bare `temp.head()` in the middle of a cell renders nothing; use display().
display(temp.head())

family_history.to_feather(os.path.join(dataset_path, 'temp_family_history.feather'))

### Medications

In [None]:
# https://list.essentialmeds.org/?showRemoved=0
# essential medicines WHO?!

In [None]:
def clean_med(raw):
    """Turn a medication/ATC group name into a lower-case, underscore-separated token.

    The characters ``-``, ``.``, ``,`` and ``/`` are treated as spaces, double
    spaces are collapsed in a single pass, and every remaining space becomes an
    underscore. Because the collapse runs only once, three or more consecutive
    separators still yield repeated underscores.
    """
    separators_to_space = str.maketrans("-.,/", "    ")
    spaced = raw.lower().translate(separators_to_space)
    collapsed = spaced.replace("  ", " ")
    return collapsed.replace(" ", "_")

# Mapping table joined on `UKBB_code` to obtain `ATC_code` further below.
atc_mapping = pd.read_csv(f"{mapping_path}/atc/atc_matched_list.csv")

# Athena concept table (tab-separated); the two columns used for filtering
# are cast to pandas' string dtype.
athena_concepts = pd.read_csv(f"{mapping_path}/athena/CONCEPT.csv", sep="\t").astype(
    {"vocabulary_id": "string", "concept_class_id": "string"}
)
atc_concepts = athena_concepts[athena_concepts.vocabulary_id == "ATC"]
atc2_concepts = atc_concepts[atc_concepts.concept_class_id == "ATC 2nd"].sort_values("concept_code")

# One entry per "ATC 2nd" concept: "ATC_<code>_<cleaned name>" -> [<code>].
medication_list = {
    f"ATC_{atc}_{clean_med(name)}": [atc]
    for name, atc in zip(
        atc2_concepts.concept_name.to_list(),
        atc2_concepts.concept_code.to_list(),
    )
}

with open(os.path.join(dataset_path, 'medication_list.yaml'), 'w') as file:
    yaml.dump(medication_list, file, default_flow_style=False)

In [None]:
def had_medication_before(data, data_field, medications, atc_mapping):
    """Build one boolean column per medication group from field 20003.

    Parameters
    ----------
    data : pandas.DataFrame
        Participant table; must contain an ``eid`` column and the columns of
        field 20003 (resolved through ``get_data_fields``).
    data_field : pandas.DataFrame
        Field metadata passed on to ``get_data_fields``.
    medications : dict
        Maps an output name to a list of ATC code prefixes. A participant is
        flagged for that name if any of their mapped ATC codes starts with
        one of the prefixes (case-insensitive).
    atc_mapping : pandas.DataFrame
        Lookup table with ``UKBB_code`` and ``ATC_code`` columns.

    Returns
    -------
    pandas.DataFrame
        ``eid`` plus one boolean column ``"<name>_0_0"`` per entry of
        ``medications``, sorted by ``eid`` with a fresh default index.

    Notes
    -----
    Changes from the previous implementation:

    * ``medications`` is now actually used; before, the function silently
      read the module-level ``medication_list`` instead of its argument.
    * Flags are computed with ``eid.isin(...)``. The old code assigned the
      column from a merged frame, which has a fresh 0..n-1 index, so values
      were aligned by index label and mis-assigned whenever ``data`` did not
      carry a default index (e.g. after rows had been dropped).
    * Flag columns are plain ``bool`` and are attached in one ``concat``
      rather than re-merging an ever-wider frame on each iteration.
    """
    raw = get_data_fields(["20003"], data, data_field)

    # Wide -> long: one row per distinct (eid, reported medication code).
    reported = (
        pd.melt(raw, id_vars=["eid"], var_name="field", value_name="UKBB_code")
        .drop("field", axis=1)
        .drop_duplicates()
    )

    # Missing entries stringify to "None" / "nan" / "<NA>"; drop them before
    # converting the remaining codes to integers for the join.
    code_text = reported["UKBB_code"].astype(str)
    has_code = ~code_text.isin(["None", "nan", "<NA>"])
    reported = reported.loc[has_code, ["eid"]].assign(
        UKBB_code=pd.to_numeric(code_text[has_code]).astype("int64")
    )

    # Attach ATC codes and keep only the rows that could be mapped.
    coded = reported.merge(atc_mapping, how="left", on="UKBB_code").dropna(subset=["ATC_code"])
    atc_codes = coded["ATC_code"].astype("string")

    result = data[["eid"]].copy()
    flags = {}
    for med, med_codes in tqdm(medications.items()):
        # Anchor every prefix at the start of the code: "^A02|^A03".
        regex_str = "^" + "|^".join(med_codes)
        takers = coded.loc[atc_codes.str.contains(regex_str, case=False), "eid"]
        flags[f"{med}_0_0"] = result["eid"].isin(takers)

    if flags:
        result = pd.concat([result, pd.DataFrame(flags, index=result.index)], axis=1)

    return result.sort_values("eid").reset_index(drop=True)

In [None]:
# Derive the per-participant medication flags and persist them.
medications = had_medication_before(data, data_field, medication_list, atc_mapping)
print(len(medications))
# A bare `medications.head(10)` in the middle of a cell renders nothing;
# use display() like the other cells do.
display(medications.head(10))

medications.to_feather(os.path.join(dataset_path, 'temp_medications.feather'))

## Merge Everything

In [None]:
# Reload the intermediate per-domain tables from `dataset_path`, keyed by
# domain name. Insertion order is the order in which they are merged later.
# NOTE: temp_imaging.feather and temp_genomics.feather are written earlier in
# this notebook but are not part of this set.
_baseline_sources = {
    "basics": 'temp_basics.feather',
    "questionnaire": 'temp_questionnaire.feather',
    "measurements": 'temp_measurements.feather',
    "labs": 'temp_labs.feather',
    "metabolomics": 'temp_metabolomics.feather',
    "family_history": 'temp_family_history.feather',
    "medications": 'temp_medications.feather',
}
data_dfs_dict = {
    name: pd.read_feather(os.path.join(dataset_path, filename))
    for name, filename in _baseline_sources.items()
}

In [None]:
import pandas as pd
from functools import reduce

# Fold all per-domain frames into one wide table by joining successively on
# `eid`, in the insertion order of `data_dfs_dict`. The join type is the
# pandas default (inner), so only `eid`s present in every frame are kept.
baseline_frames = list(data_dfs_dict.values())
data_baseline = reduce(
    lambda merged, frame: merged.merge(frame, on='eid'),
    baseline_frames,
)

In [None]:
# Persist the merged baseline covariate table.
baseline_path = os.path.join(dataset_path, 'baseline_covariates.feather')
data_baseline.to_feather(baseline_path)