In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import get_columns_from_chunk, convert_date_to_binary, drop_rows_with_na_greater_than
from ProjectFunctions import map_education_levels, map_vascular_levels

# Path to the raw CSV export (presumably the UK Biobank basket) that
# sample_by_condition streams in chunks when drawing undiagnosed samples.
UKBB_PATH = "~/biobank/ukb672220.csv"

# A dictionary containing the used features.
# It is passed to get_columns_from_chunk to select the columns kept from each chunk.
datafields = DataFields

# Features of conditions specified by date.
# NOTE(review): the visible cells pass DateReportedFields directly to
# convert_date_to_binary, so this alias may be unused - confirm before removing.
datereportedfields = DateReportedFields

#### Measure the birth-year distribution (a proxy for age) of the diagnosed patients

In [None]:
# Load the reference cohort (presumably the diagnosed, imputed patients - see file name).
df = pd.read_csv("diagnosed_imputed.csv")

# Visual check: 10-bin histogram of the birth years.
df["Birth Year"].plot.hist(bins=10, edgecolor='black')

# The same 10-bin histogram in numeric form; the counts and edges drive the
# per-bin sampling of undiagnosed patients in a later cell.
birth_year_counts_histogram, birth_year_histogram_edges = np.histogram(
    df["Birth Year"],
    bins=10,
)
print(f"Counts {birth_year_counts_histogram}")
print(f"Edges: {birth_year_histogram_edges}")

# Summary statistics for every numeric column (shown as the cell output).
df.describe()

#### Extract samples of patients without a vascular dementia diagnosis, matched on birth year

In [None]:
# Columns that must be present (non-NA) for a row to be eligible for sampling.
# sample_by_condition passes this list to DataFrame.dropna(subset=...), so a row
# missing ANY of these values is discarded before sampling.
drop_if_na_list = [
    # Questionnaire / history fields
    "Smoking Status",
    "Education",
    "Ever Smoked",
    "Report of Vascular problems",
    
    # Blood biochemistry markers
    "Alanine aminotransferase",
    "Albumin",
    "Alkaline phosphatase",
    "Apolipoprotein A",
    "Apolipoprotein B",
    "Aspartate aminotransferase",
    "C-reactive protein",
    "Calcium",
    "Cholesterol",
    "Creatinine",
    "Cystatin C",
    "Direct bilirubin",
    "Gamma glutamyltransferase",
    "Glucose",
    "Glycated haemoglobin (HbA1c)",
    "HDL cholesterol",
    "IGF-1",
    "LDL direct",
    "Lipoprotein A",
    "Phosphate",
    "SHBG",
    "Testosterone",
    "Total bilirubin",
    "Total protein",
    "Triglycerides",
    "Urate",
    "Urea",
    "Vitamin D",
    
    # Blood cell count measurements
    "Basophil count",
    "Basophil percentage",
    "Eosinophil count",
    "Eosinophil percentage",
    "Haematocrit percentage",
    "Haemoglobin concentration",
    "High light scatter reticulocyte count",
    "High light scatter reticulocyte percentage",
    "Immature reticulocyte fraction",
    "Lymphocyte count",
    "Lymphocyte percentage",
    "Mean corpuscular haemoglobin",
    "Mean corpuscular haemoglobin concentration",
    "Mean corpuscular volume",
    "Mean platelet (thrombocyte) volume",
    "Mean reticulocyte volume",
    "Mean sphered cell volume",
    "Monocyte count",
    "Monocyte percentage",
    "Neutrophil count",
    "Neutrophil percentage",
    "Nucleated red blood cell count",
    "Nucleated red blood cell percentage",
    "Platelet count",
    "Platelet crit",
    "Platelet distribution width",
    "Red blood cell (erythrocyte) count",
    "Red blood cell (erythrocyte) distribution width",
    "Reticulocyte count",
    "Reticulocyte percentage",
    "White blood cell (leukocyte) count",
    # Blood pressure and pulse readings
    "Blood Pressure Diastolic",
    "Blood Pressure Systolic",
    "Pulse Rate at Blood Pressure"
]

In [None]:
def sample_by_condition(file_path, column, condition_function, count, samples_per_chunk, drop_na_count, chunksize=10000, random_state=None):
    """Randomly sample up to ``count`` rows that satisfy a condition, reading the CSV in chunks.

    The file is streamed from the start; each chunk is reduced to the project
    columns (``get_columns_from_chunk`` with the module-level ``datafields``),
    filtered by ``condition_function`` and stripped of rows that miss any
    column in the module-level ``drop_if_na_list``. At most
    ``samples_per_chunk`` rows are then drawn from what is left. Reading stops
    as soon as ``count`` rows have been collected, so the sample comes from
    the earliest chunks that contain enough matching rows.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to stream.
    column : str
        Column whose values are handed to ``condition_function``.
    condition_function : callable
        Receives the ``column`` Series of a chunk and returns a boolean mask.
    count : int
        Maximum number of rows to return.
    samples_per_chunk : int
        Maximum number of rows drawn from a single chunk.
    drop_na_count : int
        Currently unused; kept so existing callers keep working.
    chunksize : int, default 10000
        Number of CSV rows per chunk.
    random_state : int, numpy.random.RandomState or None, default None
        Seed (or generator) for reproducible sampling. ``None`` keeps the
        previous non-deterministic behaviour.

    Returns
    -------
    pandas.DataFrame
        Up to ``count`` sampled rows. When nothing matches, an empty frame
        that still carries the selected columns is returned (provided the file
        yielded at least one chunk), so callers can index columns safely.
    """
    if isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    sampled_chunks = []
    samples_collected = 0
    empty_template = None  # zero-row frame with the selected columns

    # The context manager closes the file handle even when we stop reading early.
    with pd.read_csv(file_path, chunksize=chunksize, low_memory=False) as reader:
        for chunk in reader:
            # remove unnecessary columns first.
            chunk = get_columns_from_chunk(chunk, datafields, oldest=True)
            if empty_template is None:
                empty_template = chunk.iloc[0:0]

            # Nothing (more) to collect, e.g. count <= 0.
            if samples_collected >= count:
                break

            matching = chunk.loc[condition_function(chunk[column])]
            # drop if missing value for some columns
            matching = matching.dropna(subset=drop_if_na_list)
            if matching.empty:
                continue

            k = min(samples_per_chunk, len(matching))
            sampled_chunks.append(matching.sample(n=k, random_state=rng))
            samples_collected += k

            # Stop before parsing another chunk once the quota is reached.
            if samples_collected >= count:
                break

    if not sampled_chunks:
        return empty_template if empty_template is not None else pd.DataFrame()

    # Concatenate once instead of growing a frame inside the loop.
    result = pd.concat(sampled_chunks, ignore_index=True)

    # The last chunk may overshoot the quota; trim uniformly at random.
    if samples_collected > count:
        return result.sample(n=count, random_state=rng)
    return result

In [None]:
per_chunk = 500 # maximum samples to take from a single chunk per histogram column
col_name = "Birth Year" # The column to use as a rule for "sample_by_condition"
vd_1 = "Vascular Dementia Date First Reported" # vd_1 and vd_2 used to drop diagnosed patients
vd_2 = "Date of vascular dementia report"
drop_na_count = 25 # drop rows with larger NA features count.

mult = 1.2 # multiply number of people per histogram
const = 20 # constant addition per histogram

# Per-bin samples are collected here and concatenated once after the loop.
bin_samples = []
last_bin = len(birth_year_counts_histogram) - 1

for i in range(len(birth_year_counts_histogram)):
    start, end = birth_year_histogram_edges[i], birth_year_histogram_edges[i+1]
    count = birth_year_counts_histogram[i]

    # np.histogram bins are half-open [start, end) except the last one, which
    # is closed [start, end]. Mirror that here so the maximum birth year is
    # not silently excluded from the undiagnosed sample.
    if i == last_bin:
        in_bin = lambda x, lo=start, hi=end: (x >= lo) & (x <= hi)
    else:
        in_bin = lambda x, lo=start, hi=end: (x >= lo) & (x < hi)

    # Oversample (mult, const) because diagnosed rows are removed afterwards.
    bin_sample = sample_by_condition(UKBB_PATH, col_name, in_bin, int(count*mult + const), per_chunk, drop_na_count)

    # No eligible rows for this bin: skip instead of failing on missing columns.
    if bin_sample.empty:
        continue

    # drop if has vascular dementia; copy so later processing works on an
    # independent frame rather than a slice of the sample.
    undiagnosed_mask = bin_sample[vd_1].isna() & bin_sample[vd_2].isna()
    bin_sample = bin_sample.loc[undiagnosed_mask].copy()

    bin_sample = convert_date_to_binary(bin_sample, DateReportedFields)

    bin_samples.append(bin_sample)

if not bin_samples:
    raise ValueError("No undiagnosed samples matched any birth-year bin")

undiagnosed_df = pd.concat(bin_samples, ignore_index=True)

# drop vascular dementia columns, and assign new with 0 (undiagnosed)
undiagnosed_df = undiagnosed_df.drop(columns=[vd_1, vd_2])
undiagnosed_df["Has Vascular Dementia"] = 0

# map education
undiagnosed_df = map_education_levels(undiagnosed_df)

# map vascular
undiagnosed_df = map_vascular_levels(undiagnosed_df)

undiagnosed_df = undiagnosed_df.drop(columns=["id"])

#### Save the CSV, and analyse the extracted data

In [None]:
from ProjectFunctions import count_na_in_dataframe

# Keep only rows that have a value in every column.
undiagnosed_df = undiagnosed_df.dropna(how="any")

# Count the number of NA per column (run after the dropna above).
count_na_in_dataframe(undiagnosed_df)

# Plot the birth-year histogram, to compare with the diagnosed patients.
undiagnosed_df["Birth Year"].plot.hist(bins=10, edgecolor='black')

# Persist the undiagnosed cohort; the DataFrame index is written as the first column.
undiagnosed_df.to_csv("undiagnosed.csv")

# Analyze distributions (shown as the cell output).
undiagnosed_df.describe()