In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields, DropNAList, DateReportedFields
from ProjectFunctions import get_columns_from_chunk, convert_date_to_binary, drop_rows_with_na_greater_than
from ProjectFunctions import map_education_levels, map_vascular_levels
from typing import Callable, Any

# Raw UK Biobank export that the undiagnosed samples are read from.
UKBB_PATH = "~/biobank/ukb672220.csv"

# Column tested with isna() further down to keep only undiagnosed participants.
VD_COL = "Vascular Dementia Report Date"

# Notebook-local aliases for the project definitions imported above:
#   datafields         - dictionary containing the used features
#   datereportedfields - features of conditions specified by date
datafields, datereportedfields = DataFields, DateReportedFields

#### Measure the birth-year distribution of the diagnosed patients

In [None]:
# Load the diagnosed cohort; its birth-year histogram decides how many
# undiagnosed patients are sampled for each birth-year range further down.
df = pd.read_csv("diagnosed_processed.csv")

# plot histogram of the birth year
df["Birth Year"].plot(kind='hist', bins=10, edgecolor='black')
birth_year_counts_histogram, birth_year_histogram_edges = np.histogram(df["Birth Year"], bins=10)

# Turn the float bin edges into integer bounds that are used downstream as
# half-open ranges [start, end).  For an integer year y and a float edge e,
# `y >= e` is the same as `y >= ceil(e)` and `y < e` the same as `y < ceil(e)`,
# so the edges must be rounded UP; rounding down shifts every interior range
# by one year relative to the bin the diagnosed patients were counted in.
# NOTE(review): this assumes "Birth Year" holds whole years - confirm.
birth_year_histogram_edges = np.ceil(birth_year_histogram_edges).astype(int)

# np.histogram closes its last bin on the right (it includes the maximum);
# push the final bound one year further so that `x < end` keeps that year too.
birth_year_histogram_edges[-1] += 1

#### Sample patients without a vascular dementia diagnosis, matched to the diagnosed patients by birth year

##### Define features to drop if na

In [None]:
# Features to drop on when missing, grouped by the kind of measurement.
# NOTE(review): no cell visible in this notebook reads `drop_if_na_list`;
# filter_chunk uses the imported `DropNAList` instead - confirm which is intended.

# Questionnaire / baseline answers
_baseline_fields = [
    "Birth Year",
    "Sex",
    "Education",
    "Smoking Status",
    "Ever Smoked",
    "Alcohol Intake Frequency",
    "Diabetes Diagnosed By Doctor",
    "Report of vascular problems",
]

# Blood Chem
_blood_chemistry_fields = [
    "Alanine aminotransferase",
    "Albumin",
    "Alkaline phosphatase",
    "Apolipoprotein A",
    "Apolipoprotein B",
    "Aspartate aminotransferase",
    "C-reactive protein",
    "Calcium",
    "Cholesterol",
    "Creatinine",
    "Cystatin C",
    "Direct bilirubin",
    "Gamma glutamyltransferase",
    "Glucose",
    "Glycated haemoglobin (HbA1c)",
    "HDL cholesterol",
    "IGF-1",
    "LDL direct",
    "Lipoprotein A",
    "Phosphate",
    "SHBG",
    "Testosterone",
    "Total bilirubin",
    "Total protein",
    "Triglycerides",
    "Urate",
    "Urea",
    "Vitamin D",
]

# Blood Count
_blood_count_fields = [
    "Basophil count",
    "Basophil percentage",
    "Eosinophil count",
    "Eosinophil percentage",
    "Haematocrit percentage",
    "Haemoglobin concentration",
    "High light scatter reticulocyte count",
    "High light scatter reticulocyte percentage",
    "Immature reticulocyte fraction",
    "Lymphocyte count",
    "Lymphocyte percentage",
    "Mean corpuscular haemoglobin",
    "Mean corpuscular haemoglobin concentration",
    "Mean corpuscular volume",
    "Mean platelet (thrombocyte) volume",
    "Mean reticulocyte volume",
    "Mean sphered cell volume",
    "Monocyte count",
    "Monocyte percentage",
    "Neutrophil count",
    "Neutrophil percentage",
    "Nucleated red blood cell count",
    "Nucleated red blood cell percentage",
    "Platelet count",
    "Platelet crit",
    "Platelet distribution width",
    "Red blood cell (erythrocyte) count",
    "Red blood cell (erythrocyte) distribution width",
    "Reticulocyte count",
    "Reticulocyte percentage",
    "White blood cell (leukocyte) count",
]

# Blood Pressure
_blood_pressure_fields = [
    "Blood Pressure Diastolic",
    "Blood Pressure Systolic",
    "Pulse Rate at Blood Pressure",
]

# Flat list, in the same order as the groups above.
drop_if_na_list = [
    *_baseline_fields,
    *_blood_chemistry_fields,
    *_blood_count_fields,
    *_blood_pressure_fields,
]

##### Filter each chunk and sample undiagnosed patients per birth-year range

In [None]:
def filter_chunk(chunk, column, condition):
    """Reduce one raw chunk to cleaned, undiagnosed rows that satisfy ``condition``.

    Steps, in order: keep only the project columns, map the education levels,
    recode the vascular-problems answers, keep rows with no vascular dementia
    report date, apply ``condition`` to ``column``, turn negative numeric
    values into NA, and finally pass the rows through
    ``drop_rows_with_na_greater_than``.

    Args:
        chunk: DataFrame holding one chunk of the raw CSV.
        column: Name of the column whose values are handed to ``condition``.
        condition: Callable receiving ``chunk[column]`` and returning a
            boolean mask of the rows to keep.

    Returns:
        The filtered DataFrame; it may be empty.
    """
    # remove unnecessary columns first.
    chunk = get_columns_from_chunk(chunk, datafields, oldest=True)

    chunk = map_education_levels(chunk)

    # Recode -7 to 0 and -3 to missing.
    # NOTE(review): presumably these are questionnaire codes for
    # "none of the above" / "prefer not to answer" - confirm.
    chunk["Report of vascular problems"] = chunk["Report of vascular problems"].replace({-7: 0, -3: pd.NA})

    # take only patients undiagnosed with vascular dementia
    chunk = chunk[chunk[VD_COL].isna()]

    # filter by condition function
    mask = condition(chunk[column])
    filtered_chunk = chunk.loc[mask]

    # convert items with value less than 0 to NA.
    # The condition must cover every column: DataFrame.mask aligns the
    # condition to the frame and replaces cells in any column the condition
    # lacks, which would blank out all non-numeric columns.  Non-numeric
    # columns are therefore added explicitly as "never replace".
    negative = (
        filtered_chunk.select_dtypes(include='number')
        .lt(0)
        .fillna(False)
        .reindex(columns=filtered_chunk.columns, fill_value=False)
        .astype(bool)
    )
    filtered_chunk = filtered_chunk.mask(negative)

    # drop rows with too many missing values
    # NOTE(review): presumably drops rows with more than 40 missing values
    # among DropNAList - confirm against ProjectFunctions.
    filtered_chunk = drop_rows_with_na_greater_than(filtered_chunk, x=40, include=DropNAList)

    return filtered_chunk


In [None]:
# Sample "count" rows from dataframe following a condition by chunks randomly.
def sample_by_condition(file_path, column, condition, count, samples_per_chunk, chunksize=10000,
                        random_state=None):
    """Randomly collect up to ``count`` filtered rows from a CSV, read chunk by chunk.

    Every chunk is passed through ``filter_chunk``; at most
    ``samples_per_chunk`` of the surviving rows are drawn at random from it.
    Reading stops as soon as ``count`` rows have been gathered, so the rows
    come from the leading chunks of the file only.  If the last chunk
    overshoots, the collection is sampled down to exactly ``count`` rows.

    Args:
        file_path: Path of the CSV file to read.
        column: Column name forwarded to ``filter_chunk``.
        condition: Callable forwarded to ``filter_chunk``.
        count: Number of rows wanted.
        samples_per_chunk: Maximum number of rows taken from a single chunk.
        chunksize: Number of CSV rows per chunk.
        random_state: Optional seed for reproducible sampling.  ``None``
            (the default) keeps the previous behaviour of drawing from
            NumPy's global random state.

    Returns:
        DataFrame with at most ``count`` rows and a fresh 0..n-1 index;
        empty when nothing matched or ``count`` is not positive.
    """
    # Nothing requested: do not open or parse the file at all.
    if count <= 0:
        return pd.DataFrame()

    # One generator shared by all draws so a single seed fixes the whole run.
    rng = np.random.RandomState(random_state) if random_state is not None else None

    collected = []
    samples_collected = 0

    # The context manager closes the file even when we stop reading early.
    with pd.read_csv(file_path, chunksize=chunksize, low_memory=False) as reader:
        for chunk in reader:
            filtered_chunk = filter_chunk(chunk, column, condition)

            if filtered_chunk.empty:
                continue

            # cap the contribution of a single chunk
            k = min(samples_per_chunk, len(filtered_chunk))
            collected.append(filtered_chunk.sample(n=k, random_state=rng))
            samples_collected += k

            if samples_collected >= count:
                break

    if not collected:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the growing frame per chunk.
    result = pd.concat(collected, ignore_index=True)

    # choose randomly if too many were collected
    if samples_collected > count:
        return result.sample(n=count, random_state=rng).reset_index(drop=True)
    return result

In [None]:
per_chunk = 100 # maximum samples to take from a single chunk per histogram column
col_name = "Birth Year" # The column to use as a rule for "sample_by_condition"

const = 20 # constant addition per histogram

# DataFrame.sample() draws from NumPy's global random state when it is given
# no random_state, so seeding it here makes the sampled set repeatable on rerun.
SAMPLING_SEED = 42
np.random.seed(SAMPLING_SEED)

# One sampled frame per birth-year range; concatenated once after the loop.
bin_samples = []
n_bins = len(birth_year_counts_histogram)

for i in range(n_bins):
    start, end = int(birth_year_histogram_edges[i]), int(birth_year_histogram_edges[i+1])
    count = int(birth_year_counts_histogram[i])
    target = count + const

    # Use a dedicated name rather than `df`, which still holds the diagnosed
    # cohort loaded earlier.  The bounds are bound as lambda defaults so the
    # condition does not depend on the loop variables after this iteration.
    bin_sample = sample_by_condition(UKBB_PATH,
                                     col_name,
                                     lambda x, lo=start, hi=end: (x >= lo) & (x < hi),
                                     target,
                                     per_chunk,
                                    )
    print(f"{i+1} / {n_bins}: range {start} - {end}, gathered {len(bin_sample)}/{target}")

    bin_samples.append(bin_sample)

undiagnosed_df = pd.concat(bin_samples, ignore_index=True) if bin_samples else pd.DataFrame()

# drop id column
undiagnosed_df = undiagnosed_df.drop(columns=["id"])

#### Save the CSV, and analyse the extracted data

In [None]:
from ProjectFunctions import count_na_in_dataframe

# Missing-value count for every column of the sampled frame.
count_na_in_dataframe(undiagnosed_df)

# Birth Year histogram of the sampled patients, to compare with the diagnosed ones.
undiagnosed_df["Birth Year"].plot.hist(bins=10, edgecolor='black')

# Persist the sample without the index column.
undiagnosed_df.to_csv("undiagnosed.csv", index=False)

# Summary statistics; kept as the last expression so the cell displays them.
undiagnosed_df.describe()