## Define the fields used from the UK Biobank and extract the patients diagnosed with vascular dementia

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from ProjectFunctions import get_columns_from_chunk

# Path to the UK Biobank CSV export; it is read chunk by chunk further down.
UKBB_PATH = "~/biobank/ukb672220.csv"

# Column holding the vascular dementia report date. Rows with a value in this
# column are the ones kept as "diagnosed".
VD_COL = "Vascular Dementia Report Date"

# The features used in this analysis, as declared in the DataFields module
# (presumably a dictionary -- confirm against DataFields.py).
datafields = DataFields

In [2]:
# Filter by rows to extract patients with vascular dementia
def filter_by_column(chunk, column, *, earliest="1950-01-01", latest="2030-12-31"):
    """Keep the rows of ``chunk`` that have a value in ``column``, parsed as dates.

    Rows where ``column`` is NaN/None are dropped. The remaining values are
    converted with ``pd.to_datetime(..., errors="coerce")``, so values that
    cannot be parsed become ``NaT``. Dates outside ``[earliest, latest]``
    (both ends inclusive) are also replaced by ``NaT``; those rows are kept,
    only their date is blanked.

    Args:
        chunk: DataFrame to filter. It is not modified.
        column: Name of the date column to filter on.
        earliest: First accepted date (anything ``pd.Timestamp`` accepts).
        latest: Last accepted date (anything ``pd.Timestamp`` accepts).

    Returns:
        A new DataFrame with the retained rows (original index preserved),
        in which ``column`` has a datetime64 dtype.

    Raises:
        KeyError: If ``column`` is not a column of ``chunk``.
    """
    # Copy so that the assignment below never writes into the caller's chunk.
    filtered_chunk = chunk[chunk[column].notna()].copy()

    dates = pd.to_datetime(filtered_chunk[column], errors="coerce")

    # NOTE(review): the window presumably discards placeholder dates used in
    # the export -- confirm against the data dictionary.
    in_window = dates.between(pd.Timestamp(earliest), pd.Timestamp(latest))

    # Replace the column outright instead of assigning through
    # ``.loc[:, column]``: on pandas >= 2.0 the ``.loc`` form writes in place
    # and would leave an object column holding Timestamps, not datetime64.
    filtered_chunk[column] = dates.where(in_window)
    return filtered_chunk

### Extract rows of patients diagnosed with vascular dementia

In [3]:
# Number of CSV rows read per iteration.
chunk_size = 10000

# Collect the per-chunk results and concatenate them once after the loop.
# Calling pd.concat inside the loop re-copies every previously collected row
# on each iteration, and seeding it with an empty DataFrame can influence the
# resulting dtypes.
diagnosed_chunks = []

for chunk in pd.read_csv(UKBB_PATH, chunksize=chunk_size, low_memory=False):
    # Project helper; presumably reduces the chunk to the configured data
    # fields -- see ProjectFunctions.get_columns_from_chunk.
    filtered_chunk = get_columns_from_chunk(chunk, datafields, oldest=True)

    # Keep only the rows that have a vascular dementia report date.
    filtered_chunk = filter_by_column(filtered_chunk, VD_COL)
    diagnosed_chunks.append(filtered_chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# file produced no chunks at all.
if diagnosed_chunks:
    result_df = pd.concat(diagnosed_chunks, ignore_index=True)
else:
    result_df = pd.DataFrame()

# Save the result
result_df.to_csv('diagnosed.csv', index=False)

# Summary statistics of the extracted rows (shown as the cell output).
result_df.describe()

Unnamed: 0,id,Birth Year,Sex,Education,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,Diabetes Diagnosed By Doctor,...,Platelet crit,Platelet distribution width,Red blood cell (erythrocyte) count,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure
count,2181.0,2181.0,2181.0,2132.0,2157.0,2176.0,2154.0,2176.0,678.0,2176.0,...,2033.0,2033.0,2033.0,2033.0,2002.0,2002.0,2033.0,1978.0,1978.0,1978.0
mean,3517504.0,1943.104998,0.58276,-1.002814,28.563543,0.657629,0.670381,3.213235,9.035398,0.204963,...,0.228823,16.542735,4.489264,13.726473,0.064325,1.435948,7.337614,81.997472,147.262386,70.920627
std,1441920.0,4.319199,0.493216,4.964065,5.143175,0.770112,0.470184,1.732589,6.193954,0.429159,...,0.052181,0.532093,0.444526,1.094835,0.035336,0.804845,1.957398,11.211197,21.113931,13.164777
min,1002012.0,1936.0,0.0,-7.0,15.7576,-3.0,0.0,-3.0,0.0,-3.0,...,0.064,15.2,1.72,11.49,0.012,0.263,2.37,47.0,78.0,35.0
25%,2301136.0,1940.0,0.0,-7.0,25.0694,0.0,0.0,2.0,0.0,0.0,...,0.195,16.2,4.2,13.1,0.045,1.02,6.02,74.0,133.0,62.0
50%,3520350.0,1942.0,1.0,1.0,27.8058,1.0,1.0,3.0,11.0,0.0,...,0.224,16.5,4.484,13.53,0.06,1.33,7.12,82.0,146.0,70.0
75%,4741406.0,1945.0,1.0,3.0,31.4358,1.0,1.0,5.0,11.0,0.0,...,0.257,16.86,4.773,14.1,0.078,1.73975,8.41,89.0,160.0,79.0
max,6022126.0,1968.0,1.0,6.0,54.5283,2.0,1.0,6.0,22.0,1.0,...,0.624,19.4,6.39,31.7,1.077,25.278,34.13,132.0,241.0,169.0
