#### Define the fields used from the UK Biobank and extract the patients diagnosed with vascular dementia

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from ProjectFunctions import get_columns_from_chunk

# Path of the UK Biobank CSV export that is read chunk by chunk further down.
UKBB_PATH = "~/biobank/ukb672220.csv"

# Name of the column used to select rows: only rows with a non-missing value
# in this column are kept.
VD_COL = "Vascular Dementia Report Date"

# The features used in this analysis, as imported from the DataFields module
# (described by the original author as a dictionary — TODO confirm against
# DataFields). Passed to get_columns_from_chunk for every chunk.
datafields = DataFields

In [None]:
# Filter by rows to extract patients with vascular dementia
def filter_by_column(chunk, column, *, earliest="1950-01-01", latest="2030-12-31"):
    """Keep the rows of ``chunk`` that have a value in ``column``, parsed as dates.

    Rows whose ``column`` entry is missing are dropped. The remaining entries
    are converted with ``pd.to_datetime(..., errors="coerce")``. Entries that
    cannot be parsed, or that fall outside ``[earliest, latest]`` (bounds
    inclusive), are replaced by ``NaT``; their rows are still kept.

    Args:
        chunk: DataFrame to filter. It is not modified.
        column: Name of the date column to filter on.
        earliest: Lower bound of the accepted date range; anything accepted
            by ``pd.Timestamp``.
        latest: Upper bound of the accepted date range; anything accepted
            by ``pd.Timestamp``.

    Returns:
        A new DataFrame holding the rows with a non-missing ``column`` value,
        with ``column`` stored as a datetime column.
    """
    filtered_chunk = chunk.loc[chunk[column].notna()].copy()

    dates = pd.to_datetime(filtered_chunk[column], errors="coerce")
    in_range = dates.between(pd.Timestamp(earliest), pd.Timestamp(latest))

    # Assign the column directly rather than through ``.loc[:, column]``:
    # on pandas >= 2.0 ``.loc`` writes into the existing column in place, which
    # would leave the parsed dates stored with the original (object) dtype.
    filtered_chunk[column] = dates.where(in_range)
    return filtered_chunk

#### Extract rows diagnosed with vascular dementia

In [None]:
chunk_size = 10000

# Collect the per-chunk results and concatenate them once after the loop.
# Growing a DataFrame with pd.concat inside the loop copies every row gathered
# so far on each iteration.
diagnosed_chunks = []

for chunk in pd.read_csv(UKBB_PATH, chunksize=chunk_size, low_memory=False):
    # Project helper imported from ProjectFunctions; called with the configured
    # data fields and oldest=True.
    filtered_chunk = get_columns_from_chunk(chunk, datafields, oldest=True)

    # Keep only the rows that have a value in the vascular dementia column.
    filtered_chunk = filter_by_column(filtered_chunk, VD_COL)
    diagnosed_chunks.append(filtered_chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader yielded no chunks at all.
if diagnosed_chunks:
    result_df = pd.concat(diagnosed_chunks, ignore_index=True)
else:
    result_df = pd.DataFrame()

# Save the result
result_df.to_csv('diagnosed.csv', index=False)

result_df.describe()