#### Load data about patients diagnosed with vascular dementia

In [4]:
# Project-local module; only the DataFields name is pulled in.
from DataFields import DataFields

# Location of the CSV file that the cells below read with pandas.
UKBB_PATH = "~/biobank/ukb672220.csv"

# NOTE(review): this binds the DataFields class object itself, not an
# instance -- confirm whether `DataFields()` was intended.
datafields = DataFields


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# List every column name in the CSV. nrows=0 makes pandas parse only the
# header line, so no data rows are loaded.
header_frame = pd.read_csv(UKBB_PATH, nrows=0)
columns = header_frame.columns.tolist()
print(columns)

In [None]:
# Stream the CSV in chunks, keep every row that has a vascular-dementia report
# date in either of the two source columns, derive a combined date column and
# a 0/1 flag, then pickle the collected rows.
#
# NOTE(review): `fields_dict` is not defined in the cells visible here;
# presumably it maps human-readable field names to CSV column names --
# confirm where it is created before running this cell.
fields = list(fields_dict.values())
vd_1 = fields_dict["Vascular Dementia Date First Reported"]
vd_2 = fields_dict["Date of vascular dementia report"]
chunk_size = 20000

counter = 0
print("Working...")

# Filtered chunks are gathered in a list and concatenated once after the loop.
# Calling pd.concat inside the loop re-copies every row accumulated so far on
# each iteration, which grows quadratically with the number of chunks.
filtered_chunks = []

for chunk in pd.read_csv(UKBB_PATH, usecols=fields, chunksize=chunk_size, low_memory=False):
    # Keep rows where at least one of the two date columns is populated.
    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of `chunk`.
    has_report = chunk[vd_1].notna() | chunk[vd_2].notna()
    filtered_chunk = chunk[has_report].copy()

    # Prefer vd_1 and fall back to vd_2 where vd_1 is missing; values that
    # cannot be parsed as dates become NaT (errors="coerce").
    filtered_chunk["VascularDementiaReportDate"] = pd.to_datetime(
        filtered_chunk[vd_1].combine_first(filtered_chunk[vd_2]), errors="coerce"
    )

    # 1 when the combined date lies in [1950-01-01, 2030-12-31], otherwise 0
    # (NaT compares False, so unparseable dates are flagged 0).
    # NOTE(review): the bounds presumably screen out placeholder dates used by
    # the data provider -- confirm against the field's data coding.
    filtered_chunk["hasVascularDementia"] = (
        filtered_chunk["VascularDementiaReportDate"]
        .between(pd.Timestamp("1950-01-01"), pd.Timestamp("2030-12-31"))
        .astype(int)
    )

    filtered_chunks.append(filtered_chunk)

    # Running total of matching rows, printed as a progress indicator.
    counter += len(filtered_chunk)
    print(counter)

# Single concatenation of everything collected. pd.concat raises on an empty
# list, so fall back to an empty frame if the reader produced no chunks.
if filtered_chunks:
    result_df = pd.concat(filtered_chunks, ignore_index=True)
else:
    result_df = pd.DataFrame()

# Save the result
result_df.to_pickle('vascular_dementia_filtered.pkl')

# Display the first 30 rows
result_df.head(30)