#### Define the fields used from the UK Biobank and extract the patients diagnosed with vascular dementia

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import get_columns_from_chunk

# Location of the UK Biobank CSV file that is read in chunks further down
UKBB_PATH = "~/biobank/ukb672220.csv"

# Definitions of the features used in this notebook, as imported from the
# DataFields module; passed to get_columns_from_chunk to select columns.
# NOTE(review): presumably a dictionary (per the original comment) -- confirm in DataFields
datafields = DataFields

# Features of conditions specified by date, as imported from the DataFields
# module. Not referenced by the cells shown here.
datereportedfields = DateReportedFields

In [None]:
# Filter by rows to extract patients with vascular dementia
def filter_by_vascular_dementia(chunk):
    """Keep the rows of ``chunk`` that carry a vascular dementia date and label them.

    A row is retained when at least one of the columns
    "Vascular Dementia Date First Reported" or
    "Date of vascular dementia report" is not missing. Two columns are added
    to the retained rows:

    * "Vascular Dementia Report Date": the first column's value, falling back
      to the second column's value, parsed as a datetime (unparsable values
      become NaT).
    * "Has Vascular Dementia": 1 when that date lies between 1950-01-01 and
      2030-12-31 (both inclusive), otherwise 0.

    Args:
        chunk: DataFrame containing both source date columns.

    Returns:
        A new DataFrame (a copy; ``chunk`` itself is not modified) holding the
        retained rows with their original index and the two added columns.
    """
    first_reported = "Vascular Dementia Date First Reported"
    report_date = "Date of vascular dementia report"
    merged_date = "Vascular Dementia Report Date"
    label = "Has Vascular Dementia"

    # A row is dropped only when both source date columns are missing.
    both_missing = chunk[first_reported].isna() & chunk[report_date].isna()
    diagnosed = chunk[~both_missing].copy()  # copy so the caller's frame is untouched

    # Prefer the "first reported" value and fall back to the second field.
    raw_dates = diagnosed[first_reported].combine_first(diagnosed[report_date])
    diagnosed.loc[:, merged_date] = pd.to_datetime(raw_dates, errors="coerce")

    # Dates outside the accepted window (and NaT) are treated as unspecified
    # entries and get the label 0.
    earliest = pd.Timestamp("1950-01-01")
    latest = pd.Timestamp("2030-12-31")
    in_window = (diagnosed[merged_date] >= earliest) & (diagnosed[merged_date] <= latest)
    diagnosed.loc[:, label] = in_window.astype(int)

    return diagnosed


### Extract the rows of patients diagnosed with vascular dementia using the two date fields

In [None]:
# Number of CSV rows loaded per chunk while streaming the UK Biobank file
chunk_size = 10000

# The two source date columns and the merged date column produced by
# filter_by_vascular_dementia; they are only needed to build the label and
# are dropped from the final result.
col_1_filter = "Vascular Dementia Date First Reported"
col_2_filter = "Date of vascular dementia report"
col_combined = "Vascular Dementia Report Date"

# Collect the filtered chunks and concatenate once after the loop: calling
# pd.concat inside the loop re-copies every accumulated row on each iteration.
filtered_chunks = []

for chunk in pd.read_csv(UKBB_PATH, chunksize=chunk_size, low_memory=False):

    # Keep only the columns of interest, then only the rows with a
    # vascular dementia date
    filtered_chunk = get_columns_from_chunk(chunk, datafields, oldest=True)
    filtered_chunk = filter_by_vascular_dementia(filtered_chunk)

    filtered_chunks.append(filtered_chunk)

# pd.concat raises on an empty list, so fall back to an empty frame when the
# reader produced no chunks at all
result_df = (
    pd.concat(filtered_chunks, ignore_index=True)
    if filtered_chunks
    else pd.DataFrame()
)

# Drop columns which are not necessary anymore
result_df = result_df.drop(columns=[col_1_filter, col_2_filter, col_combined])

# Save the result
result_df.to_csv('diagnosed.csv', index=False)

# Display summary statistics of the extracted rows
result_df.describe()