#### Define the UK Biobank fields used in the analysis, and extract the patients diagnosed with vascular dementia

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import get_columns_from_chunk

# Dictionary of the features used in this notebook (defined in the project's DataFields module)
datafields = DataFields

# Features for conditions that are identified by the date they were reported
datereportedfields = DateReportedFields

# Location of the UK Biobank CSV export that the extraction cell reads in chunks
UKBB_PATH = "~/biobank/ukb672220.csv"

In [2]:
# Keep the rows that carry a vascular dementia report date and label them
def filter_by_vascular_dementia(chunk):
    """Select rows with a vascular dementia report date and label them.

    A row is kept when at least one of the two source date columns
    ("Vascular Dementia Date First Reported", "Date of vascular dementia
    report") is not NaN. Two columns are added to the returned copy:

    * "Vascular Dementia Report Date": the report date as a datetime. A date
      inside the plausibility window (1950-01-01 .. 2030-12-31) is preferred,
      looking at the first column and then the second. If neither column holds
      a plausible date, the first parseable raw date is kept so the
      out-of-window value stays visible.
    * "Has Vascular Dementia": 1 when the combined date lies inside the
      plausibility window, otherwise 0. Rows labelled 0 are NOT removed.

    Each column is validated on its own before the two are combined, so an
    out-of-window or unparseable value in the first column no longer hides a
    valid date in the second column.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain both source date columns; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the selected rows with the two extra columns.

    Raises
    ------
    KeyError
        If one of the source date columns is missing from ``chunk``.
    """
    col_1_filter = "Vascular Dementia Date First Reported"
    col_2_filter = "Date of vascular dementia report"
    col_combined = "Vascular Dementia Report Date"
    Y_col = "Has Vascular Dementia"

    # Dates outside this window are treated as unspecified entries
    earliest_valid = pd.Timestamp("1950-01-01")
    latest_valid = pd.Timestamp("2030-12-31")

    # Keep rows where either source column is not NaN; copy so the caller's
    # frame is left untouched when the new columns are added
    has_report = chunk[col_1_filter].notna() | chunk[col_2_filter].notna()
    filtered_chunk = chunk.loc[has_report].copy()

    def _parse_and_validate(column):
        """Return (raw parsed dates, parsed dates with out-of-window values set to NaT)."""
        parsed = pd.to_datetime(filtered_chunk[column], errors="coerce")
        return parsed, parsed.where(parsed.between(earliest_valid, latest_valid))

    raw_1, valid_1 = _parse_and_validate(col_1_filter)
    raw_2, valid_2 = _parse_and_validate(col_2_filter)

    # Prefer a plausible date from either column; only fall back to the raw
    # (out-of-window) date when no plausible one exists
    filtered_chunk[col_combined] = (
        valid_1.combine_first(valid_2).combine_first(raw_1).combine_first(raw_2)
    )

    # Label (not drop) the rows: 1 for a plausible report date, 0 otherwise
    filtered_chunk[Y_col] = (
        filtered_chunk[col_combined]
        .between(earliest_valid, latest_valid)
        .astype(int)
    )

    return filtered_chunk


### Extract the rows of patients diagnosed with vascular dementia, using two date fields

In [3]:
# Number of CSV rows read per iteration
chunk_size = 10000

# Date columns that are only needed to derive the label; dropped before saving
col_1_filter = "Vascular Dementia Date First Reported"
col_2_filter = "Date of vascular dementia report"
col_combined = "Vascular Dementia Report Date"

# Collect the filtered chunks and concatenate once after the loop; calling
# pd.concat on the growing result inside the loop re-copies every earlier row
# on each iteration
diagnosed_chunks = []

for chunk in pd.read_csv(UKBB_PATH, chunksize=chunk_size, low_memory=False):

    # Project helper that selects the columns listed in `datafields`
    # (presumably oldest=True picks the earliest instance of a field -- TODO confirm)
    filtered_chunk = get_columns_from_chunk(chunk, datafields, oldest=True)

    # Keep only the rows that carry a vascular dementia report date
    filtered_chunk = filter_by_vascular_dementia(filtered_chunk)

    diagnosed_chunks.append(filtered_chunk)

# pd.concat raises on an empty list, so fall back to an empty frame
if diagnosed_chunks:
    result_df = pd.concat(diagnosed_chunks, ignore_index=True)
else:
    result_df = pd.DataFrame()

result_df = result_df.drop(columns=[col_1_filter, col_2_filter, col_combined])

# Save the result (the row index is written as the first, unnamed CSV column)
result_df.to_csv('diagnosed.csv')

# Display summary statistics of the extracted rows
result_df.describe()

Unnamed: 0,id,Birth Year,Sex,Education,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,Diabetes Diagnosed By Doctor,...,Platelet distribution width,Red blood cell (erythrocyte) count,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Has Vascular Dementia
count,2186.0,2186.0,2186.0,2137.0,2162.0,2181.0,2159.0,2181.0,681.0,2181.0,...,2037.0,2037.0,2037.0,2005.0,2005.0,2037.0,1983.0,1983.0,1983.0,2186.0
mean,3515518.0,1943.111162,0.582342,-1.001404,28.557839,0.658414,0.670681,3.211371,9.0279,0.204493,...,16.542081,4.489278,13.725449,0.064314,1.435638,7.335621,81.993444,147.228946,70.938477,1.0
std,1442844.0,4.317173,0.493286,4.960943,5.139395,0.769999,0.470075,1.732522,6.190893,0.428779,...,0.531861,0.444468,1.094076,0.035324,0.804452,1.956365,11.199929,21.110899,13.162657,0.0
min,1002012.0,1936.0,0.0,-7.0,15.7576,-3.0,0.0,-3.0,0.0,-3.0,...,15.2,1.72,11.49,0.012,0.263,2.37,47.0,78.0,35.0,1.0
25%,2300567.0,1940.0,0.0,-7.0,25.06235,0.0,0.0,2.0,0.0,0.0,...,16.2,4.2,13.1,0.045,1.02,6.02,74.0,132.0,62.0,1.0
50%,3511698.0,1942.0,1.0,1.0,27.794,1.0,1.0,3.0,11.0,0.0,...,16.5,4.484,13.53,0.06,1.33,7.12,82.0,146.0,70.0,1.0
75%,4741248.0,1945.0,1.0,3.0,31.421725,1.0,1.0,5.0,11.0,0.0,...,16.86,4.773,14.1,0.078,1.74,8.41,89.0,160.0,79.0,1.0
max,6022126.0,1968.0,1.0,6.0,54.5283,2.0,1.0,6.0,22.0,1.0,...,19.4,6.39,31.7,1.077,25.278,34.13,132.0,241.0,169.0,1.0
