### Extract the data of patients diagnosed with vascular dementia

In [2]:
UKBB_PATH = "~/biobank/ukb672220.csv"

from DataFields import DataFields

datafields = DataFields


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Print all columns to view available columns
columns = pd.read_csv(UKBB_PATH, nrows=0).columns.tolist()
print(columns)

['eid', '3-0.0', '3-1.0', '3-2.0', '3-3.0', '4-0.0', '4-1.0', '4-2.0', '4-3.0', '5-0.0', '5-1.0', '5-2.0', '5-3.0', '6-0.0', '6-1.0', '6-2.0', '6-3.0', '19-0.0', '21-0.0', '21-1.0', '21-2.0', '21-3.0', '31-0.0', '34-0.0', '36-0.0', '36-1.0', '36-2.0', '36-3.0', '37-0.0', '37-1.0', '37-2.0', '37-3.0', '38-0.0', '38-1.0', '38-2.0', '38-3.0', '39-0.0', '39-1.0', '39-2.0', '39-3.0', '40-0.0', '40-1.0', '40-2.0', '40-3.0', '41-0.0', '41-1.0', '41-2.0', '41-3.0', '42-0.0', '42-1.0', '42-2.0', '42-3.0', '43-0.0', '43-1.0', '43-2.0', '43-3.0', '44-0.0', '44-1.0', '44-2.0', '44-3.0', '45-0.0', '45-1.0', '45-2.0', '45-3.0', '46-0.0', '46-1.0', '46-2.0', '46-3.0', '47-0.0', '47-1.0', '47-2.0', '47-3.0', '48-0.0', '48-1.0', '48-2.0', '48-3.0', '49-0.0', '49-1.0', '49-2.0', '49-3.0', '50-0.0', '50-1.0', '50-2.0', '50-3.0', '51-0.0', '51-1.0', '51-2.0', '51-3.0', '52-0.0', '53-0.0', '53-1.0', '53-2.0', '53-3.0', '54-0.0', '54-1.0', '54-2.0', '54-3.0', '55-0.0', '55-1.0', '55-2.0', '55-3.0', '77-0.0'

In [5]:

# Read the chunk and get columns from the datafields dictionary
# if Oldest=True, take the oldest instance, otherwise newest
def get_columns_from_chunk(chunk, datafields, oldest=False):
    selected_columns = {}
    for field_name, instances in datafields.items():
        instance_key = min(instances) if oldest else max(instances)
        selected_columns[field_name] = instances[instance_key]
    
    # Select only the necessary columns from the chunk
    filtered_chunk = chunk[list(selected_columns.values())].rename(columns={
        v: k for k, v in selected_columns.items()
    })
    
    return filtered_chunk
    

In [6]:
def filter_by_vascular_dementia(chunk):
    col_1_filter = "Vascular Dementia Date First Reported"
    col_2_filter = "Date of vascular dementia report"
    col_combined = "Vascular Dementia Report Date"
    Y_col = "Has Vascular Dementia"
    
    # Filter rows where either col_1_filter or col_2_filter is not NaN
    filtered_chunk = chunk[chunk[col_1_filter].notna() | chunk[col_2_filter].notna()].copy()  # Make a copy here
    
    # Combine dates and add the new columns
    filtered_chunk.loc[:, col_combined] = pd.to_datetime(
        filtered_chunk[col_1_filter].combine_first(filtered_chunk[col_2_filter]), errors="coerce"
    )
    
    # Filter by dates to remove unspecified entries
    filtered_chunk.loc[:, Y_col] = (
        filtered_chunk[col_combined]
        .between(pd.Timestamp("1950-01-01"), pd.Timestamp("2030-12-31"))
        .astype(int)
    )

    return filtered_chunk



In [7]:

# from the given "fields" list, convert all columns where date is in range, to 0 or 1 instead of a date.
def convert_date_to_binary(chunk, fields):
    start_date = pd.Timestamp("1950-01-01")
    end_date = pd.Timestamp("2030-01-01")
    
    for col in fields:
        df[col] = pd.to_datetime(df[col], errors='coerce')

        mask = (df[col] >= start_date) & (df[col] <= end_date)

        df[col] = np.where(mask, 1, 0)
    
    return df

### Extract rows diagnosed with vascular dementia through the use of two fields

In [6]:

chunk_size = 10000

# Create an empty DataFrame to hold the filtered data
result_df = pd.DataFrame()

for chunk in pd.read_csv(UKBB_PATH, chunksize=chunk_size, low_memory=False):

    filtered_chunk = get_columns_from_chunk(chunk, datafields, oldest=True)

    filtered_chunk = filter_by_vascular_dementia(filtered_chunk)
    
    # Append to the result DataFrame
    result_df = pd.concat([result_df, filtered_chunk], ignore_index=True)

# Save the result
result_df.to_csv('vascular_dementia_filtered.csv')

# Display the first 30 rows
result_df.head(5)

Unnamed: 0,id,Birth Year,Sex,Ethnicity,Education,Primary Hypertension,Secondary Hypertension,Vascular Dementia Date First Reported,Date of vascular dementia report,BMI Impedance,...,Red blood cell (erythrocyte) count,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Vascular Dementia Report Date,Has Vascular Dementia
0,1002012,1940.0,1,1001.0,1.0,2009-08-13,,2019-09-06,2019-09-06,27.0801,...,4.799,13.07,0.05,1.047,7.79,75.0,182.0,62.0,2019-09-06,1
1,1002324,1938.0,1,1001.0,-7.0,2000-10-01,,2022-07-01,,33.7984,...,4.08,15.8,0.134,3.28,3.6,84.0,146.0,73.0,2022-07-01,1
2,1005163,1941.0,0,1001.0,1.0,,,2022-09-01,,30.388,...,4.33,15.0,0.058,1.33,6.0,94.0,143.0,87.0,2022-09-01,1
3,1009462,1938.0,1,1001.0,-7.0,2004-01-01,,2021-02-04,2021-02-04,32.2385,...,4.8,15.5,0.084,1.75,8.2,77.0,141.0,79.0,2021-02-04,1
4,1010028,1947.0,0,1001.0,-7.0,,,2019-04-11,2019-04-11,32.0399,...,4.44,13.4,0.042,0.95,7.1,71.0,131.0,55.0,2019-04-11,1


### Post process dataframe by converting data into common data types

In [8]:
### Save dataframe as csv, to manually observe the data
df = pd.read_csv("vascular_dementia_filtered.csv")

# convert field which use write date a disease is reported and mark it as true or false
# to determine if a person was diagnosed with the diseas or not rather than when.
DateReportedFields = [
    "Primary Hypertension",
    "Secondary Hypertension",
    "Report of stroke",
    "Stress Reported",
    "Seropositive Rheumatoid Arthritis",
    "Other Rheumatoid Arthritis",
    "Juvenile Arthritis",
    "Other Arthritis",
    "Psoriatic and enteropathic arthropathies",
    "Multiple Sclerosis",
    "Crohn's disease",
    "Ulcerative Colitis",
    "Thyrotoxicosis (Grave's disease)",
    "Sjogren Disease (M35)",
    "Myasthenia gravis",
    "Diagnosed with Coeliac disease",
    "B12 deficiency anaemia"
]

df = convert_date_to_binary(df, DateReportedFields)



#### Measure the distribution of age, education, and ethnicity of patients diagnosed with vascular dementia (or all columns)

In [9]:
df.describe()

Unnamed: 0.1,Unnamed: 0,id,Birth Year,Sex,Ethnicity,Education,Primary Hypertension,Secondary Hypertension,BMI Impedance,Smoking Status,...,Platelet distribution width,Red blood cell (erythrocyte) count,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Has Vascular Dementia
count,2186.0,2186.0,2186.0,2186.0,2181.0,2137.0,2186.0,2186.0,2162.0,2181.0,...,2037.0,2037.0,2037.0,2005.0,2005.0,2037.0,1983.0,1983.0,1983.0,2186.0
mean,1092.5,3515518.0,1943.111162,0.582342,1072.19624,-1.001404,0.789113,0.002745,28.557839,0.658414,...,16.542081,4.489278,13.725449,0.064314,1.435638,7.335621,81.993444,147.228946,70.938477,1.0
std,631.188165,1442844.0,4.317173,0.493286,468.249479,4.960943,0.408032,0.05233,5.139395,0.769999,...,0.531861,0.444468,1.094076,0.035324,0.804452,1.956365,11.199929,21.110899,13.162657,0.0
min,0.0,1002012.0,1936.0,0.0,-3.0,-7.0,0.0,0.0,15.7576,-3.0,...,15.2,1.72,11.49,0.012,0.263,2.37,47.0,78.0,35.0,1.0
25%,546.25,2300567.0,1940.0,0.0,1001.0,-7.0,1.0,0.0,25.06235,0.0,...,16.2,4.2,13.1,0.045,1.02,6.02,74.0,132.0,62.0,1.0
50%,1092.5,3511698.0,1942.0,1.0,1001.0,1.0,1.0,0.0,27.794,1.0,...,16.5,4.484,13.53,0.06,1.33,7.12,82.0,146.0,70.0,1.0
75%,1638.75,4741248.0,1945.0,1.0,1001.0,3.0,1.0,0.0,31.421725,1.0,...,16.86,4.773,14.1,0.078,1.74,8.41,89.0,160.0,79.0,1.0
max,2185.0,6022126.0,1968.0,1.0,4003.0,6.0,1.0,1.0,54.5283,2.0,...,19.4,6.39,31.7,1.077,25.278,34.13,132.0,241.0,169.0,1.0


#### Extract uniform distribution of patients who adher to the distribution of patients diagnosed with vascular dementia but are not diagnosed with it

### Save the extracted dataset

In [None]:
df.to_csv('vascular_dementia_filtered_2.csv')

df.head(10)