In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from DataFields import DataFields
from DataFields import DateReportedFields

from ProjectFunctions import convert_date_to_binary, print_highly_correlated_features

In [2]:
df_diagnosed = pd.read_csv("diagnosed_processed.csv")
df_undiagnosed = pd.read_csv("undiagnosed.csv")
df = pd.concat([df_diagnosed, df_undiagnosed], ignore_index=True, sort=False)
df_dates = df[DateReportedFields]

In [3]:
def convert_time_distance(df, target_feature: str) -> pd.DataFrame:
    input_df = df.copy()  # Create a copy of the dataframe

    target_dates = pd.to_datetime(input_df[target_feature], errors='coerce')

    input_df[target_feature] = target_dates.notna().map({True: 0.0, False: pd.NA})

    for feature in input_df.columns:
        if feature == target_feature:
            continue
        feature_dates = pd.to_datetime(input_df[feature], errors='coerce')
        input_df[feature] = (feature_dates - target_dates).dt.days / 365.25
        
    return input_df

relation_df = convert_time_distance(df_dates, "Vascular Dementia Report Date")

relation_df = relation_df.apply(pd.to_numeric, errors='coerce')


In [4]:
relation_df.mean()

Vascular Dementia Report Date                0.000000
Primary Hypertension                       -15.040995
Secondary Hypertension                      -7.028063
Report of stroke                           -48.085554
Seropositive Rheumatoid Arthritis           -5.335797
Other Rheumatoid Arthritis                 -10.815762
Juvenile Arthritis                          -5.927447
Other Arthritis                             -7.049840
Psoriatic and enteropathic arthropathies    -6.587269
Multiple Sclerosis                         -23.122519
Crohn's disease                            -16.413611
Ulcerative Colitis                         -21.928013
Thyrotoxicosis (Grave's disease)           -11.767802
Sjogren Disease (M35)                       -4.144950
Myasthenia gravis                           -8.312115
Diagnosed with Coeliac disease             -50.010152
B12 deficiency anaemia                      -9.251716
dtype: float64

In [5]:
df.describe()

Unnamed: 0,Birth Year,Sex,Education,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,Diabetes Diagnosed By Doctor,Report of vascular problems,...,Platelet distribution width,Red blood cell (erythrocyte) count,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Stroke Report Date
count,4272.0,4272.0,1987.0,4248.0,4236.0,4237.0,4261.0,636.0,4253.0,2020.0,...,4211.0,4211.0,4211.0,4154.0,4154.0,4211.0,3946.0,3946.0,3946.0,0.0
mean,1943.189841,0.533474,1.187217,27.971439,0.639282,0.65046,3.066182,9.050314,0.141547,1.927723,...,16.533562,4.493688,13.630693,0.06244,1.390559,7.142636,82.087937,146.593512,70.546376,
std,4.902158,0.498937,1.130542,4.775084,0.662153,0.476881,1.633808,6.126795,0.348626,1.768057,...,0.531051,0.43113,1.027976,0.029997,0.671414,2.031409,10.947536,20.617342,12.581597,
min,1937.0,0.0,0.0,15.5256,0.0,0.0,1.0,0.0,0.0,0.0,...,15.2,0.634,11.49,0.004,0.227,0.98,46.0,78.0,35.0,
25%,1940.0,0.0,0.0,24.7721,0.0,0.0,2.0,0.0,0.0,0.0,...,16.17,4.215,13.0,0.044,1.00025,5.88,75.0,132.0,62.0,
50%,1942.0,1.0,1.0,27.2874,1.0,1.0,3.0,11.0,0.0,2.0,...,16.5,4.484,13.46,0.059,1.3055,6.9,82.0,146.0,69.0,
75%,1945.0,1.0,2.0,30.514075,1.0,1.0,4.0,11.0,0.0,4.0,...,16.83,4.7645,14.01,0.076,1.69,8.2,89.0,160.0,78.0,
max,1968.0,1.0,3.0,58.2609,2.0,1.0,6.0,22.0,1.0,4.0,...,19.4,6.47,31.7,1.077,25.278,46.6,132.0,241.0,169.0,


In [6]:
df_binary = convert_date_to_binary(df_dates)
df_correlation = pd.concat([df_binary, df[["Ever Smoked", "Education"]]], ignore_index=True, sort=False)

print_highly_correlated_features(df_binary, features=["Vascular Dementia Report Date"], threshold = 0.5)

Highly correlated features:
Vascular Dementia Report Date: Primary Hypertension
