In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import one_hot_encode_vascular_problems

In [2]:
# Load the two source tables (diagnosed and undiagnosed) from the working directory.
df_diagnosed, df_undiagnosed = (
    pd.read_csv(csv_name)
    for csv_name in ("diagnosed_processed.csv", "undiagnosed.csv")
)

In [3]:
# Stack both tables into one frame, one-hot encode the vascular-problem field,
# then remove the "High Blood Pressure" column.
df_merged = (
    pd.concat([df_diagnosed, df_undiagnosed], ignore_index=True, sort=False)
    .pipe(one_hot_encode_vascular_problems)
    .drop(columns=["High Blood Pressure"])
)

# Persist the merged frame before any correlation-based pruning.
df_merged.to_csv("dataset_merged.csv", index=False)

df_merged.describe()

Unnamed: 0,Birth Year,Sex,Education,Primary Hypertension,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,...,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Has Vascular Dementia,Heart Attack,Angina,Stroke
count,4061.0,4061.0,4013.0,4061.0,4061.0,4050.0,4040.0,4040.0,4055.0,4061.0,...,4032.0,4032.0,4061.0,3908.0,3908.0,3908.0,4061.0,4061.0,4061.0,4061.0
mean,1943.607732,0.616843,1.62472,0.376754,0.001477,28.501014,0.646782,0.663366,2.928237,0.204383,...,0.064743,1.428733,7.194437,83.566786,148.917093,70.265097,0.477469,0.09702,0.081015,0.0623
std,4.804711,0.486216,1.061756,0.484632,0.038414,4.798574,0.648948,0.472617,1.643792,0.4033,...,0.030208,0.676759,1.952232,11.031026,20.269234,12.952724,0.499554,0.296022,0.272891,0.241729
min,1937.0,0.0,0.0,0.0,0.0,15.7576,0.0,0.0,1.0,0.0,...,0.01,0.21,2.37,47.0,78.0,35.0,0.0,0.0,0.0,0.0
25%,1940.0,0.0,1.0,0.0,0.0,25.283675,0.0,0.0,2.0,0.0,...,0.046,1.04,5.9,76.0,135.0,61.0,0.0,0.0,0.0,0.0
50%,1942.0,1.0,2.0,0.0,0.0,27.7878,1.0,1.0,3.0,0.0,...,0.061,1.344,7.0,83.0,148.0,69.0,0.0,0.0,0.0,0.0
75%,1946.0,1.0,3.0,1.0,0.0,31.0949,1.0,1.0,4.0,0.0,...,0.079,1.73,8.27,90.0,161.0,78.0,1.0,0.0,0.0,0.0
max,1965.0,1.0,3.0,1.0,1.0,59.1883,2.0,1.0,6.0,1.0,...,1.077,25.278,34.13,133.0,241.0,169.0,1.0,1.0,1.0,1.0


#### Identify and drop highly correlated features by Pearson correlation

In [4]:
def print_highly_correlated_features(df, threshold=0.8):
    """Print, for every column of ``df``, the other columns strongly correlated with it.

    Correlation is measured as the absolute Pearson coefficient, so strong
    negative relationships are reported as well as positive ones. A column is
    never listed as correlated with itself, but a *different* column that is
    perfectly correlated with it (for example an exact duplicate) is listed.

    Args:
        df: DataFrame whose columns are compared pairwise via ``df.corr``.
        threshold: Minimum absolute correlation (inclusive) for a pair to be
            reported.

    Returns:
        None. The report is written to standard output: a header line followed
        by one ``feature: partner1, partner2`` line per column that has at
        least one partner.
    """
    # Absolute values: a coefficient of -0.95 is as redundant as +0.95.
    corr_matrix = df.corr(method="pearson").abs()
    correlated_features = {}

    for col in corr_matrix.columns:
        # Remove the column's own entry by label instead of filtering on
        # "< 1", which would also hide other perfectly correlated columns.
        partners = corr_matrix[col].drop(labels=col)
        high_corr = partners[partners >= threshold].index.tolist()
        if high_corr:
            correlated_features[col] = high_corr

    print("Highly correlated features:")
    for feature, related_features in correlated_features.items():
        # str() guards against non-string column labels, which join() rejects.
        print(f"{feature}: {', '.join(map(str, related_features))}")

def drop_correlated_features(df, threshold=0.8, keep_list=None):
    """Return a copy of ``df`` without one feature from each highly correlated pair.

    Every pair of columns whose absolute Pearson correlation is strictly
    greater than ``threshold`` causes one of the two to be dropped:

    * if exactly one of them is in ``keep_list``, the other one is dropped;
    * if neither is in ``keep_list``, the later column (in ``df.corr()``
      column order) is dropped.

    Note that a column already marked for dropping still takes part in later
    comparisons, so it can cause further columns to be dropped.

    Args:
        df: Input DataFrame. It is not modified.
        threshold: Absolute correlation above which a pair counts as
            redundant (exclusive bound).
        keep_list: Column names that must never be dropped. ``None`` means
            no column is protected.

    Returns:
        A new DataFrame with the selected columns removed. The names of the
        dropped columns are also printed, in the order they were first
        selected.

    Raises:
        ValueError: If two columns that are both in ``keep_list`` are
            correlated above ``threshold``.
    """
    if keep_list is None:
        keep_list = []

    corr_matrix = df.corr(method="pearson").abs()
    columns = list(corr_matrix.columns)
    corr_values = corr_matrix.to_numpy()

    # A dict is used as an insertion-ordered set so the printed report is
    # reproducible between runs (iteration order of a set of strings is not).
    to_drop = {}

    for i, feature1 in enumerate(columns):
        for j in range(i + 1, len(columns)):  # Upper triangle only: each pair once
            # NaN correlations compare False here and are therefore skipped.
            if not corr_values[i, j] > threshold:
                continue

            feature2 = columns[j]
            keep_first = feature1 in keep_list
            keep_second = feature2 in keep_list

            if keep_first and keep_second:
                raise ValueError(f"Cannot drop either '{feature1}' or '{feature2}' as both are in keep_list.")

            # Only sacrifice the earlier column when the later one is protected.
            if keep_second and not keep_first:
                to_drop[feature1] = None
            else:
                to_drop[feature2] = None

    dropped = list(to_drop)

    # Drop identified features
    df = df.drop(columns=dropped, errors="ignore")

    print("Features dropped:")
    for name in dropped:
        print(name)

    return df

#### View the correlation matrix

In [5]:
# Pairwise Pearson correlation between the columns of the merged frame.
df_merged.corr(method="pearson")

Unnamed: 0,Birth Year,Sex,Education,Primary Hypertension,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,...,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Has Vascular Dementia,Heart Attack,Angina,Stroke
Birth Year,1.000000,-0.004045,0.078235,-0.082066,0.020489,0.077148,-0.025002,-0.045274,0.024659,-0.022297,...,0.057534,0.042939,0.002602,0.143559,-0.095790,0.053663,-0.087266,-0.026053,-0.060290,0.006202
Sex,-0.004045,1.000000,0.087688,-0.002895,0.030317,0.007913,0.154070,0.164799,-0.207868,0.055293,...,0.070311,-0.002936,0.059512,0.051939,-0.005424,-0.063625,-0.049749,0.143685,0.048373,0.058549
Education,0.078235,0.087688,1.000000,-0.320302,-0.034921,-0.080098,-0.104502,-0.023232,-0.171952,-0.105863,...,-0.048815,-0.062154,-0.105874,0.076114,0.007772,-0.052180,-0.377758,-0.058720,-0.000890,-0.011243
Primary Hypertension,-0.082066,-0.002895,-0.320302,1.000000,0.049474,0.093735,0.075458,0.027771,0.131790,0.268789,...,0.039184,0.056739,0.087217,-0.056948,0.005088,0.051335,0.813361,0.071351,0.026163,0.077121
Secondary Hypertension,0.020489,0.030317,-0.034921,0.049474,1.000000,0.020275,0.040806,0.013871,0.021198,0.012301,...,0.019501,0.025583,0.008353,-0.002015,0.007255,-0.033099,0.040241,0.052371,-0.011421,-0.009915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pulse Rate at Blood Pressure,0.053663,-0.063625,-0.052180,0.051335,-0.033099,0.114496,0.013572,-0.001458,0.061431,0.013793,...,0.072770,0.064435,0.170822,0.266565,0.032546,1.000000,0.041072,-0.130838,-0.073007,0.038964
Has Vascular Dementia,-0.087266,-0.049749,-0.377758,0.813361,0.040241,0.005247,0.079808,0.017479,0.149821,0.253923,...,-0.007446,0.016773,0.074923,-0.127341,-0.082575,0.041072,1.000000,0.034773,-0.021839,0.057520
Heart Attack,-0.026053,0.143685,-0.058720,0.071351,0.052371,0.055188,0.077920,0.033552,0.056906,0.069058,...,0.050401,0.049355,0.071531,-0.162385,-0.146604,-0.130838,0.034773,1.000000,-0.097324,-0.084490
Angina,-0.060290,0.048373,-0.000890,0.026163,-0.011421,0.046216,0.011892,0.027039,0.041550,0.017362,...,0.017690,0.016119,0.024758,-0.133711,-0.071264,-0.073007,-0.021839,-0.097324,1.000000,-0.076531


#### Print highly correlated features and drop those not protected by the keep_list

In [6]:
# Report the correlated groups first, then prune them from the frame.
# "Has Vascular Dementia" and "Sex" are passed as keep_list so they are never dropped.
print_highly_correlated_features(df_merged, threshold=0.8)
print()
df_merged = drop_correlated_features(
    df_merged,
    threshold=0.8,
    keep_list=["Has Vascular Dementia", "Sex"],
)

Highly correlated features:
Sex: Testosterone
Primary Hypertension: Has Vascular Dementia
Apolipoprotein A: HDL cholesterol
Apolipoprotein B: Cholesterol, LDL direct
Cholesterol: Apolipoprotein B, LDL direct
Creatinine: Cystatin C
Cystatin C: Creatinine
Direct bilirubin: Total bilirubin
HDL cholesterol: Apolipoprotein A
LDL direct: Apolipoprotein B, Cholesterol
Testosterone: Sex
Total bilirubin: Direct bilirubin
Basophil count: Basophil percentage
Basophil percentage: Basophil count
Eosinophil count: Eosinophil percentage
Eosinophil percentage: Eosinophil count
Haematocrit percentage: Haemoglobin concentration, Red blood cell (erythrocyte) count
Haemoglobin concentration: Haematocrit percentage
High light scatter reticulocyte count: High light scatter reticulocyte percentage
High light scatter reticulocyte percentage: High light scatter reticulocyte count
Mean corpuscular haemoglobin: Mean corpuscular volume
Mean corpuscular volume: Mean corpuscular haemoglobin
Neutrophil count: White 

#### Save the new dataframe

In [7]:
# Write the pruned frame to disk without the index column.
df_merged.to_csv(path_or_buf="vascular_dementia_dataset.csv", index=False)

# Summary statistics of the remaining columns.
df_merged.describe()

Unnamed: 0,Birth Year,Sex,Education,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,Diabetes Diagnosed By Doctor,...,Platelet distribution width,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Has Vascular Dementia,Heart Attack,Angina,Stroke
count,4061.0,4061.0,4013.0,4061.0,4050.0,4040.0,4040.0,4055.0,4061.0,4051.0,...,4061.0,4061.0,4032.0,3908.0,3908.0,3908.0,4061.0,4061.0,4061.0,4061.0
mean,1943.607732,0.616843,1.62472,0.001477,28.501014,0.646782,0.663366,2.928237,0.204383,0.158973,...,16.532465,13.612258,0.064743,83.566786,148.917093,70.265097,0.477469,0.09702,0.081015,0.0623
std,4.804711,0.486216,1.061756,0.038414,4.798574,0.648948,0.472617,1.643792,0.4033,0.365696,...,0.528219,1.066313,0.030208,11.031026,20.269234,12.952724,0.499554,0.296022,0.272891,0.241729
min,1937.0,0.0,0.0,0.0,15.7576,0.0,0.0,1.0,0.0,0.0,...,13.71,11.49,0.01,47.0,78.0,35.0,0.0,0.0,0.0,0.0
25%,1940.0,0.0,1.0,0.0,25.283675,0.0,0.0,2.0,0.0,0.0,...,16.18,13.0,0.046,76.0,135.0,61.0,0.0,0.0,0.0,0.0
50%,1942.0,1.0,2.0,0.0,27.7878,1.0,1.0,3.0,0.0,0.0,...,16.49,13.43,0.061,83.0,148.0,69.0,0.0,0.0,0.0,0.0
75%,1946.0,1.0,3.0,0.0,31.0949,1.0,1.0,4.0,0.0,0.0,...,16.83,14.0,0.079,90.0,161.0,78.0,1.0,0.0,0.0,0.0
max,1965.0,1.0,3.0,1.0,59.1883,2.0,1.0,6.0,1.0,1.0,...,19.4,37.19,1.077,133.0,241.0,169.0,1.0,1.0,1.0,1.0
