In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import one_hot_encode_vascular_problems, convert_date_to_binary

In [2]:
# Read the diagnosed and undiagnosed cohorts from the working directory.
df_diagnosed, df_undiagnosed = (
    pd.read_csv(csv_name)
    for csv_name in ("diagnosed_processed.csv", "undiagnosed.csv")
)

In [3]:
# Stack both cohorts, then run the project preprocessing helpers as one chain:
# one-hot encode the vascular problems, remove two columns, and convert the
# date-reported fields to binary indicators.
df_merged = (
    pd.concat([df_diagnosed, df_undiagnosed], ignore_index=True, sort=False)
    .pipe(one_hot_encode_vascular_problems)
    .drop(columns=["High Blood Pressure", "Education"])
    .pipe(convert_date_to_binary, DateReportedFields)
)

# Persist the merged frame before the correlation-based pruning further down.
df_merged.to_csv("dataset_merged.csv", index=False)

# NOTE(review): the recorded describe() output shows count 0 for
# "Stroke Report Date" (all values missing) - confirm this is expected.
df_merged.describe()

Unnamed: 0,Vascular Dementia Report Date,Birth Year,Sex,Primary Hypertension,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,...,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Stroke Report Date,Heart Attack,Angina,Stroke
count,4272.0,4272.0,4272.0,4272.0,4272.0,4248.0,4236.0,4237.0,4261.0,4272.0,...,4154.0,4154.0,4211.0,3946.0,3946.0,3946.0,0.0,4272.0,4272.0,4272.0
mean,0.476592,1943.189841,0.533474,0.376404,0.001404,27.971439,0.639282,0.65046,3.066182,0.148876,...,0.06244,1.390559,7.142636,82.087937,146.593512,70.546376,,0.051498,0.036517,0.036985
std,0.49951,4.902158,0.498937,0.48454,0.037455,4.775084,0.662153,0.476881,1.633808,0.356008,...,0.029997,0.671414,2.031409,10.947536,20.617342,12.581597,,0.221037,0.187594,0.188747
min,0.0,1937.0,0.0,0.0,0.0,15.5256,0.0,0.0,1.0,0.0,...,0.004,0.227,0.98,46.0,78.0,35.0,,0.0,0.0,0.0
25%,0.0,1940.0,0.0,0.0,0.0,24.7721,0.0,0.0,2.0,0.0,...,0.044,1.00025,5.88,75.0,132.0,62.0,,0.0,0.0,0.0
50%,0.0,1942.0,1.0,0.0,0.0,27.2874,1.0,1.0,3.0,0.0,...,0.059,1.3055,6.9,82.0,146.0,69.0,,0.0,0.0,0.0
75%,1.0,1945.0,1.0,1.0,0.0,30.514075,1.0,1.0,4.0,0.0,...,0.076,1.69,8.2,89.0,160.0,78.0,,0.0,0.0,0.0
max,1.0,1968.0,1.0,1.0,1.0,58.2609,2.0,1.0,6.0,1.0,...,1.077,25.278,46.6,132.0,241.0,169.0,,1.0,1.0,1.0


#### Identify and drop highly correlated features by Pearson correlation

In [4]:
def print_highly_correlated_features(df, threshold=0.8):
    """Print, per column, the other columns it is strongly correlated with.

    A column is listed next to ``col`` when the absolute Pearson correlation
    between the two is at least ``threshold``. Absolute values are used so
    that strongly negative pairs are reported as well, which matches the
    ``.abs()`` criterion applied by ``drop_correlated_features`` (that
    function compares with a strict ``>``).

    Args:
        df: DataFrame whose columns are compared via ``DataFrame.corr``.
        threshold: Minimum absolute correlation for a pair to be reported.

    Returns:
        None. The report is written to standard output.
    """
    corr_matrix = df.corr(method="pearson").abs()
    correlated_features = {}

    for col in corr_matrix.columns:
        # Exclude the column itself by label rather than by value, so that a
        # different column with a correlation of exactly 1.0 is still listed.
        others = corr_matrix[col].drop(labels=col)
        # NaN correlations compare False and are therefore never reported.
        high_corr = others[others >= threshold].index.tolist()
        if high_corr:
            correlated_features[col] = high_corr

    print("Highly correlated features:")
    for feature, related_features in correlated_features.items():
        print(f"{feature}: {', '.join(map(str, related_features))}")

def drop_correlated_features(df, threshold=0.8, keep_list=None):
    """Drop one column from every pair of highly correlated columns.

    Every pair of columns in the upper triangle of the absolute Pearson
    correlation matrix is inspected. When the absolute correlation is
    strictly greater than ``threshold``, the later column of the pair is
    marked for removal, unless it is in ``keep_list``, in which case the
    earlier column is marked instead.

    Args:
        df: DataFrame to prune. It is not modified; a new frame is returned.
        threshold: Absolute correlation above which a pair counts as
            redundant.
        keep_list: Column names that must never be dropped. ``None`` means
            no column is protected.

    Returns:
        A DataFrame without the columns marked for removal.

    Raises:
        ValueError: If two columns that are both in ``keep_list`` are
            correlated above ``threshold``.
    """
    if keep_list is None:
        keep_list = []

    corr_matrix = df.corr(method="pearson").abs()
    columns = corr_matrix.columns

    # A dict is used as an insertion-ordered set so that the printed report
    # lists columns in discovery order on every run; iterating a plain set of
    # strings can change order between interpreter sessions.
    to_drop = {}

    # NOTE(review): a column that is already marked for removal can still
    # cause later columns to be marked. This keeps the original behaviour but
    # may remove more columns than strictly necessary - confirm it is intended.
    for i in range(len(columns)):
        feature1 = columns[i]
        for j in range(i + 1, len(columns)):  # upper triangle: each pair once
            # NaN correlations compare False and are skipped.
            if not corr_matrix.iloc[i, j] > threshold:
                continue

            feature2 = columns[j]
            keep_first = feature1 in keep_list
            keep_second = feature2 in keep_list

            if keep_first and keep_second:
                raise ValueError(f"Cannot drop either '{feature1}' or '{feature2}' as both are in keep_list.")

            # The later column is dropped by default; the earlier one goes
            # only when the later column is protected.
            to_drop[feature1 if keep_second else feature2] = None

    # Drop identified features
    df = df.drop(columns=list(to_drop), errors="ignore")

    print("Features dropped:")
    for dropped in to_drop:
        print(dropped)

    return df

#### View the correlation matrix

In [5]:
# Pairwise Pearson correlation of the merged frame (Pearson is the pandas default).
df_merged.corr(method="pearson")

Unnamed: 0,Vascular Dementia Report Date,Birth Year,Sex,Primary Hypertension,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,...,Reticulocyte count,Reticulocyte percentage,White blood cell (leukocyte) count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Stroke Report Date,Heart Attack,Angina,Stroke
Vascular Dementia Report Date,1.000000,-0.007794,0.112592,0.814185,0.039302,0.111602,0.083425,0.039965,0.076125,0.438292,...,0.060557,0.064933,0.092700,-0.013882,0.031184,0.026648,,0.244188,0.204019,0.205373
Birth Year,-0.007794,1.000000,0.000321,-0.018262,0.022776,0.008159,0.029417,0.011789,0.002631,0.005267,...,0.040167,0.027388,0.026354,0.035663,-0.167109,0.002264,,0.006317,-0.010341,0.001267
Sex,0.112592,0.000321,1.000000,0.128009,0.035071,0.055768,0.176323,0.183929,-0.203480,0.111661,...,0.087165,0.012247,0.033561,0.069838,0.026309,-0.067561,,0.132979,0.031965,0.061438
Primary Hypertension,0.814185,-0.018262,0.128009,1.000000,0.048271,0.180772,0.077462,0.043369,0.074648,0.428378,...,0.093311,0.095332,0.100546,0.037692,0.100333,0.040342,,0.256194,0.227399,0.206162
Secondary Hypertension,0.039302,0.022776,0.035071,0.048271,1.000000,0.024065,0.039481,0.014443,0.017634,0.019433,...,0.022267,0.027567,0.008847,0.003252,0.011496,-0.034783,,0.076105,-0.007301,-0.007350
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pulse Rate at Blood Pressure,0.026648,0.002264,-0.067561,0.040342,-0.034783,0.131915,0.011169,-0.009079,0.081383,0.024601,...,0.082029,0.074760,0.146573,0.274007,0.054170,1.000000,,-0.062652,-0.027094,0.032556
Stroke Report Date,,,,,,,,,,,...,,,,,,,,,,
Heart Attack,0.244188,0.006317,0.132979,0.256194,0.076105,0.069160,0.083333,0.054220,0.065392,0.173308,...,0.047636,0.046246,0.074211,-0.099366,-0.080529,-0.062652,,1.000000,-0.045363,-0.045664
Angina,0.204019,-0.010341,0.031965,0.227399,-0.007301,0.074834,0.023909,0.012772,0.034939,0.118410,...,0.046694,0.047743,0.053661,-0.058551,-0.026173,-0.027094,,-0.045363,1.000000,-0.038152


#### Print highly correlated features and drop those not protected by the keep_list

In [6]:
# Report the correlated pairs first, then prune them from the merged frame
# while protecting the two columns named in keep_list.
print_highly_correlated_features(df_merged, threshold=0.8)
print()
# NOTE(review): the recorded output lists "Primary Hypertension" as correlated
# with "Vascular Dementia Report Date"; because the latter is in keep_list,
# "Primary Hypertension" is the column that gets dropped - confirm that
# removing a feature for correlating with a protected column is intended.
df_merged = drop_correlated_features(
    df_merged,
    threshold=0.8,
    keep_list=["Vascular Dementia Report Date", "Sex"],
)

Highly correlated features:
Vascular Dementia Report Date: Primary Hypertension
Sex: Testosterone
Primary Hypertension: Vascular Dementia Report Date
Apolipoprotein A: HDL cholesterol
Apolipoprotein B: Cholesterol, LDL direct
Cholesterol: Apolipoprotein B, LDL direct
Direct bilirubin: Total bilirubin
HDL cholesterol: Apolipoprotein A
LDL direct: Apolipoprotein B, Cholesterol
Testosterone: Sex
Total bilirubin: Direct bilirubin
Basophil count: Basophil percentage
Basophil percentage: Basophil count
Eosinophil count: Eosinophil percentage
Eosinophil percentage: Eosinophil count
Haematocrit percentage: Haemoglobin concentration, Red blood cell (erythrocyte) count
Haemoglobin concentration: Haematocrit percentage
High light scatter reticulocyte count: High light scatter reticulocyte percentage
High light scatter reticulocyte percentage: High light scatter reticulocyte count
Mean corpuscular haemoglobin: Mean corpuscular volume
Mean corpuscular volume: Mean corpuscular haemoglobin
Neutrophil

#### Save the new dataframe

In [7]:
# Write the pruned frame to disk (without the index column), then summarise it.
df_merged.to_csv(path_or_buf="vascular_dementia_dataset.csv", index=False)
df_merged.describe()

Unnamed: 0,Vascular Dementia Report Date,Birth Year,Sex,Secondary Hypertension,BMI Impedance,Smoking Status,Ever Smoked,Alcohol Intake Frequency,Report of stroke,Diabetes Diagnosed By Doctor,...,Platelet distribution width,Red blood cell (erythrocyte) distribution width,Reticulocyte count,Blood Pressure Diastolic,Blood Pressure Systolic,Pulse Rate at Blood Pressure,Stroke Report Date,Heart Attack,Angina,Stroke
count,4272.0,4272.0,4272.0,4272.0,4248.0,4236.0,4237.0,4261.0,4272.0,4253.0,...,4211.0,4211.0,4154.0,3946.0,3946.0,3946.0,0.0,4272.0,4272.0,4272.0
mean,0.476592,1943.189841,0.533474,0.001404,27.971439,0.639282,0.65046,3.066182,0.148876,0.141547,...,16.533562,13.630693,0.06244,82.087937,146.593512,70.546376,,0.051498,0.036517,0.036985
std,0.49951,4.902158,0.498937,0.037455,4.775084,0.662153,0.476881,1.633808,0.356008,0.348626,...,0.531051,1.027976,0.029997,10.947536,20.617342,12.581597,,0.221037,0.187594,0.188747
min,0.0,1937.0,0.0,0.0,15.5256,0.0,0.0,1.0,0.0,0.0,...,15.2,11.49,0.004,46.0,78.0,35.0,,0.0,0.0,0.0
25%,0.0,1940.0,0.0,0.0,24.7721,0.0,0.0,2.0,0.0,0.0,...,16.17,13.0,0.044,75.0,132.0,62.0,,0.0,0.0,0.0
50%,0.0,1942.0,1.0,0.0,27.2874,1.0,1.0,3.0,0.0,0.0,...,16.5,13.46,0.059,82.0,146.0,69.0,,0.0,0.0,0.0
75%,1.0,1945.0,1.0,0.0,30.514075,1.0,1.0,4.0,0.0,0.0,...,16.83,14.01,0.076,89.0,160.0,78.0,,0.0,0.0,0.0
max,1.0,1968.0,1.0,1.0,58.2609,2.0,1.0,6.0,1.0,1.0,...,19.4,31.7,1.077,132.0,241.0,169.0,,1.0,1.0,1.0
