In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import one_hot_encode_vascular_problems

In [None]:
# Load the two input tables from CSV files located in the working directory.
# NOTE(review): the file names suggest an imputed "diagnosed" cohort and an
# "undiagnosed" cohort — confirm against the notebooks that produce them.
df_diagnosed = pd.read_csv("diagnosed_imputed.csv")
df_undiagnosed = pd.read_csv("undiagnosed.csv")

In [None]:
# Stack the two frames row-wise. ignore_index=True renumbers the rows from 0,
# and sort=False keeps the columns unsorted if the two column sets differ.
df_merged = pd.concat([df_diagnosed, df_undiagnosed], ignore_index=True, sort=False)

# Project helper imported from ProjectFunctions; presumably expands a
# vascular-problems column into indicator columns — TODO confirm in its source.
df_merged = one_hot_encode_vascular_problems(df_merged)
# NOTE(review): the reason for removing "High Blood Pressure" is not visible
# in this notebook — confirm before relying on it.
df_merged = df_merged.drop(columns=["High Blood Pressure"])

# Persist the merged frame (index omitted) before any correlation-based pruning.
df_merged.to_csv("dataset_merged.csv", index=False)

# Summary statistics of the merged frame; displayed as the cell output.
df_merged.describe()

#### Identify and drop highly correlated features by Pearson correlation

In [None]:
def print_highly_correlated_features(df, threshold=0.8):
    """Print, for every column, the other columns it is strongly correlated with.

    Correlation strength is the absolute Pearson coefficient, so strong
    negative relationships are reported as well as positive ones. A column is
    never listed against itself, but a different column that is perfectly
    correlated with it (|r| == 1, e.g. a duplicate) is listed.

    Args:
        df: DataFrame whose columns are compared pairwise via ``df.corr``.
        threshold: Inclusive lower bound on |r| for a pair to be reported.

    Returns:
        None. The report is written to standard output.
    """
    # Absolute values: a correlation of -0.95 is as redundant as +0.95.
    corr_matrix = df.corr(method="pearson").abs()
    correlated_features = {}

    for col in corr_matrix.columns:
        # Exclude the self-correlation by label rather than by "value < 1",
        # which would also hide genuinely duplicated columns.
        others = corr_matrix[col].drop(labels=col)
        high_corr = others[others >= threshold].index.tolist()
        if high_corr:
            correlated_features[col] = high_corr

    print("Highly correlated features:")
    for feature, related_features in correlated_features.items():
        # str() keeps the join working when column labels are not strings.
        print(f"{feature}: {', '.join(map(str, related_features))}")
"""
def drop_correlated_features(df, threshold=0.8):
    corr_matrix = df.corr(method="pearson").abs()  # Absolute correlation values
    to_drop = set()
    
    for i in range(len(corr_matrix.columns)):
        for j in range(i + 1, len(corr_matrix.columns)):  # Avoid duplicate checks
            feature1 = corr_matrix.columns[i]
            feature2 = corr_matrix.columns[j]
            
            if corr_matrix.iloc[i, j] > threshold:
                to_drop.add(feature2)  # Arbitrarily drop the second feature

    # Drop identified features
    df = df.drop(columns=to_drop, errors="ignore")

    print("Features dropped:")
    for x in to_drop:
        print(x)
        
    return df
"""

def drop_correlated_features(df, threshold=0.8, keep_list=None):
    """Drop one feature from every pair whose absolute Pearson correlation exceeds ``threshold``.

    Every unordered pair of columns in the correlation matrix is inspected
    once. For a pair above the threshold:

    * if exactly one of the two is in ``keep_list``, the other one is dropped;
    * if neither is in ``keep_list``, the later column (in matrix order) is dropped;
    * if both are in ``keep_list``, a ``ValueError`` is raised.

    Pairs are always evaluated on the correlation matrix of the original
    frame, so a feature can be dropped because of its correlation with a
    feature that is itself dropped.

    Args:
        df: DataFrame whose columns are compared pairwise via ``df.corr``.
        threshold: Exclusive lower bound on |r| for a pair to be treated as
            redundant.
        keep_list: Column names that must never be dropped. ``None`` means no
            column is protected.

    Returns:
        A new DataFrame without the dropped columns; ``df`` is not modified.

    Raises:
        ValueError: If two columns in ``keep_list`` are correlated above the
            threshold.
    """
    if keep_list is None:
        keep_list = []

    corr_matrix = df.corr(method="pearson").abs()  # Absolute correlation values
    columns = list(corr_matrix.columns)
    to_drop = set()

    for i, feature1 in enumerate(columns):
        for j in range(i + 1, len(columns)):  # Upper triangle only: each pair once
            if not corr_matrix.iloc[i, j] > threshold:
                continue
            feature2 = columns[j]

            if feature1 in keep_list and feature2 in keep_list:
                raise ValueError(f"Cannot drop either '{feature1}' or '{feature2}' as both are in keep_list.")

            if feature2 in keep_list:
                to_drop.add(feature1)  # feature2 is protected, so feature1 goes
            else:
                # feature1 is protected, or neither is: drop the later column
                to_drop.add(feature2)

    # Materialise in column order so the report and the drop are reproducible
    # across runs (iterating a set of strings is not order-stable).
    dropped = [col for col in columns if col in to_drop]

    # Drop identified features
    df = df.drop(columns=dropped, errors="ignore")

    print("Features dropped:")
    for name in dropped:
        print(name)

    return df

In [None]:
# Pairwise correlation matrix of df_merged using pandas' default method;
# displayed as the cell output.
df_merged.corr()

In [None]:
# One named cut-off shared by the preview and the drop so the two cannot drift apart.
CORRELATION_THRESHOLD = 0.82
# Columns that must survive the correlation-based pruning.
PROTECTED_FEATURES = ["Has Vascular Dementia", "Sex"]

# Preview the correlated pairs, then remove one feature from each pair.
print_highly_correlated_features(df_merged, threshold=CORRELATION_THRESHOLD)
print()
df_merged = drop_correlated_features(
    df_merged,
    threshold=CORRELATION_THRESHOLD,
    keep_list=PROTECTED_FEATURES,
)

In [None]:
# Write the frame left after correlation-based pruning to CSV, omitting the index.
df_merged.to_csv("vascular_dementia_dataset.csv", index=False)

In [None]:
# Summary statistics of the pruned frame; displayed as the cell output.
df_merged.describe()