In [None]:
from typing import List

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from DataFields import DataFields
from DataFields import DateReportedFields



In [None]:
# Read the two cohort extracts (diagnosed and undiagnosed) from the working directory.
df_diagnosed = pd.read_csv(filepath_or_buffer="diagnosed_processed.csv")
df_undiagnosed = pd.read_csv(filepath_or_buffer="undiagnosed.csv")

In [None]:
# Stack both cohorts into a single frame with a renumbered index, then keep
# only the columns named in DateReportedFields.
df = pd.concat(
    objs=[df_diagnosed, df_undiagnosed],
    ignore_index=True,
    sort=False,
)[DateReportedFields]

In [None]:
# From the given "fields" list, convert every listed date column to 0 or 1 instead of a date.
# A parseable date inside the accepted range implies the person was diagnosed with that condition.
def convert_date_to_binary(df: pd.DataFrame, fields: List[str]) -> pd.DataFrame:
    """Replace date columns with binary "has a valid date" indicators.

    Args:
        df: Frame holding the date columns. It is modified in place.
        fields: Column names to convert. Names that are not present in
            ``df`` are skipped silently.

    Returns:
        The same ``df`` object. Each converted column holds 1 where the value
        parsed to a date between 1950-01-01 and 2030-01-01 (both inclusive)
        and 0 otherwise (missing, unparseable, or out of range).

    Note:
        Do not apply this twice to the same column: the 0/1 integers produced
        by the first pass are re-parsed by ``pd.to_datetime`` as epoch offsets
        close to 1970-01-01, which lie inside the accepted range, so every
        row would become 1.
    """
    start_date = pd.Timestamp("1950-01-01")
    end_date = pd.Timestamp("2030-01-01")

    for col in fields:
        if col not in df.columns:
            continue
        # errors='coerce' turns unparseable entries into NaT; NaT fails both
        # comparisons below and therefore maps to 0.
        parsed = pd.to_datetime(df[col], errors='coerce')
        in_range = (parsed >= start_date) & (parsed <= end_date)
        df[col] = np.where(in_range, 1, 0)
    return df


In [None]:
# Converts every date column into its distance, in fractional years, from the target_feature date.
def convert_time_distance(df, target_feature: str) -> pd.DataFrame:
    """Express each date column as years elapsed up to the target date.

    For every column other than ``target_feature`` the stored value becomes
    ``(target date - column date)`` in whole days divided by 365.25, so a
    positive number means the column's date is earlier than the target date.

    Args:
        df: Frame whose columns all hold dates (or values parseable as dates).
            It is modified in place.
        target_feature: Name of the column holding the reference date.

    Returns:
        The same ``df`` object. Non-target columns hold float year distances,
        NaN where either the column's date or the row's target date is missing
        or unparseable. ``target_feature`` is overwritten with 0 in every row,
        including rows whose target date was missing.

    Raises:
        KeyError: If ``target_feature`` is not a column of ``df``.
    """
    # Parse the reference dates once, before any column is overwritten. The
    # target column itself must not go through the loop: replacing it with
    # float deltas would make later columns be measured against values
    # re-parsed as timestamps near 1970 instead of the real target dates.
    target_dates = pd.to_datetime(df[target_feature], errors='coerce')

    for feature in df.columns:
        if feature == target_feature:
            continue
        feature_dates = pd.to_datetime(df[feature], errors='coerce')
        df[feature] = (target_dates - feature_dates).dt.days / 365.25

    # Distance of the target to itself.
    df[target_feature] = 0
    return df

# Measure every report date relative to the vascular dementia report date.
df = convert_time_distance(df, target_feature="Vascular Dementia Report Date")

In [None]:
# Placeholder written wherever a value is missing; deliberately large so it
# stands for "not diagnosed".
SENTINEL = 1000.0
df = df.fillna(value=SENTINEL)

In [None]:
def plot_feature_distribution(df, features):
    """Show one histogram with a KDE overlay for each requested column.

    Every entry of ``features`` is looked up as a column of ``df`` and drawn
    on its own 8x4 figure with 30 bins, then displayed immediately.
    """
    for feature in features:
        _, ax = plt.subplots(figsize=(8, 4))
        sns.histplot(df[feature], kde=True, bins=30, ax=ax)
        ax.set_title(f'Distribution of {feature}')
        ax.set_xlabel(feature)
        ax.set_ylabel('Frequency')
        plt.show()

# Write the processed frame to disk without the index column.
df.to_csv(path_or_buf='test.csv', index=False)

# Preview the first 20 rows as the cell's output.
df.head(n=20)

In [None]:
# Take a copy of DateReportedFields and plot the distribution of each of its columns.
fields = DateReportedFields.copy()

plot_feature_distribution(df=df, features=fields)