In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import get_columns_from_chunk
from ProjectFunctions import convert_date_to_binary
from ProjectFunctions import count_na_in_dataframe
from ProjectFunctions import count_na_and_negative
from ProjectFunctions import one_hot_encode_vascular_problems
from ProjectFunctions import drop_rows_with_na_greater_than
from ProjectFunctions import map_education_levels, map_vascular_levels


In [None]:
# Read the raw diagnosed-patient table from disk.
raw_csv_path = "diagnosed.csv"
diagnosed_df = pd.read_csv(raw_csv_path)

# Some fields record a diagnosis as the date it was made, with NA meaning
# "undiagnosed". The fields listed in DateReportedFields can be collapsed to
# binary flags: NA -> 0, any date in range -> 1 (see convert_date_to_binary).
# NOTE(review): the conversion is currently disabled; the call is kept for reference.
#diagnosed_df = convert_date_to_binary(diagnosed_df, DateReportedFields)

# Report the number of NA values per feature; DateReportedFields is passed
# as the exclude list.
count_na_in_dataframe(diagnosed_df, exclude=DateReportedFields)

#### The following fields were removed due to a significant number of NA values:
* Oestradiol
* Rheumatoid factor
* Recent trouble concentrating on things
* Bipolar or Major Depression
* Ever had prolonged feelings of sadness or depression

#### Map education and vascular problems

In [None]:

# Column holding the vascular-problem report and the recoding applied to it.
# -7 becomes 0 and -3 becomes missing.
# NOTE(review): presumably -7 / -3 are the survey's "none of the above" /
# "prefer not to answer" codes; confirm against the data dictionary.
vascular_col = "Report of vascular problems"
vascular_recode = {-7: 0, -3: pd.NA}

# Re-express education as an increasing hierarchy.
diagnosed_df = map_education_levels(diagnosed_df)

# Recode the vascular-problem column to meaningful values.
diagnosed_df[vascular_col] = diagnosed_df[vascular_col].replace(vascular_recode)

# Persist the mapped table (without the row index) for the following cells.
diagnosed_df.to_csv("diagnosed_processed.csv", index=False)

#### Histogram of patients by NA values

In [None]:

# Plot a histogram of per-patient NA counts to see how many patients are missing a lot of data and how many are missing only a little.
def plot_na_histogram(df, bin_size=5):
    """Plot a histogram of how many NA values each row of ``df`` contains.

    Rows are grouped by their NA count into bins of width ``bin_size`` whose
    edges start at 0. The figure is shown with ``plt.show()``; nothing is
    returned.

    Args:
        df: DataFrame whose per-row NA counts are plotted.
        bin_size: Width of each histogram bin, a positive integer.
            Defaults to 5.

    Raises:
        ValueError: If ``bin_size`` is less than 1.
    """
    if bin_size < 1:
        raise ValueError("bin_size must be a positive integer")

    na_counts = df.isna().sum(axis=1)  # NA count per row

    # An empty frame has no maximum (max() would be NaN); treat it as 0.
    max_na = int(na_counts.max()) if len(na_counts) else 0

    # Round the upper edge up to a multiple of bin_size and always keep at
    # least one full bin. With max_na == 0 a bare range(0, bin_size, bin_size)
    # yields the single edge [0], which defines no bin and leaves the plot
    # empty. For max_na > 0 the edges match range(0, max_na + bin_size, bin_size).
    n_bins = max(1, -(-max_na // bin_size))
    bins = [edge * bin_size for edge in range(n_bins + 1)]

    _, ax = plt.subplots(figsize=(10, 6))
    ax.hist(na_counts, bins=bins, edgecolor='black', alpha=0.7)
    ax.set_xlabel('Number of NA values per row')
    ax.set_ylabel('Frequency')
    ax.set_title('Histogram of NA Counts per Row')
    ax.set_xticks(bins)  # one tick per bin edge
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


# Reload the processed table from disk and show how the per-row NA counts
# are distributed.
processed_csv_path = "diagnosed_processed.csv"
df = pd.read_csv(processed_csv_path)
plot_na_histogram(df)

#### Remove patients with too many NA values

In [None]:

# Threshold handed to drop_rows_with_na_greater_than for the per-row NA count.
MAX_NA_PER_ROW = 45

df = pd.read_csv("diagnosed_processed.csv")

# errors="ignore": a later cell writes this frame back to the same CSV without
# the "id" column, so re-running this cell must tolerate "id" already being gone
# instead of raising KeyError.
df = df.drop(columns=["id"], errors="ignore")

# Negative values in numeric columns are treated as missing: mask() replaces
# every entry where the condition holds with NA.
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].mask(df[numeric_cols] < 0)

df = drop_rows_with_na_greater_than(df, MAX_NA_PER_ROW)

# Summary statistics of the remaining rows (displayed as the cell output).
df.describe()


In [None]:
# Persist the current df without the row index.
# NOTE: this overwrites "diagnosed_processed.csv", the same file the earlier
# cells read, so those cells will see this frame on any later re-run.
df.to_csv("diagnosed_processed.csv", index=False)

#### Estimate missing values using Iterative Imputer (optional for some XGBoost models)

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor
import time

In [None]:
# Load the processed table that still contains missing values.
df = pd.read_csv("diagnosed_processed.csv")

# Fit the iterative imputer (at most 50 rounds) and fill in the missing entries.
imputer = IterativeImputer(max_iter=50)
imputed_values = imputer.fit_transform(df)

# fit_transform returns a plain array, so rebuild a DataFrame with the
# original column labels.
df_imputed = pd.DataFrame(data=imputed_values, columns=df.columns)

# Save the imputed table (without the row index) and display its summary statistics.
df_imputed.to_csv("diagnosed_imputed.csv", index=False)
df_imputed.describe()