In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from DataFields import DataFields
from DataFields import DateReportedFields
from ProjectFunctions import get_columns_from_chunk
from ProjectFunctions import convert_date_to_binary
from ProjectFunctions import count_na_in_dataframe
from ProjectFunctions import count_na_and_negative

In [None]:
# Load the diagnosed-patients table and turn every "date reported" field into a
# true/false flag: what matters here is whether a person was diagnosed with the
# disease, not when the diagnosis was reported.
diagnosed_df = convert_date_to_binary(pd.read_csv("diagnosed.csv"), DateReportedFields)

# Cell output: result of count_na_in_dataframe on the converted frame.
count_na_in_dataframe(diagnosed_df)

#### Post-process the Education column into a meaningful ordinal scale (a higher number means a higher level of education)

In [None]:
from ProjectFunctions import map_education_levels

# Replace diagnosed_df with the output of map_education_levels.
# NOTE(review): the input name is rebound to the result, so re-running this cell feeds
# already-mapped data back into the function — confirm map_education_levels is safe to apply twice.
diagnosed_df = map_education_levels(diagnosed_df)

#### Post-process "Report of Vascular problems": set -7 to 0, set -3 to NA, and recode each condition to a severity level

In [None]:
col_name = "Report of Vascular problems"

# Recode the negative sentinel answers: -7 becomes 0 and -3 becomes missing.
# np.nan is used instead of pd.NA so the column stays numeric; putting pd.NA into a
# plain int/float column can upcast it to object dtype.
cleaned = diagnosed_df[col_name].replace({-7: 0, -3: np.nan})

# Map each reported condition code to a severity level (higher = more severe).
severity_mapping = {
    1: 3,  # Heart attack
    2: 2,  # Angina
    3: 3,  # Stroke
    4: 1,  # High blood pressure
}

# Codes that are not in the mapping (0 and missing values) pass through unchanged.
# NOTE: this cell is not idempotent. Running it a second time on the already-mapped
# column would turn severity 1 (from code 4) into 3. Re-run from the load cell instead.
diagnosed_df[col_name] = cleaned.map(lambda code: severity_mapping.get(code, code))


In [None]:
# Persist the post-processed frame. index=True is the to_csv default, spelled out here
# because the written index comes back as an "Unnamed: 0" column when the file is re-read.
diagnosed_df.to_csv("diagnosed_processed.csv", index=True)

#### Understand the missing data: check whether the NA values are spread uniformly across patients or concentrated in a few

In [None]:
import matplotlib.pyplot as plt

def plot_na_histogram(df, bin_size=5):
    """Plot a histogram of the number of NA values per row of ``df``.

    Each row is one patient, so the plot shows how many patients are missing a lot
    of data and how many are missing only a little.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose per-row NA counts are plotted.
    bin_size : int, default 5
        Width of each histogram bin, in number of NA values.

    Raises
    ------
    ValueError
        If ``bin_size`` is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be a positive integer")

    na_counts = df.isna().sum(axis=1)  # Count NA values per row

    # An empty frame has no maximum (max() would be NaN); treat it as zero NAs.
    max_na = 0 if na_counts.empty else int(na_counts.max())

    # Round the largest count up to a multiple of bin_size, but always keep at least
    # one full bin: with zero NAs everywhere a single edge [0] is not a valid bin list.
    upper_edge = max(bin_size, -(-max_na // bin_size) * bin_size)
    bins = list(range(0, upper_edge + 1, bin_size))

    plt.figure(figsize=(10, 6))
    plt.hist(na_counts, bins=bins, edgecolor='black', alpha=0.7)
    plt.xlabel('Number of NA values per row')
    plt.ylabel('Frequency')
    plt.title('Histogram of NA Counts per Row')
    plt.xticks(bins)
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()

# Reload the processed data from disk and plot its per-row NA distribution.
df = pd.read_csv("diagnosed_processed.csv")
plot_na_histogram(df=df)



#### Remove patients with too many NA values (more than 20)

In [None]:
def count_rows_with_na_greater_than(df, x):
    """Return how many rows of ``df`` contain strictly more than ``x`` NA values."""
    missing_per_row = df.isnull().sum(axis="columns")
    exceeds_limit = missing_per_row.gt(x)
    return exceeds_limit.sum()

def drop_rows_with_na_greater_than(df, x):
    """Return the rows of ``df`` that contain at most ``x`` NA values."""
    keep_mask = df.isnull().sum(axis="columns").le(x)
    return df.loc[keep_mask]

# Reload the processed data and keep only the rows with at most 20 missing values.
df = drop_rows_with_na_greater_than(pd.read_csv("diagnosed_processed.csv"), 20)

# Cell output: describe() summary of the rows that remain.
df.describe()

In [None]:
# Overwrite the processed file with the filtered rows. index=True is the to_csv default,
# spelled out here because each write adds one more "Unnamed: 0*" column on the next read.
df.to_csv("diagnosed_processed.csv", index=True)

#### Estimate the missing values in each column using Iterative Imputer

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [None]:
df = pd.read_csv("diagnosed_processed.csv")

# Every to_csv()/read_csv() round trip that wrote the index left behind one more
# "Unnamed: 0", "Unnamed: 0.1", ... column. Drop all of them by prefix rather than by a
# fixed list: a fixed list raises KeyError when fewer round trips happened, and lets a
# leftover index column through as a feature when more happened (e.g. a re-run cell).
index_artifacts = [col for col in df.columns if str(col).startswith("Unnamed:")]

# "id" is an identifier, not a feature, so it is removed before imputation as well.
df = df.drop(columns=index_artifacts + ["id"], errors="ignore")

In [None]:
import time

# Note: At 100 estimators each iteration is 2.25 minutes.

# Both the random forest and the iterative imputer are stochastic. A fixed seed makes
# the imputed values reproducible across kernel restarts.
RANDOM_STATE = 42

# perf_counter is monotonic, so the measured duration cannot be skewed by clock changes.
start_time = time.perf_counter()

estimator = RandomForestRegressor(
    n_estimators=50,
    max_depth=15,
    n_jobs=-1,  # use all available threads
    random_state=RANDOM_STATE,
)

imputer = IterativeImputer(estimator=estimator, max_iter=30, random_state=RANDOM_STATE)

# fit_transform returns a bare array; restore the column labels and the row index.
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns, index=df.index)

print(f"Code execution time: {(time.perf_counter() - start_time):.4f} seconds")

df_imputed.to_csv("diagnosed_imputed_2.csv")

In [None]:
# Cell output: describe() summary of the imputed frame.
df_imputed.describe()