# 05.02.2025 - Transformation & Post processing

In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from pandas.tseries.offsets import MonthBegin

In [None]:
# Raw model outputs to be post-processed.
generated_predictions_path = ".../predictions_raw.csv"
generated_predictions = pd.read_csv(generated_predictions_path)

# Columns that are predicted and evaluated.
target_cols = ["CDRSB", "ADAS11", "MMSE"]

# Ground truth, reduced to the identifier columns plus the targets and
# ordered by patient, then by month within each patient.
true_predictions = (
    pd.read_csv(".../ADNI_short_test_ground_truth.csv")
    [["PATIENT_ID", "MONTH"] + target_cols]
    .sort_values(by=["PATIENT_ID", "MONTH"])
)

# Key used in the generated JSON -> column name used in the dataframes.
mapping = {
    "CDR-SB score": "CDRSB",
    "ADAS11 score": "ADAS11",
    "MMSE score": "MMSE",
}
# Column name -> JSON key (inverse lookup of `mapping`).
reverse_mapping = {column: json_key for json_key, column in mapping.items()}



In [None]:
# Display the ground-truth frame (already reduced to id + target columns and sorted).
true_predictions

## First do post-processing of responses

In [None]:
def _clean_response(raw):
    """Trim one raw model response down to a single JSON-looking payload.

    Steps, in order:
      1. Keep only the text before the first "<patient_prediction>" marker
         (the model sometimes repeats itself after it).
      2. Strip surrounding whitespace.
      3. Cut at the first "]}" and re-append exactly one "]}" (the model
         sometimes emits the closing "]}" twice).

    A non-string value (e.g. NaN, which is how read_csv represents an empty
    cell) is treated as an empty response instead of raising AttributeError.
    The result is then just "]}", which is not valid JSON.
    """
    text = raw if isinstance(raw, str) else ""
    text = text.split("<patient_prediction>")[0].strip()
    return text.split("]}")[0] + "]}"


# Work on a copy so the raw generations stay untouched.
processed_predictions = generated_predictions.copy()
processed_predictions["responses"] = processed_predictions["responses"].apply(_clean_response)


In [None]:
# Spot-check one cleaned entry: row position 10, column position 1.
# NOTE(review): presumably column 1 is "responses" — confirm against the CSV layout.
# Raises IndexError if the frame has fewer than 11 rows or fewer than 2 columns.
processed_predictions.iloc[10,1]

In [None]:
# Report how many rows are in the cleaned predictions frame.
print("Length of processed predictions: ", len(processed_predictions))

## Process into dataframes, then average

In [None]:
# Turn every generated response into a dataframe aligned with the ground-truth
# months of its patient, then stack all of them into `generated_df`.
generated_dfs = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly; without it tqdm can only show a running counter.
for idx, row in tqdm(processed_predictions.iterrows(), total=len(processed_predictions)):
    patient_id = row["patientid"]

    try:
        # Parse the JSON string into a dictionary
        responses = json.loads(row["responses"])
    except (json.JSONDecodeError, TypeError):
        # TypeError covers non-string responses (e.g. NaN from an empty CSV
        # cell); they are skipped exactly like malformed JSON.
        print(f"Invalid JSON for patient {patient_id} at index {idx}. Skipping.")
        continue

    # Ground-truth rows for the current patient
    true_df = true_predictions[true_predictions["PATIENT_ID"] == patient_id]

    # One row per ground-truth month; the frame shares true_df's index
    generated_patient_df = pd.DataFrame({
        "PATIENT_ID": patient_id,
        "MONTH": true_df["MONTH"]
    })

    for target_col in target_cols:
        # Values generated for this target; a missing key counts as "no values"
        generated_values = responses.get(reverse_mapping[target_col], [])

        # Months at which the ground truth has a value for this target
        non_na_true = (
            true_df[["PATIENT_ID", "MONTH", target_col]]
            .dropna(subset=[target_col])
            .sort_values(by="MONTH")
            .reset_index(drop=True)
        )
        expected_count = len(non_na_true)

        if len(generated_values) > expected_count:
            # Surplus values are dropped from the end
            print(f"Generated too many values for '{target_col}' in patient '{patient_id}': "
                  f"expected {expected_count}, got {len(generated_values)}. Cutting the excess.")
            generated_values = generated_values[:expected_count]
        elif len(generated_values) < expected_count:
            # Too few values cannot be aligned to months; abort loudly
            raise ValueError(
                f"Length mismatch for '{target_col}' in patient '{patient_id}': "
                f"expected {expected_count}, got {len(generated_values)}."
            )

        # Start with NaN everywhere, then fill the months that have ground truth.
        # The positional assignment relies on true_predictions being sorted by
        # MONTH within each patient, so that the masked rows come in the same
        # order as the generated values.
        generated_patient_df[target_col] = np.nan
        generated_patient_df.loc[
            generated_patient_df["MONTH"].isin(non_na_true["MONTH"]),
            target_col
        ] = generated_values

    generated_dfs.append(generated_patient_df)

# Concatenate all generated dataframes into a single dataframe
generated_df = pd.concat(generated_dfs, ignore_index=True)


# Display the first few rows of the generated dataframe
print(generated_df.head())

In [None]:
# Collapse the repeated samples: one row per (patient, month) holding the mean
# of every remaining column. The group keys are kept as ordinary columns.
generated_df_averaged = generated_df.groupby(["PATIENT_ID", "MONTH"], as_index=False).mean()

In [None]:
# Sanity check: for the first patient, the averaged CDRSB at the earliest month
# must equal the mean of that patient's raw CDRSB samples at that month.
patient_id_0 = generated_df_averaged["PATIENT_ID"].iloc[0]
original_df = generated_df[generated_df["PATIENT_ID"] == patient_id_0]
original_df_first_time = original_df[original_df["MONTH"] == original_df["MONTH"].min()]

averaged_first_cdrsb = generated_df_averaged[generated_df_averaged["PATIENT_ID"] == patient_id_0]["CDRSB"].iloc[0]
expected_first_cdrsb = original_df_first_time["CDRSB"].mean()

# Compare with a tolerance and treat NaN as equal to NaN: an exact `==` fails
# when the first month has no CDRSB value at all (both means are NaN) or when
# the two means differ only by floating-point rounding.
assert np.isclose(averaged_first_cdrsb, expected_first_cdrsb, equal_nan=True), (
    f"Averaged CDRSB {averaged_first_cdrsb} differs from recomputed mean {expected_first_cdrsb}"
)

## Compare with the ground truth using MAE

In [None]:
# Rename any JSON-style labels to the short column names.
# NOTE(review): the averaged frame is built from `target_cols`, so its columns
# already carry the short names and none of the keys in `mapping` match; with
# the current inputs this rename leaves the frame unchanged.
generated_df_averaged = generated_df_averaged.rename(mapping, axis="columns")

In [None]:
# Pair every averaged prediction with its ground-truth row. The default inner
# join keeps only (patient, month) pairs present on both sides; shared target
# columns are disambiguated with the "_generated" / "_true" suffixes.
merged_df = generated_df_averaged.merge(
    true_predictions,
    on=["PATIENT_ID", "MONTH"],
    suffixes=("_generated", "_true"),
)

In [None]:
# Mean absolute error per target column (NaN differences are skipped by mean()).
mae = {
    col: (merged_df[col + "_generated"] - merged_df[col + "_true"]).abs().mean()
    for col in target_cols
}

print("========== RESULTS ==========")
print(mae)

In [None]:
# Per-target standard deviation and mean, taken from the statistics file.
# NOTE(review): `mean_CDSSB` is spelled differently from `std_CDRSB`; the name
# is left untouched because other cells reference it.
std_CDRSB, mean_CDSSB = 1.8347716172641326, 1.7012566427720828
std_ADAS11, mean_ADAS11 = 6.62023532076858, 10.571872441365645
std_MMSE, mean_MMSE = 2.9418718345388455, 27.095790481554758

# Express each MAE in units of that target's standard deviation.
mae_normalized = {
    name: mae[name] / scale
    for name, scale in (
        ("CDRSB", std_CDRSB),
        ("ADAS11", std_ADAS11),
        ("MMSE", std_MMSE),
    )
}

print("========== NORMALIZED RESULTS ==========")
print(mae_normalized)

## Compare with Copy Forward & LightGBM

In [None]:
# Baseline outputs to compare against: copy-forward predictions and the
# matching targets (directory prefixes are elided in this notebook).
copy_forward_predictions = pd.read_csv(".../TEST_prediction_dataframe.csv")
copy_forward_targets = pd.read_csv(".../TEST_target_dataframe.csv")


# NOTE(review): this path literal is identical to the copy-forward predictions
# path above. If both resolve to the same file, the "LightGBM" results computed
# later are really copy-forward results — confirm the directories differ.
lightgbm_predictions = pd.read_csv(".../TEST_prediction_dataframe.csv")

In [None]:
# Display the loaded LightGBM prediction frame for a visual check.
lightgbm_predictions

In [None]:
# Bring the averaged generations into the baseline layout:
# columns "patientid", "date", then the targets.
start_date = pd.to_datetime('2020-01-01')

# Rows where every target is NaN carry no prediction and are dropped.
generated_df_averaged_no_na = generated_df_averaged.dropna(subset=target_cols, how="all").copy()

# MONTH is turned into a calendar date by stepping that many month-starts
# forward from the fixed start date.
generated_df_averaged_no_na["date"] = start_date + generated_df_averaged_no_na["MONTH"]  * MonthBegin(1)

# Rename the id column and keep only the baseline columns (this also discards MONTH).
generated_df_averaged_no_na = generated_df_averaged_no_na.rename(
    columns={"PATIENT_ID": "patientid"}
)[["patientid", "date"] + target_cols]

# Make sure "date" is a proper datetime column before it is used as a merge key.
generated_df_averaged_no_na["date"] = pd.to_datetime(generated_df_averaged_no_na["date"])

In [None]:
# Drop rows in which every target is missing (on copies of the loaded frames).
copy_forward_predictions_no_na = copy_forward_predictions.dropna(subset=target_cols, how="all").copy()
copy_forward_targets_no_na = copy_forward_targets.dropna(subset=target_cols, how="all").copy()

# (std, mean) used to map each standardized column back to its original scale.
_destandardize_stats = {
    "CDRSB": (std_CDRSB, mean_CDSSB),
    "ADAS11": (std_ADAS11, mean_ADAS11),
    "MMSE": (std_MMSE, mean_MMSE),
}

# NOTE: lightgbm_predictions is the loaded frame itself, not a copy, so it is
# modified in place. Re-running this cell without reloading it applies the
# destandardization a second time.
_frames_to_restore = (
    copy_forward_predictions_no_na,
    copy_forward_targets_no_na,
    lightgbm_predictions,
)

# Destandardize: value * std + mean, column by column.
for _frame in _frames_to_restore:
    for _column, (_col_std, _col_mean) in _destandardize_stats.items():
        _frame[_column] = _frame[_column] * _col_std + _col_mean

# Convert date to datetime
for _frame in _frames_to_restore:
    _frame["date"] = pd.to_datetime(_frame["date"])


In [None]:
def _mae_for_column(truth, predictions, col, suffix):
    """MAE between `truth[col]` and `predictions[col]`, matched on patientid and date.

    Rows where `col` is NaN are dropped from both frames before they are
    inner-joined; `suffix` labels the prediction column in the joined frame.
    """
    observed = truth[["patientid", "date", col]].dropna(subset=[col], how="all")
    predicted = predictions[["patientid", "date", col]].dropna(subset=[col], how="all")
    paired = pd.merge(observed, predicted, on=["patientid", "date"], suffixes=("_true", suffix))
    return np.abs(paired[col + suffix] - paired[col + "_true"]).mean()


# MAE per target for each method, all measured against the copy-forward targets.
mae_copy_forward = {}
mae_generated = {}
mae_lightgbm = {}

for col in target_cols:
    mae_generated[col] = _mae_for_column(
        copy_forward_targets_no_na, generated_df_averaged_no_na, col, "_generated"
    )
    mae_copy_forward[col] = _mae_for_column(
        copy_forward_targets_no_na, copy_forward_predictions_no_na, col, "_copy_forward"
    )
    mae_lightgbm[col] = _mae_for_column(
        copy_forward_targets_no_na, lightgbm_predictions, col, "_lightgbm"
    )

print("========== COPY FORWARD RESULTS ==========")
print(mae_copy_forward)

print("========== LIGHTGBM RESULTS ==========")
print(mae_lightgbm)

print("========== DT-GPT RESULTS ==========")
print(mae_generated)

## Save DT-GPT Outputs in common format

In [None]:
# Start from the baseline-formatted generations.
dt_gpt_outputs = generated_df_averaged_no_na.copy()

# Standardize each target: (value - mean) / std.
for _column, (_center, _scale) in {
    "CDRSB": (mean_CDSSB, std_CDRSB),
    "ADAS11": (mean_ADAS11, std_ADAS11),
    "MMSE": (mean_MMSE, std_MMSE),
}.items():
    dt_gpt_outputs[_column] = (dt_gpt_outputs[_column] - _center) / _scale

# Placeholder sample index: every row gets the same constant label.
dt_gpt_outputs["patient_sample_index"] = "split_0"

dt_gpt_outputs

In [None]:
# Save the standardized outputs as CSV without the index column.
from pathlib import Path

output_path = Path("./outputs/dt_gpt_outputs.csv")
# to_csv does not create missing directories, so make sure the target folder
# exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
dt_gpt_outputs.to_csv(output_path, index=False)