# 05.02.2025 - Transformation & Post-processing

In [None]:
import pandas as pd
import numpy as np
import json
from tqdm import tqdm
from pandas.tseries.offsets import MonthBegin

In [None]:

# Raw model generations, taken from this evaluation run:
# https://genentech.wandb.io/nikitamakarov/UC%20-%20ADNI/runs/tyhfwyg7?nw=nwusernikitamakarov
generated_predictions_path = "/pstore/data/dt-gpt/raw_experiments/uc2_nsclc/adni_dt_gpt/adni_dt_gpt/2025_02_03___17_32_50_444796/eval_meta_data/predictions_raw.csv"
generated_predictions = pd.read_csv(generated_predictions_path)

# Outcome columns handled in this notebook
target_cols = ["CDRSB", "ADAS11", "MMSE"]

# Ground truth: keep only the identifier columns plus the outcomes,
# ordered by patient and then by month
true_predictions = (
    pd.read_csv("/home/makaron1/dt-gpt/uc4-alzheimers-disease/data/ADNI_short_DT_GPT/ADNI_short_test_ground_truth.csv")
    .loc[:, ["PATIENT_ID", "MONTH", *target_cols]]
    .sort_values(by=["PATIENT_ID", "MONTH"])
)

# Descriptive score name -> ground-truth column name, plus the inverse lookup
mapping = {
    "CDR-SB score": "CDRSB",
    "ADAS11 score": "ADAS11",
    "MMSE score": "MMSE",
}
reverse_mapping = dict(zip(mapping.values(), mapping.keys()))



In [None]:
# Display the ground-truth frame as a quick visual check of what was loaded
true_predictions

## First do post-processing of responses

In [None]:
# Work on a copy so the raw generations stay untouched
processed_predictions = generated_predictions.copy()

# Clean every response in a single pass, in this order:
#   1. keep only the text before the first "<patient_prediction>"
#      (the model sometimes repeats itself after that marker),
#   2. strip surrounding whitespace,
#   3. cut at the first "]}" and re-append exactly one "]}"
#      (the model sometimes emits the closing "]}" twice).
processed_predictions["responses"] = processed_predictions["responses"].map(
    lambda raw: raw.split("<patient_prediction>")[0].strip().split("]}")[0] + "]}"
)


In [None]:
# Spot-check a single cleaned value: row position 10, column position 1.
# NOTE(review): presumably column position 1 is "responses" - confirm against the CSV column order.
processed_predictions.iloc[10,1]

In [None]:
# Report the number of rows in processed_predictions
print("Length of processed predictions: ", len(processed_predictions))

## Process into dataframes, then average

In [None]:
# Build one dataframe per generated response, aligned to the ground-truth
# months of the corresponding patient, then stack them into `generated_df`.
#
# For every target column the generated values are placed, in order, on the
# months where the ground truth for that target is not missing; all other
# months stay NaN. Rows whose response is not valid JSON are reported and
# skipped. Too many generated values are truncated; too few raise ValueError.
generated_dfs = []

# iterrows() is a generator without a length, so the row count is passed
# explicitly - otherwise tqdm cannot show a progress fraction or an ETA.
for idx, row in tqdm(processed_predictions.iterrows(), total=len(processed_predictions)):
    patient_id = row["patientid"]

    try:
        # Parse the JSON string into a dictionary
        responses = json.loads(row["responses"])
    except json.JSONDecodeError:
        print(f"Invalid JSON for patient {patient_id} at index {idx}. Skipping.")
        continue  # Skip to the next iteration if JSON is invalid

    # Extract the true predictions for the current patient
    true_df = true_predictions[true_predictions["PATIENT_ID"] == patient_id]

    # Initialize a dataframe for the generated data with PATIENT_ID and MONTH
    # (one row per ground-truth month of this patient)
    generated_patient_df = pd.DataFrame({
        "PATIENT_ID": patient_id,
        "MONTH": true_df["MONTH"]
    })

    # Iterate through each target column to align generated values
    for target_col in target_cols:
        # Retrieve the generated values for the current target column;
        # a key missing from the response is treated as "no values generated"
        generated_values = responses.get(reverse_mapping[target_col], [])

        # Identify the months where the true data for this target is not missing
        non_na_true = true_df[["PATIENT_ID", "MONTH", target_col]].dropna(subset=[target_col]).sort_values(by="MONTH").reset_index(drop=True)

        # Check if the number of generated values matches the number of non-missing entries
        if len(generated_values) != len(non_na_true):
            # If generated too much, cut the excess
            if len(generated_values) > len(non_na_true):
                print(f"Generated too many values for '{target_col}' in patient '{patient_id}': "
                      f"expected {len(non_na_true)}, got {len(generated_values)}. Cutting the excess.")
                generated_values = generated_values[:len(non_na_true)]
            else:
                # Too few values cannot be aligned to months, so abort
                raise ValueError(
                    f"Length mismatch for '{target_col}' in patient '{patient_id}': "
                    f"expected {len(non_na_true)}, got {len(generated_values)}."
                )

        # Initialize the target column with NaNs
        generated_patient_df[target_col] = np.nan

        # Assign the generated values, in order, to the months that have a
        # non-missing ground-truth value for this target
        generated_patient_df.loc[
            generated_patient_df["MONTH"].isin(non_na_true["MONTH"]),
            target_col
        ] = generated_values

    # Append the generated dataframe for the current patient to the list
    generated_dfs.append(generated_patient_df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with the same exception type but say what actually went wrong.
if not generated_dfs:
    raise ValueError(
        "No generated response could be parsed into a dataframe; "
        "every row was skipped as invalid JSON or there were no rows at all."
    )

# Concatenate all generated dataframes into a single dataframe
generated_df = pd.concat(generated_dfs, ignore_index=True)

# (Optional) If you want to verify the alignment, you can perform additional checks here
# For example:
# assert generated_df.isna().sum().sum() == 0, "There are still missing values in the generated dataframe."

# Display the first few rows of the generated dataframe
print(generated_df.head())

In [None]:
# Collapse repeated rows: one row per (patient, month) holding the mean of
# every remaining column; the grouping keys stay as regular columns
generated_df_averaged = generated_df.groupby(["PATIENT_ID", "MONTH"], as_index=False).mean()

In [None]:
# Sanity check of the averaging for the first patient in the averaged frame:
# the CDRSB value of that patient's first averaged row must equal the mean of
# the un-averaged CDRSB values at that patient's earliest month.
patient_id_0 = generated_df_averaged["PATIENT_ID"].iloc[0]
original_df = generated_df[generated_df["PATIENT_ID"] == patient_id_0]
original_df_first_time = original_df[original_df["MONTH"] == original_df["MONTH"].min()]

averaged_first_cdrsb = generated_df_averaged[generated_df_averaged["PATIENT_ID"] == patient_id_0]["CDRSB"].iloc[0]
expected_first_cdrsb = original_df_first_time["CDRSB"].mean()

# Compare with a tolerance instead of ==: both sides are floating-point means,
# so exact equality can fail on rounding alone, and NaN == NaN is always False
# (both are NaN when CDRSB has no values at the first month).
assert np.isclose(averaged_first_cdrsb, expected_first_cdrsb, equal_nan=True), (
    f"Averaged CDRSB {averaged_first_cdrsb} does not match expected mean {expected_first_cdrsb} "
    f"for patient {patient_id_0}"
)

## Save DT-GPT Outputs in common format

In [None]:
# Build the output frame in one step; `.assign` returns a new dataframe, so
# generated_df_averaged_no_na itself is left unmodified.
#   - each target is standardized as (value - mean) / std
#   - every row gets the same placeholder patient_sample_index
# NOTE(review): the CDRSB mean is referenced as `mean_CDSSB` while its std is
# `std_CDRSB` - confirm the spelling matches where these are defined.
dt_gpt_outputs = generated_df_averaged_no_na.assign(
    CDRSB=lambda frame: (frame["CDRSB"] - mean_CDSSB) / std_CDRSB,
    ADAS11=lambda frame: (frame["ADAS11"] - mean_ADAS11) / std_ADAS11,
    MMSE=lambda frame: (frame["MMSE"] - mean_MMSE) / std_MMSE,
    patient_sample_index="split_0",
)

dt_gpt_outputs

In [None]:
# Write the outputs to CSV, leaving the dataframe index out of the file
dt_gpt_outputs.to_csv(
    path_or_buf="/home/makaron1/dt-gpt/uc2_nsclc/2_experiments/2025_02_03_adni/3_dt_gpt/outputs/dt_gpt_outputs.csv",
    index=False,
)