In [None]:
import numpy as np
import pandas as pd

# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is a deprecated path.
from IPython.display import HTML, display

# Lift every pandas display limit so frames and long text cells (such as the
# generated reports) are rendered in full instead of being truncated.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# Inject CSS that stretches the notebook's `.container` element to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    The summary is printed once: the frame's shape, followed by a table with one
    line per column giving its name, number of null values, number of distinct
    values and dtype.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        ``df.head()``, so the call can sit at the end of a notebook cell and
        still render a preview of the data.
    """
    # Imported locally so that `tabulate` is only required when this helper runs.
    from tabulate import tabulate

    # NOTE: the summary used to sit inside a `while True:` loop with no exit,
    # which printed forever and made the return statement unreachable.
    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    # One (name, nulls, uniques, dtype) record per column of `df`.
    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"])
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

In [None]:
# Read the inference predictions (with metadata and report columns) into memory.
# NOTE(review): this is a hard-coded absolute path on one machine; consider
# deriving it from a configurable data directory.
csv_file_path = (
    "/media/data1/ravram/DeepCORO/processed_dataframes/"
    "ObjectRecon_SWIN3D_2016-2023_inference_predictions_with_df_metadata_and_report.csv"
)
df_predictions = pd.read_csv(csv_file_path)

In [None]:
import pandas as pd
from tqdm import tqdm

# Maps each stenosis column name to the phrase used for that vessel in the report.
# Keys are looked up as columns of every row by `create_report`; values are placed
# at the very start of a sentence by `format_stenosis_statement`, so each one is
# written as a noun phrase beginning with "the".
# NOTE(review): `lad_stenosis` and `lcx_stenosis` carry no segment prefix but are
# described as the proximal segment/branch - confirm against the label definitions.
labels_to_vessel_names = {
    "lad_stenosis": "the proximal segment of the Left Anterior Descending (LAD) artery",
    "dist_lad_stenosis": "the distal segment of the Left Anterior Descending (LAD) artery",
    "mid_lad_stenosis": "the mid segment of the Left Anterior Descending (LAD) artery",
    "lcx_stenosis": "the proximal branch of the Left Circumflex (LCX) artery",
    "dist_lcx_stenosis": "the distal branch of the Left Circumflex (LCX) artery",
    "leftmain_stenosis": "the Left Main Coronary Artery (LMCA)",
    "prox_rca_stenosis": "the proximal Right Coronary Artery (RCA)",
    "mid_rca_stenosis": "the mid portion of the Right Coronary Artery (RCA)",
    "dist_rca_stenosis": "the distal portion of the Right Coronary Artery (RCA)",
    "posterolateral_stenosis": "the posterolateral branch",
    "pda_stenosis": "the Posterior Descending Artery (PDA)",
}


def format_stenosis_statement(vessel_name, percentage):
    """
    Turn one vessel's stenosis percentage into a single narrative sentence.

    Grading: exactly 0 is reported as free of significant stenosis; values in
    (0, 50) are "mild", [50, 70) "moderate", [70, 90) "severe". Every other value
    (90 and above, but also anything that matches no band, such as negative
    numbers or NaN) is reported as "critical".
    """
    if percentage == 0:
        return f"{vessel_name} appears free of significant stenosis (0%)."

    # (grade, inclusive lower bound, exclusive upper bound); zero was handled above,
    # so the first band effectively starts just above 0.
    severity_bands = (
        ("mild", 0, 50),
        ("moderate", 50, 70),
        ("severe", 70, 90),
    )

    grade = "critical"  # fallback when no band matches
    for band_name, lower, upper in severity_bands:
        if lower <= percentage < upper:
            grade = band_name
            break

    return f"{vessel_name} shows {grade} stenosis (~{percentage}%)."


def create_report(row):
    """
    Assemble the narrative report for one row of the predictions frame.

    For every label in `labels_to_vessel_names` that is present in `row`, a
    sentence is generated unless the value is missing or equal to -1. If
    `coronary_dominance` is present and not missing, a closing sentence about the
    dominance is appended (underscores replaced by spaces). The sentences are
    joined with single spaces and returned as one string.
    """
    # Pair each vessel phrase with its value, skipping labels absent from the row.
    measured = (
        (vessel_name, row[label])
        for label, vessel_name in labels_to_vessel_names.items()
        if label in row
    )
    # Missing values and the -1 marker produce no sentence.
    findings = [
        format_stenosis_statement(vessel_name, value)
        for vessel_name, value in measured
        if pd.notna(value) and value != -1
    ]

    if findings:
        sentences = ["Angiographic assessment reveals the following findings:", *findings]
    else:
        # NOTE(review): this branch is also taken when every value is missing or -1,
        # i.e. when nothing was measured - confirm that wording is intended.
        sentences = ["No significant stenosis was noted in the evaluated vessels."]

    if "coronary_dominance" in row and pd.notna(row["coronary_dominance"]):
        dominance_text = row["coronary_dominance"].replace("_", " ")
        sentences.append(f"The coronary circulation is {dominance_text}.")

    return " ".join(sentences)


# Build the 'Report' column by running create_report on every row (axis=1).
# tqdm.pandas(...) is called first so that `progress_apply` is available; the
# bar is labelled "Generating Reports". The column is added to df_predictions
# in place.
tqdm.pandas(desc="Generating Reports")
df_predictions["Report"] = df_predictions.progress_apply(create_report, axis=1)

In [None]:
import os

output_file_path = "data/reports/reports_with_alpha_separator_no_conclusion.csv"
output_dir = os.path.dirname(output_file_path)

# Create the output directory if needed; exist_ok=True makes this a no-op when it
# already exists and avoids the check-then-create race of a separate exists() test.
os.makedirs(output_dir, exist_ok=True)

# Drop rows where 'External_Exam' is True. `.ne(True)` is the method form of
# `!= True`, so rows with a missing flag are still kept.
df_predictions = df_predictions[df_predictions["External_Exam"].ne(True)]

# Keep rows that have a report and whose 'object_value' is 5 or 9.
df_non_nan_reports = df_predictions.dropna(subset=["Report"])
df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]

# "α" is used as the column separator when writing the file.
df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)

In [None]:
# Spot-check five randomly chosen generated reports.
df_non_nan_reports["Report"].sample(n=5)

In [None]:
# Reload the exported reports so the following cells can run from the file alone.
output_file_path = "data/reports/reports_with_alpha_separator_no_conclusion.csv"
# "α" is more than one byte once encoded, which the default C parser does not
# support; pandas would fall back to the Python engine with a ParserWarning.
# Requesting that engine explicitly gives the same parse without the warning.
df_non_nan_reports = pd.read_csv(output_file_path, sep="α", engine="python")

In [None]:
# Patient-level split: 80% of the unique patients go to train and the remaining
# 20% to validation, so no patient contributes rows to both splits.
unique_patients = df_non_nan_reports["CathReport_MRN"].drop_duplicates()
train_size = int(0.8 * len(unique_patients))
train_patients = unique_patients.sample(n=train_size, random_state=42)
val_patients = unique_patients.drop(train_patients.index)

# Keep only rows whose patient landed in one of the two splits. The explicit
# .copy() makes df_sampled an independent frame, so the "Split" assignments
# below do not write into a slice of df_non_nan_reports (SettingWithCopyWarning).
df_sampled = df_non_nan_reports[
    df_non_nan_reports["CathReport_MRN"].isin(train_patients)
    | df_non_nan_reports["CathReport_MRN"].isin(val_patients)
].copy()

# Assign split based on CathReport_MRN
df_sampled.loc[df_sampled["CathReport_MRN"].isin(train_patients), "Split"] = "train"
df_sampled.loc[df_sampled["CathReport_MRN"].isin(val_patients), "Split"] = "val"

# Draw 40000 rows with a fixed seed so a re-run produces the same file.
# reset_index() (without drop=True) keeps the previous row labels as an "index"
# column in the saved CSV, as before.
df_sampled = df_sampled.sample(n=40000, random_state=42).reset_index()

# Save the dataframe with split information to a new CSV file
output_sampled_file_path = "data/reports/reports_sampled_no_conclusion_40000.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

display(df_sampled.Split.value_counts())

In [None]:
# Reload the sampled reports written with the "α" separator.
# engine="python" is requested explicitly: the default C parser does not support
# a separator that is more than one byte once encoded, and pandas would otherwise
# fall back to the Python engine with a ParserWarning.
# NOTE(review): despite its name, `output_sampled_file_path` holds the loaded
# DataFrame from here on, not a path; the name is kept so later cells still work.
output_sampled_file_path = pd.read_csv(
    "data/reports/reports_sampled_no_conclusion_40000.csv", sep="α", engine="python"
)

In [None]:
# Preview the first five reports of the reloaded sample.
display(output_sampled_file_path["Report"].head(5))