In [None]:
import numpy as np
import pandas as pd

# Show DataFrames without truncation: every row, every column, unlimited
# line width and unlimited cell text width.
# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)

# Import from the public `IPython.display` module. The `IPython.core.display`
# re-export of `display` has been deprecated since IPython 7.14 and is not
# guaranteed to exist in newer releases.
from IPython.display import HTML, display

# Inject CSS that forces elements with class `container` to full width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    Prints the frame's shape, followed by a table with one row per column
    holding the column name, its null count, its number of unique values
    and its dtype.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        ``df.head()``.
    """
    # Imported locally so tabulate is only required when this helper is called.
    from tabulate import tabulate

    # The summary is printed exactly once; the previous `while True:` wrapper
    # had no exit, so the function never reached its return statement.
    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"])
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

In [None]:
# Read the inference-predictions CSV into a DataFrame.
# The path is split across two literals purely for readability; the
# concatenated value is the same absolute path.
csv_file_path = (
    "/media/data1/ravram/DeepCORO/processed_dataframes/"
    "ObjectRecon_SWIN3D_2016-2023_inference_predictions_with_df_metadata_and_report.csv"
)
df_predictions = pd.read_csv(csv_file_path)

In [None]:
# Preview the first rows of the loaded predictions.
preview_rows = df_predictions.head()
display(preview_rows)

In [None]:
import pandas as pd
from tqdm import tqdm

# Maps each stenosis label (looked up by name in a predictions row) to the
# vessel name printed in the report. Entry order is significant: the report
# lists vessels in the order they appear here.
labels_to_vessel_names = dict(
    lad_stenosis="Proximal Left Anterior Descending",
    dist_lad_stenosis="Distal Left Anterior Descending",
    mid_lad_stenosis="Mid Left Anterior Descending",
    lcx_stenosis="Proximal Left Circumflex",
    dist_lcx_stenosis="Distal Left Circumflex",
    leftmain_stenosis="Left Main Coronary Artery",
    prox_rca_stenosis="Proximal Right Coronary Artery",
    mid_rca_stenosis="Mid Right Coronary Artery",
    dist_rca_stenosis="Distal Right Coronary Artery",
    posterolateral_stenosis="Posterolateral",
    pda_stenosis="Posterior Descending Artery",
)

# Function to create the report for each row


def create_report(row):
    """Build a single-line text report from one predictions row.

    For every label in ``labels_to_vessel_names`` that is present in ``row``
    and holds a usable value, a ``"<vessel name>: <value>%"`` entry is
    emitted. A ``"Coronary Dominance: <value>"`` entry follows when that
    field is present and not missing.

    Args:
        row: Mapping-like row (e.g. a pandas Series passed by
            ``DataFrame.apply(..., axis=1)``) supporting ``in`` and ``[]``.

    Returns:
        The entries joined with ``"; "``; an empty string when the row
        yields no entries.
    """
    report_lines = []

    # Combine percentages with vessel names for the selected labels.
    for label, vessel_name in labels_to_vessel_names.items():
        if label not in row:
            continue
        percentage = row[label]
        # NaN and the sentinel value -1 are both skipped.
        if pd.notna(percentage) and percentage != -1:
            report_lines.append(f"{vessel_name}: {percentage}%")

    # Add coronary dominance, with the same missing-value guard as the
    # stenosis fields so a NaN never becomes "Coronary Dominance: nan".
    if "coronary_dominance" in row and pd.notna(row["coronary_dominance"]):
        report_lines.append(f"Coronary Dominance: {row['coronary_dominance']}")

    # Add conclusion if available
    # if "Conclusion" in row and pd.notna(row["Conclusion"]):
    #    report_lines.append(f"Conclusion: {row['Conclusion']}")

    # Join all entries into a single string.
    return "; ".join(report_lines)


# Enable tqdm's `progress_apply` on pandas objects, then build one report per
# row (axis=1) and store the result in the 'Report' column.
tqdm.pandas(desc="Generating Reports")
report_series = df_predictions.progress_apply(create_report, axis=1)
df_predictions["Report"] = report_series

In [None]:
import os

output_file_path = "data/reports/reports_with_alpha_separator_no_conclusion.csv"
output_dir = os.path.dirname(output_file_path)

# Create the output directory if it does not exist yet. `exist_ok=True`
# replaces the separate exists() check and its check-then-create race.
os.makedirs(output_dir, exist_ok=True)

# Drop rows where 'External_Exam' is True. `.ne(True)` is the explicit form
# of `!= True`, so rows with a missing flag are kept exactly as before.
df_predictions = df_predictions[df_predictions["External_Exam"].ne(True)]

# Keep only rows that actually carry a report. `create_report` always returns
# a string (possibly empty), so dropna() alone can never remove anything;
# blank reports are filtered as well.
df_non_nan_reports = df_predictions.dropna(subset=["Report"])
has_report_text = df_non_nan_reports["Report"].astype(str).str.strip().ne("")
df_non_nan_reports = df_non_nan_reports[has_report_text]

# Filter the dataframe to keep only rows where 'object_value' is 5 or 9
df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]
df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)

In [None]:
# output_file_path = "data/reports/reports_with_alpha_separator_no_conclusion.csv"
# df_non_nan_reports = pd.read_csv(output_file_path, sep="α")

In [None]:
# Split 70% of patients for train and 30% for validation. The split is made
# on unique 'CathReport_MRN' values, so all rows sharing an MRN land in the
# same split.
unique_patients = df_non_nan_reports["CathReport_MRN"].drop_duplicates()
train_size = int(0.7 * len(unique_patients))
train_patients = unique_patients.sample(n=train_size, random_state=42)
val_patients = unique_patients.drop(train_patients.index)

# Keep only the sampled patients in the dataframe. Take an explicit copy so
# the 'Split' assignment below writes to an independent frame rather than to
# a slice of `df_non_nan_reports` (avoids SettingWithCopyWarning).
df_sampled = df_non_nan_reports[
    df_non_nan_reports["CathReport_MRN"].isin(train_patients)
    | df_non_nan_reports["CathReport_MRN"].isin(val_patients)
].copy()

# Assign split based on CathReport_MRN
df_sampled.loc[df_sampled["CathReport_MRN"].isin(train_patients), "Split"] = "train"
df_sampled.loc[df_sampled["CathReport_MRN"].isin(val_patients), "Split"] = "val"

# Draw at most 40000 rows. Capping at the available row count avoids the
# ValueError pandas raises when asked for more rows than exist, and the fixed
# seed makes the draw reproducible like the patient split above.
# reset_index() (without drop=True) keeps the previous row labels as an
# 'index' column, which is therefore written to the CSV as before.
n_rows_to_keep = min(40000, len(df_sampled))
df_sampled = df_sampled.sample(n=n_rows_to_keep, random_state=42).reset_index()

# Save the dataframe with split information to a new CSV file
output_sampled_file_path = "data/reports/reports_sampled_no_conclusion_40000.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

display(df_sampled["Split"].value_counts())

In [None]:
# Show five randomly chosen generated reports as a spot check.
display(df_sampled["Report"].sample(5))