In [None]:
import numpy as np
import pandas as pd

# Notebook display configuration: show every row and column, do not wrap wide
# frames, and do not clip long cell text.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
# Use the public IPython.display module: importing `display` from
# IPython.core.display is deprecated and fails on recent IPython releases.
from IPython.display import HTML, display

# Inject CSS so the notebook's `.container` element spans the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    The summary lists, for every column, its name, the number of null values,
    the number of unique values and its dtype, rendered as a ``psql``-style
    table. The shape of the frame is printed first.

    Args:
        df: The pandas DataFrame to summarise.

    Returns:
        ``df.head()``, i.e. the first rows of the input frame.
    """
    # Imported locally so the notebook only needs `tabulate` when this helper
    # is actually called.
    from tabulate import tabulate

    # The summary is produced exactly once; the previous `while True:` wrapper
    # had no exit, so the function never reached its return statement.
    print("\n***** Shape: ", df.shape, " *****\n")

    df_stat_val = pd.DataFrame(
        {
            "Name": df.columns.values.tolist(),
            "Null": df.isnull().sum().values.tolist(),
            "Unique": df.nunique().values.tolist(),
            "Dtypes": df.dtypes.tolist(),
        }
    )
    print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

In [None]:
# Read the predictions CSV into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path — consider deriving
# it from a configurable data directory.
csv_file_path = "/media/data1/ravram/DeepCORO/processed_dataframes/ObjectRecon_SWIN3D_2016-2023_inference_predictions_with_df_metadata_and_report.csv"
df_predictions = pd.read_csv(filepath_or_buffer=csv_file_path)
display(df_predictions.head(n=5))

In [None]:
import pandas as pd
from tqdm import tqdm

# Mapping from prediction column labels to the vessel names printed in reports.
# create_report iterates this dict in insertion order, so the order of the
# entries below is also the order of the vessel lines in each report.
labels_to_vessel_names = {
    "lad": "Left Anterior Descending",
    "dist_lad": "Distal Left Anterior Descending",
    "mid_lad": "Mid Left Anterior Descending",
    "lcx": "Left Circumflex",
    "dist_lcx": "Distal Left Circumflex",
    "leftmain": "Left Main Coronary Artery",
    "prox_rca": "Proximal Right Coronary Artery",
    "mid_rca": "Mid Right Coronary Artery",
    "dist_rca": "Distal Right Coronary Artery",
    "posterolateral": "Posterolateral",
    "pda": "Posterior Descending Artery",
}

# Function to create the report for each row


def create_report(row):
    """Build the multi-line text report for one row of the predictions frame.

    The report contains, in order:
      * one ``"<vessel name>: <value>%"`` line per label in
        ``labels_to_vessel_names`` that is present in ``row``, is not missing
        and is not the ``-1`` sentinel;
      * ``"Coronary Dominance: ..."``, ``"Conclusion: ..."`` and
        ``"Recommendation: ..."`` lines for the corresponding columns when they
        are present and not missing.

    Args:
        row: A pandas Series (one DataFrame row) indexed by column name.

    Returns:
        The report lines joined with newlines; an empty string when no line
        applies.
    """
    report_lines = []

    # Vessel lines: skip labels absent from the row, missing values and -1.
    for label, vessel_name in labels_to_vessel_names.items():
        if label not in row:
            continue
        percentage = row[label]
        if pd.notna(percentage) and percentage != -1:
            report_lines.append(f"{vessel_name}: {percentage}%")

    # Free-text fields: apply the same missing-value filter as the vessel
    # lines so the report never contains lines such as "Conclusion: nan".
    text_fields = (
        ("coronary_dominance", "Coronary Dominance"),
        ("Conclusion", "Conclusion"),
        ("Recommendation", "Recommendation"),
    )
    for column, heading in text_fields:
        if column in row and pd.notna(row[column]):
            report_lines.append(f"{heading}: {row[column]}")

    return "\n".join(report_lines)


# Enable tqdm's pandas hooks (this is what provides `progress_apply`), then
# build one report string per row and store it in the 'Report' column.
tqdm.pandas(desc="Generating Reports")
df_predictions["Report"] = df_predictions.progress_apply(
    create_report,
    axis=1,
)

In [None]:
import os

output_file_path = "data/reports/reports_with_alpha_separator.csv"
output_dir = os.path.dirname(output_file_path)

# Create the output directory if needed. `exist_ok=True` replaces the separate
# existence check, so there is no window between checking and creating.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Only generate the file once; an existing file is left untouched.
if not os.path.exists(output_file_path):
    # Drop rows with a missing report, conclusion or recommendation.
    df_non_nan_reports = df_predictions.dropna(subset=["Report", "Conclusion", "Recommendation"])
    # Filter the dataframe to keep only rows where 'object_value' is 5 or 9
    df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]

    # Preview up to 10 random reports; the cap keeps `sample` from raising
    # when fewer than 10 rows survive the filters.
    preview_size = min(10, len(df_non_nan_reports))
    display(df_non_nan_reports.Report.sample(preview_size))
    # Save the whole filtered dataframe (all columns) with 'α' as the separator
    df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)
else:
    print(f"File {output_file_path} already exists. Skipping report generation.")

In [None]:
# Read the saved reports back from disk using the same 'α' separator they were
# written with.
# NOTE(review): engine="python" is presumably required because of the
# non-ASCII separator — confirm before changing it.
output_file_path = "data/reports/reports_with_alpha_separator.csv"
df = pd.read_csv(
    filepath_or_buffer=output_file_path,
    sep="α",
    engine="python",
)

In [None]:
# Sample up to 1000 rows with a fixed seed. The size is capped at the number of
# available rows because `DataFrame.sample` raises when asked for more rows
# than the frame holds (without replacement).
sample_size = min(1000, len(df))
df_sampled = df.sample(n=sample_size, random_state=42)

# Add a new column 'Split' with the value 'train' for every sampled row
df_sampled["Split"] = "train"

# Save the sampled dataframe to a new CSV file, using the same 'α' separator
output_sampled_file_path = "data/reports/reports_sampled_1000.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

# Show how many rows ended up in each split.
display(df_sampled.Split.value_counts())