In [1]:
import numpy as np
import pandas as pd

# Show complete frames in cell output: no row/column truncation, no line
# wrapping, and no clipping of long cell values.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("max_colwidth", None)
# `HTML` and `display` are imported from the public IPython.display module;
# importing them from IPython.core.display is deprecated.
from IPython.display import HTML, display

# Inject CSS that sets the notebook's `.container` element to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    The summary shows the frame's shape followed by one line per column with
    the column name, the number of null values, the number of distinct values
    and the dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``.
    """
    # tabulate is only used for pretty-printing; fall back to the plain pandas
    # representation when it is not installed.
    try:
        from tabulate import tabulate
    except ImportError:
        tabulate = None

    # The summary is printed exactly once (an unconditional loop here would
    # never reach the return statement).
    print("\n***** Shape: ", df.shape, " *****\n")

    columns_list = df.columns.values.tolist()
    isnull_list = df.isnull().sum().values.tolist()
    isunique_list = df.nunique().values.tolist()
    dtypes_list = df.dtypes.tolist()

    list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
    df_stat_val = pd.DataFrame(list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"])
    if tabulate is not None:
        print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    else:
        print(df_stat_val)
    return df.head()

  from IPython.core.display import HTML, display


In [2]:
# Read the inference-predictions CSV (predictions joined with metadata and
# report columns, per the file name) into a DataFrame.
csv_file_path = "/media/data1/ravram/DeepCORO/processed_dataframes/ObjectRecon_SWIN3D_2016-2023_inference_predictions_with_df_metadata_and_report.csv"
df_predictions = pd.read_csv(filepath_or_buffer=csv_file_path)

  df_predictions = pd.read_csv(csv_file_path)


In [3]:
# Peek at the first value of the "FileName" column.
display(df_predictions["FileName"].head(1))

0    /media/data1/ravram/MHI_CATH_DICOM_VIDEOS/2022/2.16.124.113611.1.118.1.1.5994023_1.3.12.2.1107.5.4.5.135214.30000022010107492025500000000.dcm.avi
Name: FileName, dtype: object

In [None]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

##############################################################################
# 1) Base Vessel Definitions & Order                                         #
##############################################################################
# Maps each "<segment>_stenosis" column label to the phrase used for that
# segment in the generated report text. The insertion order of this dict is
# also the order in which segments are reported.
labels_to_vessel_names = dict(
    [
        ("leftmain_stenosis", "the Left Main Coronary Artery (LMCA)"),
        ("lad_stenosis", "the proximal LAD"),
        ("mid_lad_stenosis", "the mid LAD"),
        ("dist_lad_stenosis", "the distal LAD"),
        ("D1_stenosis", "D1 branch"),
        ("D2_stenosis", "D2 branch"),
        ("D3_stenosis", "D3 branch"),
        ("lcx_stenosis", "the proximal LCX"),
        ("dist_lcx_stenosis", "the distal LCX"),
        ("lvp_stenosis", "the LVP branch"),
        ("marg_d_stenosis", "the marginal (Marg D) branch"),
        ("om1_stenosis", "OM1"),
        ("om2_stenosis", "OM2"),
        ("om3_stenosis", "OM3"),
        ("prox_rca_stenosis", "the proximal RCA"),
        ("mid_rca_stenosis", "the mid RCA"),
        ("dist_rca_stenosis", "the distal RCA"),
        ("RVG1_stenosis", "RVG1"),
        ("RVG2_stenosis", "RVG2"),
        ("pda_stenosis", "the PDA"),
        ("posterolateral_stenosis", "the posterolateral branch"),
        ("bx_stenosis", "Ramus"),
        ("lima_or_svg_stenosis", "the LIMA or SVG graft"),
    ]
)

# Reporting order: identical to the key order of the mapping above, kept as a
# separate list so it can be copied and filtered per row.
vessel_order = list(labels_to_vessel_names)

##############################################################################
# 2) Short Formatting Helpers                                                #
##############################################################################

def format_stenosis_value(percent: float) -> str:
    """Turn a stenosis percentage into a short severity phrase.

    Exactly 0 yields "no significant stenosis". Positive values are graded
    mild (< 50), moderate (< 70) or severe (< 90); everything else is labelled
    critical. Note that "everything else" also covers negative and NaN inputs,
    so callers are expected to filter those out beforehand.
    """
    if percent == 0:
        return "no significant stenosis"

    # Upper bounds (exclusive) of each grade, checked in ascending order.
    grade_bounds = ((50, "mild"), (70, "moderate"), (90, "severe"))

    grade = "critical"
    if percent > 0:
        for upper_bound, name in grade_bounds:
            if percent < upper_bound:
                grade = name
                break
    return f"{grade} stenosis (~{percent}%)"

def format_calcification_value(calcif: str) -> str:
    """Normalise a free-text calcification description (French or English).

    The text is matched case-insensitively against keyword groups, in order:
    absence, minimal, moderate, severe. The first group with a keyword present
    wins; if none matches, the original text is echoed back in quotes.
    """
    lowered = calcif.lower()
    keyword_table = (
        (("no calcification", "pas de calcification"), "no calcifications"),
        (("minimes", "mild"), "minimal calcifications"),
        (("modérées", "moderate"), "moderate calcifications"),
        (("importantes", "severe"), "severe calcifications"),
    )
    for keywords, phrase in keyword_table:
        if any(keyword in lowered for keyword in keywords):
            return phrase
    return f"calcifications: '{calcif}'"

def format_ifr_value(ifr: float) -> str:
    """Describe an IFR measurement as normal (> 0.89) or abnormal.

    The value is shown rounded to two decimals; the comparison uses the
    unrounded input.
    """
    shown = format(ifr, ".2f")
    status = "normal" if ifr > 0.89 else "abnormal"
    return f"IFR {status} (~{shown})"

##############################################################################
# 3) Main Report Function with Custom Rules                                  #
##############################################################################

def _text_or_empty(value) -> str:
    """Return ``value`` as a stripped string, or "" when it is missing (None/NaN)."""
    if isinstance(value, str):
        return value.strip()
    if not pd.notna(value):
        return ""
    return str(value).strip()


def _measurement_or_none(value):
    """Return ``value`` as a float, or None when it is missing, non-numeric or the -1 sentinel."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    if not pd.notna(number) or number == -1:
        return None
    return number


def _join_descriptors(desc) -> str:
    """Join descriptors as "a", "a, and b" or "a, b, and c"."""
    if len(desc) == 1:
        return desc[0]
    return ", ".join(desc[:-1]) + ", and " + desc[-1]


def create_report(row: pd.Series) -> str:
    """
    Builds a single short report per row using:
    - Coronary dominance rules
    - Graft presence rules
    - Combined line per vessel (stenosis, calcif, IFR)

    Columns read from ``row`` (all optional): ``coronary_dominance``,
    ``Conclusion``, ``bypass_graft`` and, for every label in ``vessel_order``,
    ``<prefix>_stenosis``, ``<prefix>_calcif`` and ``<prefix>_IFRHYPEREMIE``.
    A value of -1 (or "-1" for calcification) and missing values mean
    "not available" and are skipped.

    Missing dominance/conclusion cells (NaN) are treated as empty text so that
    they never show up as the literal string "nan" in the report.

    Returns the report lines joined by newlines, or a fixed default sentence
    when there is nothing to report.
    """
    # 1) Determine dominance and graft presence
    dom_raw = _text_or_empty(row.get("coronary_dominance", ""))
    dom_lower = dom_raw.lower()
    # Only show graft vessels if 'pontage' in Conclusion (case-insensitive)
    # or bypass_graft == 1
    conclusion_text = _text_or_empty(row.get("Conclusion", "")).lower()
    has_graft = ("pontage" in conclusion_text) or (row.get("bypass_graft", 0) == 1)

    # 2) Decide which vessels are reported for this row
    skipped = set()
    if "right" in dom_lower:
        # Right dominant: lvp & marg_d are not reported
        skipped.update(("lvp_stenosis", "marg_d_stenosis"))
    if not has_graft:
        # No graft: the graft entry is not reported
        skipped.add("lima_or_svg_stenosis")
    local_order = [label for label in vessel_order if label not in skipped]

    # Left dominant: PDA and posterolateral are named as LEFT vessels;
    # otherwise the default names are kept.
    vessel_dict = labels_to_vessel_names.copy()
    if "left" in dom_lower:
        vessel_dict["pda_stenosis"] = "the LEFT PDA"
        vessel_dict["posterolateral_stenosis"] = "the LEFT posterolateral branch"

    # 3) Build each vessel's text
    lines = []
    for stenosis_label in local_order:
        prefix = stenosis_label.replace("_stenosis", "")
        vname = vessel_dict.get(stenosis_label, stenosis_label)

        desc = []
        # Stenosis (numeric; converted before the sentinel check so that
        # values read as strings, e.g. "-1", are handled like numbers)
        st_val = _measurement_or_none(row.get(stenosis_label, -1))
        if st_val is not None:
            desc.append(format_stenosis_value(st_val))
        # Calcification (free text)
        calc_val = row.get(prefix + "_calcif", "-1")
        if isinstance(calc_val, str) and calc_val.strip() != "-1":
            desc.append(format_calcification_value(calc_val))
        # IFR (numeric)
        ifr_val = _measurement_or_none(row.get(prefix + "_IFRHYPEREMIE", -1))
        if ifr_val is not None:
            desc.append(format_ifr_value(ifr_val))

        # One short sentence per vessel that has at least one descriptor
        if desc:
            lines.append(f"{vname} has {_join_descriptors(desc)}.")

    # 4) Add coronary dominance if not empty
    if dom_raw:
        lines.append(f"The coronary circulation is {dom_raw}.")

    # Return final or default
    final_report = "\n".join(lines)
    if not final_report.strip():
        return "No significant findings or additional data available."
    return final_report

##############################################################################
# 4) Example Usage                                                           #
##############################################################################
# Build the report text for every row (axis=1) and store it in a new "Report"
# column. progress_apply is the progress-bar variant of DataFrame.apply that
# becomes available after tqdm.pandas() has been called.
df_predictions["Report"] = df_predictions.progress_apply(create_report, axis=1)

  0%|          | 0/925207 [00:00<?, ?it/s]

In [4]:
import os

output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
output_dir = os.path.dirname(output_file_path)

# Create the output directory if it is missing; exist_ok=True makes this a
# single call instead of a separate existence check followed by a create.
os.makedirs(output_dir, exist_ok=True)

# Drop rows where 'External_Exam' is True
df_predictions = df_predictions[df_predictions["External_Exam"] != True]

# Keep rows that have a report ...
df_non_nan_reports = df_predictions.dropna(subset=["Report"])
# ... and whose 'object_value' is 5 or 9
df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]

# Written with "α" as the field separator
df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)


In [5]:
# Character length of each of the 13 most frequent report texts
top_13_reports = df_non_nan_reports["Report"].value_counts().iloc[:13]
for report in top_13_reports.index:
    print(f"Length: {len(report)} characters")

Length: 833 characters
Length: 832 characters
Length: 944 characters
Length: 832 characters
Length: 832 characters
Length: 832 characters
Length: 831 characters
Length: 831 characters
Length: 832 characters
Length: 831 characters
Length: 831 characters
Length: 831 characters
Length: 831 characters


In [6]:
output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
# "α" takes more than one byte in UTF-8, which the default C parser does not
# support as a separator; pandas would fall back to the Python engine with a
# warning, so the engine is selected explicitly.
df_non_nan_reports = pd.read_csv(output_file_path, sep="α", engine="python")

  df_non_nan_reports = pd.read_csv(output_file_path, sep="α")


In [7]:
# Coerce 'series_time' to numeric (unparseable values become NaN) and sort by it
df_sorted = df_non_nan_reports.assign(
    series_time=pd.to_numeric(df_non_nan_reports.series_time, errors="coerce")
).sort_values(by="series_time")

# For each 'StudyInstanceUID', keep the 5 rows with the earliest 'series_time'
df_top5 = df_sorted.groupby("StudyInstanceUID").head(5)


def pick_values(group):
    """Return up to 3 rows with object_value == 5 followed by up to 2 rows with object_value == 9.

    Rows are taken in the order in which they appear in ``group``.
    """
    group_5 = group[group["object_value"] == 5].head(3)
    group_9 = group[group["object_value"] == 9].head(2)
    return pd.concat([group_5, group_9])


# Apply the function to each study. The groups are iterated explicitly instead
# of using groupby(...).apply(...), because apply operating on a frame that
# still contains the grouping column is deprecated in recent pandas releases.
# Groups are visited in sorted key order, so the row order matches apply's.
picked_per_study = [
    pick_values(study_rows) for _, study_rows in df_top5.groupby("StudyInstanceUID")
]
if picked_per_study:
    df_final = pd.concat(picked_per_study).reset_index(drop=True)
else:
    # pd.concat rejects an empty list; keep an empty frame with the same columns
    df_final = df_top5.iloc[0:0].reset_index(drop=True)

  df_final = df_top5.groupby('StudyInstanceUID').apply(pick_values).reset_index(drop=True)


In [8]:
# Number of selected rows per 'object_value'
display(df_final["object_value"].value_counts())

object_value
5    88883
9    39333
Name: count, dtype: int64

In [9]:
# Patient-level split: 90% of the unique patients go to train, the remaining
# 10% to validation, so that a patient never appears in both splits.
unique_patients = df_final["CathReport_MRN"].drop_duplicates()
train_size = int(0.9 * len(unique_patients))
train_patients = unique_patients.sample(n=train_size, random_state=42)
val_patients = unique_patients.drop(train_patients.index)

# Keep only rows whose patient was assigned to one of the two splits.
# .copy() makes df_sampled an independent frame, so adding the "Split" column
# below does not write into a slice of df_final.
df_sampled = df_final[
    df_final["CathReport_MRN"].isin(train_patients)
    | df_final["CathReport_MRN"].isin(val_patients)
].copy()

# Assign split based on CathReport_MRN
df_sampled.loc[df_sampled["CathReport_MRN"].isin(train_patients), "Split"] = "train"
df_sampled.loc[df_sampled["CathReport_MRN"].isin(val_patients), "Split"] = "val"

# Save the dataframe with the split assignment.
# NOTE(review): this is the same path that the filtered reports were written to
# and re-read from earlier in this notebook, so this call overwrites that file
# and a re-run of the earlier cells will read this output instead. Confirm that
# this is intended before changing the path (downstream code may rely on it).
output_sampled_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

display(df_sampled.Split.value_counts())

Split
train    115365
val       12851
Name: count, dtype: int64

In [None]:
# Read the source reports into their own variable (the path variable below is
# reserved for the output path). The Python engine is selected explicitly
# because "α" is not a single byte in UTF-8.
df_no_conclusion = pd.read_csv(
    "data/reports/reports_sampled_no_conclusion.csv", sep="α", engine="python"
)
# Draw 96 rows with a fixed seed so the subset is reproducible. reset_index()
# without drop=True keeps the previous row labels as an "index" column, which
# is therefore also written to the output file.
df_sampled = df_no_conclusion.sample(96, random_state=42).reset_index()
# Save the 96-row sample to a new CSV file
output_sampled_file_path = "data/reports/reports_sampled_no_conclusion_96.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

## Example tokenization

In [None]:
# "α" is not a single byte in UTF-8, so the Python parsing engine is requested
# explicitly instead of relying on pandas' fallback (which emits a warning).
df_sampled = pd.read_csv('data/reports/reports_with_alpha_separator_with_conclusion_and_more_details_20250108.csv', sep='α', engine='python')

In [3]:
from transformers import AutoTokenizer, AutoModel

# Load the PubMedBERT tokenizer by its model identifier
tokenizer = AutoTokenizer.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
)

# The first report in the frame serves as the example input
sample_text = df_sampled["Report"].iloc[0]

# Tokenize into PyTorch tensors of length 512: shorter texts are padded to
# max_length, longer ones are truncated.
tokenize_options = dict(
    padding="max_length",
    max_length=512,
    truncation=True,
    return_tensors="pt",
)
encoded = tokenizer(sample_text, **tokenize_options)

# Round-trip the ids back to text (special tokens removed) to verify
decoded = tokenizer.decode(encoded["input_ids"][0], skip_special_tokens=True)


In [None]:
# Print full texts without truncation
pd.set_option('display.max_colwidth', None)
print("\nOriginal text (full):")
print(sample_text)
print("\nDecoded text (full):")
print(decoded)
print("\nEncoded tokens:")
# Show the token ids of the example so the header above is followed by content
print(encoded["input_ids"][0])


