In [1]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("max_colwidth", None)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    from tabulate import tabulate

    while True:
        print("\n***** Shape: ", df.shape, " *****\n")

        columns_list = df.columns.values.tolist()
        isnull_list = df.isnull().sum().values.tolist()
        isunique_list = df.nunique().values.tolist()
        dtypes_list = df.dtypes.tolist()

        list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
        df_stat_val = pd.DataFrame(list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"])
        print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

  from IPython.core.display import HTML, display


In [2]:
# Load the specified CSV file
csv_file_path = "/media/data1/ravram/DeepCORO/processed_dataframes/ObjectRecon_SWIN3D_2016-2023_inference_predictions_with_df_metadata_and_report.csv"
df_predictions = pd.read_csv(csv_file_path)
print("Column names in df_predictions:")
for col in df_predictions.columns:
    print(f"- {col}")

  df_predictions = pd.read_csv(csv_file_path)


Column names in df_predictions:
- Unnamed: 0.1
- Unnamed: 0
- filename
- y_hat
- object_value
- brand
- sex
- FPS
- NumberOfFrames
- date
- study_time
- series_time
- birthdate
- color_format
- StudyID
- StudyInstanceUID
- SeriesInstanceUID
- dicom_path
- FileName
- uint16_video
- primary_angle
- secondary_angle
- width
- height
- pixel_spacing
- distance_source_to_detector
- distance_source_to_patient
- estimated_radiographic_magnification_factor
- table_motion
- radiation_setting
- image_pixel_spacing
- Split
- CathReport_MRN
- EXAMEN_ID
- Num Accession
- date/heure
- DICOMPath
- AccessionNumber
- ModalitiesInStudy
- Patient_ID
- QueryRetrieveLevel
- StudyDate
- StudyTime
- year
- patient_id_anon
- dicom_id
- Recommendation
- Conclusion
- fps
- frame_time
- series_description
- External_Exam
- angle_value
- Unnamed: 0_y
- coronary_dominance_logit
- coronary_dominance
- D2_stenosis
- D3_stenosis
- RVG1_stenosis
- RVG2_stenosis
- S1_stenosis
- bx_stenosis
- diagonal_stenosis
- dist_lad

In [3]:
display(df_predictions.dist_lad_calcif.value_counts())

dist_lad_calcif
-1                           673331
-1                            34850
Calcifications minimes        27917
Pas de calcification           2206
Calcifications modérées        1554
Calcification importantes      1213
Name: count, dtype: int64

In [4]:
display(df_predictions.lcx_IFRBASAL.value_counts())

lcx_IFRBASAL
-1.00    739858
 1.00       653
 0.00       284
 0.96       219
 0.97        22
 0.92        18
 0.98        16
 0.99         1
Name: count, dtype: int64

In [5]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

##############################################################################
# 1) Vessel Definitions (full list)                                          #
##############################################################################
labels_to_vessel_names = {
    "D2_stenosis": "the D2 branch",
    "D3_stenosis": "the D3 branch",
    "RVG1_stenosis": "the first right ventricular branch (RVG1)",
    "RVG2_stenosis": "the second right ventricular branch (RVG2)",
    "S1_stenosis": "the first septal branch (S1)",
    "bx_stenosis": "the bypass graft (Bx)",
    "diagonal_stenosis": "the diagonal branch",
    "dist_lad_stenosis": "the distal segment of the Left Anterior Descending (LAD) artery",
    "dist_lcx_stenosis": "the distal branch of the Left Circumflex (LCX) artery",
    "dist_rca_stenosis": "the distal portion of the Right Coronary Artery (RCA)",
    "lad_stenosis": "the proximal segment of the Left Anterior Descending (LAD) artery",
    "lcx_stenosis": "the proximal branch of the Left Circumflex (LCX) artery",
    "leftmain_stenosis": "the Left Main Coronary Artery (LMCA)",
    "lima_or_svg_stenosis": "the LIMA or SVG graft",
    "lvp_stenosis": "the left ventricular posterior (LVP) branch",
    "marg_d_stenosis": "the marginal (Marg D) branch",
    "mid_lad_stenosis": "the mid segment of the Left Anterior Descending (LAD) artery",
    "mid_rca_stenosis": "the mid portion of the Right Coronary Artery (RCA)",
    "om1_stenosis": "the first obtuse marginal (OM1) branch",
    "om2_stenosis": "the second obtuse marginal (OM2) branch",
    "om3_stenosis": "the third obtuse marginal (OM3) branch",
    "pda_stenosis": "the posterior descending artery (PDA)",
    "posterolateral_stenosis": "the posterolateral branch",
    "prox_rca_stenosis": "the proximal Right Coronary Artery (RCA)",
}


##############################################################################
# 2) Helper Formatting Functions                                             #
##############################################################################

def format_stenosis_statement(vessel_name: str, percentage: float) -> str:
    """
    Return a natural-language statement about stenosis severity.
    """
    if percentage == 0:
        return f"{vessel_name} appears free of significant stenosis (0%)."
    elif 0 < percentage < 50:
        return f"{vessel_name} shows mild stenosis (~{percentage}%)."
    elif 50 <= percentage < 70:
        return f"{vessel_name} shows moderate stenosis (~{percentage}%)."
    elif 70 <= percentage < 90:
        return f"{vessel_name} shows severe stenosis (~{percentage}%)."
    else:
        return f"{vessel_name} shows critical stenosis (~{percentage}%)."


def format_calcification_statement(vessel_name: str, calcif_str: str) -> str:
    """
    Convert a calcification label into a natural-language sentence.
    """
    text = calcif_str.lower().strip()
    # Adjust these if your categories differ
    if "pas de calcification" in text or "no calcification" in text:
        return f"No calcification is noted in {vessel_name}."
    elif "minimes" in text:
        return f"There are minimal calcifications in {vessel_name}."
    elif "modérées" in text or "moderate" in text:
        return f"There are moderate calcifications in {vessel_name}."
    elif "importantes" in text or "severe" in text:
        return f"There are severe calcifications in {vessel_name}."
    else:
        # fallback if text doesn't match known categories
        return f"Calcifications present in {vessel_name}: '{calcif_str}'."


def format_ifr_statement(vessel_name: str, ifr_value: float) -> str:
    """
    Indicate normal IFR (> 0.89) or abnormal IFR (<= 0.89).
    """
    ifr_str = f"{ifr_value:.2f}"
    if ifr_value > 0.89:
        return f"IFR measurement in {vessel_name} is normal (~{ifr_str})."
    else:
        return f"IFR measurement in {vessel_name} is abnormal (~{ifr_str})."


##############################################################################
# 3) Main Report Function                                                    #
##############################################################################

def create_report(row: pd.Series) -> str:
    """
    Builds a multi-section report for each row, covering:
      - Stenosis, Calcification, IFR (per vessel)
      - Coronary dominance
      - Conclusion
    """
    report_lines = []
    stenosis_list = []
    calcif_list = []
    ifr_list = []

    # For every vessel in the dictionary, check for corresponding columns
    #   prefix_stenosis, prefix_calcif, prefix_IFRBASAL
    for stenosis_label, vessel_name in labels_to_vessel_names.items():
        if stenosis_label not in row:
            continue

        # e.g. prefix = "dist_lad"
        prefix = stenosis_label.replace("_stenosis", "")

        # 1) Stenosis
        st_val = row[stenosis_label]
        if pd.notna(st_val) and st_val != -1:
            stenosis_list.append(format_stenosis_statement(vessel_name, float(st_val)))

        # 2) Calcification
        calcif_label = f"{prefix}_calcif"
        if calcif_label in row:
            calc_val = row[calcif_label]
            # skip if -1 or blank
            if isinstance(calc_val, str) and calc_val.strip() != "-1":
                calcif_list.append(format_calcification_statement(vessel_name, calc_val))

        # 3) IFR
        ifr_label = f"{prefix}_IFRBASAL"
        if ifr_label in row:
            ifr_val = row[ifr_label]
            if pd.notna(ifr_val) and ifr_val != -1:
                ifr_list.append(format_ifr_statement(vessel_name, float(ifr_val)))

    # Add them to report if not empty
    if stenosis_list:
        report_lines.append("Stenosis findings:")
        report_lines.extend(stenosis_list)
        report_lines.append("")

    if calcif_list:
        report_lines.append("Calcification findings:")
        report_lines.extend(calcif_list)
        report_lines.append("")

    if ifr_list:
        report_lines.append("IFR measurements:")
        report_lines.extend(ifr_list)
        report_lines.append("")

    # Coronary dominance
    if "coronary_dominance" in row and pd.notna(row["coronary_dominance"]):
        dom_str = str(row["coronary_dominance"]).replace("_", " ")
        report_lines.append(f"The coronary circulation is {dom_str}.")
        report_lines.append("")


    # If no content at all, return a default
    final_report = "\n".join([line for line in report_lines if line.strip() != ""])
    if not final_report.strip():
        return "No significant findings or additional data available."
    return final_report


##############################################################################
# 4) Usage Example                                                           #
##############################################################################
df_predictions["Report"] = df_predictions.progress_apply(create_report, axis=1)


100%|██████████| 925207/925207 [04:31<00:00, 3410.72it/s]


In [11]:
print(df_predictions["Report"].iloc[25])

Stenosis findings:
the D2 branch appears free of significant stenosis (0%).
the D3 branch appears free of significant stenosis (0%).
the first right ventricular branch (RVG1) appears free of significant stenosis (0%).
the second right ventricular branch (RVG2) appears free of significant stenosis (0%).
the first septal branch (S1) appears free of significant stenosis (0%).
the bypass graft (Bx) appears free of significant stenosis (0%).
the diagonal branch appears free of significant stenosis (0%).
the distal segment of the Left Anterior Descending (LAD) artery appears free of significant stenosis (0%).
the distal branch of the Left Circumflex (LCX) artery appears free of significant stenosis (0%).
the distal portion of the Right Coronary Artery (RCA) appears free of significant stenosis (0%).
the proximal segment of the Left Anterior Descending (LAD) artery appears free of significant stenosis (0%).
the proximal branch of the Left Circumflex (LCX) artery appears free of significant st

In [6]:
import os

output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
output_dir = os.path.dirname(output_file_path)

# Check if the directory exists, if not create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Drop rows where 'External_Exam' is True
df_predictions = df_predictions[df_predictions["External_Exam"] != True]

df_non_nan_reports = df_predictions.dropna(subset=["Report"])
# Filter the dataframe to keep only rows where 'object_value' is 5 or 9
df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]


df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)


In [8]:
output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
df_non_nan_reports = pd.read_csv(output_file_path, sep="α")

  df_non_nan_reports = pd.read_csv(output_file_path, sep="α")


In [9]:
# First, sort the dataframe by 'series_time' and group by 'StudyInstanceUID'
df_sorted = df_non_nan_reports.sort_values(by='series_time')

# Group by 'StudyInstanceUID' and keep the top 5 earliest 'series_time'
df_top5 = df_sorted.groupby('StudyInstanceUID').head(5)

# Define a function to pick 3 rows with object_value == 5 and 2 rows with object_value == 9
def pick_values(group):
    # Filter rows where object_value == 5 and keep 3
    group_5 = group[group['object_value'] == 5].head(3)
    # Filter rows where object_value == 9 and keep 2
    group_9 = group[group['object_value'] == 9].head(2)
    # Concatenate the results
    return pd.concat([group_5, group_9])

# Apply the function to each group
df_final = df_top5.groupby('StudyInstanceUID').apply(pick_values).reset_index(drop=True)

  df_final = df_top5.groupby('StudyInstanceUID').apply(pick_values).reset_index(drop=True)


In [10]:
display(df_final.object_value.value_counts())

object_value
5    88883
9    39333
Name: count, dtype: int64

In [12]:
# Split 70% of patients for train and 30% for validation
unique_patients = df_non_nan_reports["CathReport_MRN"].drop_duplicates()
train_size = int(0.9 * len(unique_patients))
train_patients = unique_patients.sample(n=train_size, random_state=42)
val_patients = unique_patients.drop(train_patients.index)

# Keep only the sampled patients in the dataframe
df_sampled = df_non_nan_reports[
    df_non_nan_reports["CathReport_MRN"].isin(train_patients)
    | df_non_nan_reports["CathReport_MRN"].isin(val_patients)
]

# Sample 300 unique StudyInstanceUID from the already split dataset
#unique_study_ids = df_sampled["StudyInstanceUID"].drop_duplicates().sample(n=300, random_state=42)

# Keep only the sampled StudyInstanceUIDs in the dataframe
#df_sampled = df_sampled[df_sampled["StudyInstanceUID"].isin(unique_study_ids)]

# Assign split based on CathReport_MRN
df_sampled.loc[df_sampled["CathReport_MRN"].isin(train_patients), "Split"] = "train"
df_sampled.loc[df_sampled["CathReport_MRN"].isin(val_patients), "Split"] = "val"

# Save the dataframe with the sampled StudyInstanceUIDs to a new CSV file
output_sampled_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

display(df_sampled.Split.value_counts())

Split
train    311970
val       35529
Name: count, dtype: int64

In [None]:
output_sampled_file_path = pd.read_csv(
    "data/reports/reports_sampled_no_conclusion.csv", sep="α"
)
df_sampled = output_sampled_file_path.sample(96).reset_index()
# Save the dataframe with split information to a new CSV file
output_sampled_file_path = "data/reports/reports_sampled_no_conclusion_96.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

## Example tokenization

In [None]:
df_sampled = pd.read_csv('data/reports/reports_with_alpha_separator_with_conclusion_and_more_details_20250108.csv', sep='α')

In [3]:
from transformers import AutoTokenizer, AutoModel

# Load PubMedBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Get a sample text from the dataframe
sample_text = df_sampled['Report'].iloc[0]

# Encode the text
encoded = tokenizer(
    sample_text,
    padding="max_length",
    max_length=512, 
    truncation=True,
    return_tensors="pt"
)

# Decode back to text to verify
decoded = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)


In [None]:
# Print full texts without truncation
pd.set_option('display.max_colwidth', None)
print("\nOriginal text (full):")
print(sample_text)
print("\nDecoded text (full):")
print(decoded)
print("\nEncoded tokens:")


