In [3]:
import numpy as np
import pandas as pd

# pd.set_option('display.height', 1000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("max_colwidth", None)
from IPython.core.display import HTML, display

display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    from tabulate import tabulate

    while True:
        print("\n***** Shape: ", df.shape, " *****\n")

        columns_list = df.columns.values.tolist()
        isnull_list = df.isnull().sum().values.tolist()
        isunique_list = df.nunique().values.tolist()
        dtypes_list = df.dtypes.tolist()

        list_stat_val = list(zip(columns_list, isnull_list, isunique_list, dtypes_list))
        df_stat_val = pd.DataFrame(list_stat_val, columns=["Name", "Null", "Unique", "Dtypes"])
        print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))
    return df.head()

  from IPython.core.display import HTML, display


In [4]:
# Load the specified CSV file
csv_file_path = "/media/data1/ravram/DeepCORO/processed_dataframes/ObjectRecon_SWIN3D_2016-2024_inference_predictions_with_df_metadata_and_report.csv"
df_predictions = pd.read_csv(csv_file_path)

  df_predictions = pd.read_csv(csv_file_path)


In [7]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

##############################################################################
# 1) Base Vessel Definitions & Order                                         #
##############################################################################
labels_to_vessel_names = {
    "leftmain_stenosis": "the Left Main Coronary Artery (LMCA)",
    "lad_stenosis": "the proximal LAD",
    "mid_lad_stenosis": "the mid LAD", 
    "dist_lad_stenosis": "the distal LAD",
    "D1_stenosis": "D1 branch",
    "D2_stenosis": "D2 branch",
    "lcx_stenosis": "the proximal LCX",
    "dist_lcx_stenosis": "the distal LCX",
    "om1_stenosis": "OM1",
    "om2_stenosis": "OM2",
    "prox_rca_stenosis": "the proximal RCA",
    "mid_rca_stenosis": "the mid RCA",
    "dist_rca_stenosis": "the distal RCA",
    "pda_stenosis": "the PDA",
    "posterolateral_stenosis": "the posterolateral branch",
    "bx_stenosis": "Ramus",
    "lima_or_svg_stenosis": "the LIMA or SVG graft",
}

vessel_order = [
    "leftmain_stenosis",
    "lad_stenosis", 
    "mid_lad_stenosis",
    "dist_lad_stenosis",
    "D1_stenosis",
    "D2_stenosis",
    "lcx_stenosis",
    "dist_lcx_stenosis",
    "om1_stenosis",
    "om2_stenosis", 
    "prox_rca_stenosis",
    "mid_rca_stenosis",
    "dist_rca_stenosis",
    "pda_stenosis",
    "posterolateral_stenosis",
    "bx_stenosis",
    "lima_or_svg_stenosis",
]

##############################################################################
# 2) Short Formatting Helpers                                                #
##############################################################################

def format_stenosis_value(percent: float) -> str:
    if percent == 0:
        return "no significant stenosis"
    elif 0 < percent < 50:
        return f"mild stenosis (~{percent}%)"
    elif 50 <= percent < 70:
        return f"moderate stenosis (~{percent}%)"
    elif 70 <= percent < 90:
        return f"severe stenosis (~{percent}%)"
    else:
        return f"critical stenosis (~{percent}%)"

def format_calcification_value(calcif: str) -> str:
    txt = calcif.lower()
    if "no calcification" in txt or "pas de calcification" in txt:
        return "no calcifications"
    elif "minimes" in txt or "mild" in txt:
        return "minimal calcifications"
    elif "modérées" in txt or "moderate" in txt:
        return "moderate calcifications"
    elif "importantes" in txt or "severe" in txt:
        return "severe calcifications"
    return f"calcifications: '{calcif}'"

def format_ifr_value(ifr: float) -> str:
    ifr_str = f"{ifr:.2f}"
    if ifr > 0.89:
        return f"IFR normal (~{ifr_str})"
    return f"IFR abnormal (~{ifr_str})"

##############################################################################
# 3) Main Report Function with Custom Rules                                  #
##############################################################################

def create_report(row: pd.Series) -> str:
    """
    Builds a single short report per row using:
    - Coronary dominance rules
    - Graft presence rules
    - Combined line per vessel (stenosis, calcif, IFR)
    """
    # 1) Determine dominance and graft presence
    dom_raw = str(row.get("coronary_dominance", ""))
    dom_lower = dom_raw.lower()
    # Only show graft vessels if 'pontage' in Conclusion (case-insensitive)
    # or bypass_graft == 1
    conclusion_text = str(row.get("Conclusion", "")).lower()
    has_graft = ("pontage" in conclusion_text) or (row.get("bypass_graft", 0) == 1)

    # 2) Build a local copy of the vessel order we will actually iterate over
    local_order = vessel_order[:]

    # If not Left Dominant, keep default naming;
    # if left dominant, rename 'pda_stenosis' and 'posterolateral_stenosis'
    # to "LEFT PDA" and "LEFT posterolateral"
    vessel_dict = labels_to_vessel_names.copy()
    if "left" in dom_lower:
        vessel_dict["pda_stenosis"] = "the LEFT PDA"
        vessel_dict["posterolateral_stenosis"] = "the LEFT posterolateral branch"

    # If no graft presence, remove lima_or_svg_stenosis
    if not has_graft:
        if "lima_or_svg_stenosis" in local_order:
            local_order.remove("lima_or_svg_stenosis")

    # 3) Build each vessel's text
    lines = []
    for stenosis_label in local_order:
        prefix = stenosis_label.replace("_stenosis", "")
        vname = vessel_dict.get(stenosis_label, stenosis_label)

        # Gather info from columns
        desc = []
        # Stenosis
        st_val = row.get(stenosis_label, -1)
        if pd.notna(st_val) and st_val != -1:
            desc.append(format_stenosis_value(float(st_val)))
        # Calcif
        calc_label = prefix + "_calcif"
        calc_val = row.get(calc_label, "-1")
        if isinstance(calc_val, str) and calc_val.strip() != "-1":
            desc.append(format_calcification_value(calc_val))
        # IFR
        ifr_label = prefix + "_IFRHYPEREMIE"
        ifr_val = row.get(ifr_label, -1)
        if pd.notna(ifr_val) and ifr_val != -1:
            desc.append(format_ifr_value(float(ifr_val)))

        # If we got any descriptors, combine in one short sentence
        if desc:
            # If multiple descriptors, separate by commas, last with 'and'
            if len(desc) == 1:
                combined = desc[0]
            else:
                combined = ", ".join(desc[:-1]) + ", and " + desc[-1]
            lines.append(f"{vname} has {combined}.")

    # 4) Add coronary dominance if not empty
    if dom_raw:
        lines.append(f"The coronary circulation is {dom_raw}.")

    # Return final or default
    final_report = "\n".join(lines)
    if not final_report.strip():
        return "No significant findings or additional data available."
    return final_report

##############################################################################
# 4) Example Usage                                                           #
##############################################################################
df_predictions["Report"] = df_predictions.progress_apply(create_report, axis=1)

100%|██████████| 1050891/1050891 [02:14<00:00, 7837.93it/s]


In [19]:
# Create bypass_graft column based on whether Conclusion contains "pontage" (case-insensitive)
df_predictions['bypass_graft'] = df_predictions['Conclusion'].str.contains('pontage', case=False, na=False).astype(int)
display(df_predictions.bypass_graft.value_counts())

bypass_graft
0    945404
1    105487
Name: count, dtype: int64

In [23]:
df_predictions.object_value.value_counts()

object_value
5     242135
10    241096
9     115242
1      53053
6      42933
2      23966
4       6377
0       4920
7       3875
8       2977
3       2187
Name: count, dtype: int64

In [20]:
import os

output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
output_dir = os.path.dirname(output_file_path)

# Check if the directory exists, if not create it
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
# Drop rows where 'External_Exam' is True
df_predictions = df_predictions[df_predictions["External_Exam"] != True]
df_predictions = df_predictions[df_predictions["bypass_graft"] != 1]

df_non_nan_reports = df_predictions.dropna(subset=["Report"])
# Filter the dataframe to keep only rows where 'object_value' is 5 or 9
df_non_nan_reports = df_non_nan_reports[df_non_nan_reports["object_value"].isin([5, 9])]


df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)


In [24]:
# Get top 13 reports and print their lengths
top_13_reports = df_non_nan_reports.Report.value_counts().head(13)
for report in top_13_reports.index:
    print(f"Length: {len(report)} characters")

Length: 693 characters
Length: 682 characters
Length: 692 characters
Length: 702 characters
Length: 692 characters
Length: 692 characters
Length: 692 characters
Length: 691 characters
Length: 691 characters
Length: 692 characters
Length: 691 characters
Length: 691 characters
Length: 691 characters


In [26]:
display(df_non_nan_reports.head(n=1))

Unnamed: 0,y_hat,object_value,ManufacturerModelName,sex,FPS,NumberOfFrames,date,StudyTime,SeriesTime,PatientBirthDate,PhotometricInterpretation,StudyInstanceUID,SeriesInstanceUID,DICOMPath,FileName,uint16_video,primary_angle,secondary_angle,Columns,Rows,pixel_spacing,DistanceSourceToDetector,DistanceSourceToPatient,EstimatedRadiographicMagnificationFactor,TableMotion,RadiationSetting,ImagerPixelSpacing,Split,CathReport_MRN,EXAMEN_ID,Num Accession,date/heure,Modality,Patient_ID,QueryRetrieveLevel,DATEEXAMEN,year,SOPInstanceUID,Recommendation,Conclusion,FrameTime,SeriesDescription,External_Exam,angle_value,coronary_dominance_logit,coronary_dominance,D2_stenosis,D3_stenosis,RVG1_stenosis,RVG2_stenosis,S1_stenosis,bx_stenosis,diagonal_stenosis,dist_lad_stenosis,dist_lcx_stenosis,dist_rca_stenosis,lad_stenosis,lcx_stenosis,leftmain_stenosis,lima_or_svg_stenosis,lvp_stenosis,marg_d_stenosis,mid_lad_stenosis,mid_rca_stenosis,om1_stenosis,om2_stenosis,om3_stenosis,pda_stenosis,posterolateral_stenosis,prox_rca_stenosis,D2_IFRHYPEREMIE,D3_IFRHYPEREMIE,RVG1_IFRHYPEREMIE,RVG2_IFRHYPEREMIE,S1_IFRHYPEREMIE,bx_IFRHYPEREMIE,diagonal_IFRHYPEREMIE,dist_lad_IFRHYPEREMIE,dist_lcx_IFRHYPEREMIE,dist_rca_IFRHYPEREMIE,lad_IFRHYPEREMIE,lcx_IFRHYPEREMIE,leftmain_IFRHYPEREMIE,lima_or_svg_IFRHYPEREMIE,lvp_IFRHYPEREMIE,marg_d_IFRHYPEREMIE,mid_lad_IFRHYPEREMIE,mid_rca_IFRHYPEREMIE,om1_IFRHYPEREMIE,om2_IFRHYPEREMIE,om3_IFRHYPEREMIE,pda_IFRHYPEREMIE,posterolateral_IFRHYPEREMIE,prox_rca_IFRHYPEREMIE,D2_calcif,D3_calcif,RVG1_calcif,RVG2_calcif,S1_calcif,bx_calcif,diagonal_calcif,dist_lad_calcif,dist_lcx_calcif,dist_rca_calcif,lad_calcif,lcx_calcif,leftmain_calcif,lima_or_svg_calcif,lvp_calcif,marg_d_calcif,mid_lad_calcif,mid_rca_calcif,om1_calcif,om2_calcif,om3_calcif,pda_calcif,posterolateral_calcif,prox_rca_calcif,D2_IFRBASAL,D3_IFRBASAL,RVG1_IFRBASAL,RVG2_IFRBASAL,S1_IFRBASAL,bx_IFRBASAL,diagonal_IFRBASAL,dist_lad_IFRBASAL,dist_lcx_IFRBASAL,dist_rca_IFRBASAL,lad_IFRBASAL,lcx_IFRBASAL,leftmain_IFRBASAL,lima_or_svg_IFRBASAL,lvp_IFRBASAL,marg_d_IFRBASAL,mid_lad_IFRBASAL,mid_rca_IFRBASAL,om1_IFRBASAL,om2_IFRBASAL,om3_IFRBASAL,pda_IFRBASAL,posterolateral_IFRBASAL,prox_rca_IFRBASAL,D2_CFRBASAL,D3_CFRBASAL,RVG1_CFRBASAL,RVG2_CFRBASAL,S1_CFRBASAL,bx_CFRBASAL,diagonal_CFRBASAL,dist_lad_CFRBASAL,dist_lcx_CFRBASAL,dist_rca_CFRBASAL,lad_CFRBASAL,lcx_CFRBASAL,leftmain_CFRBASAL,lima_or_svg_CFRBASAL,lvp_CFRBASAL,marg_d_CFRBASAL,mid_lad_CFRBASAL,mid_rca_CFRBASAL,om1_CFRBASAL,om2_CFRBASAL,om3_CFRBASAL,pda_CFRBASAL,posterolateral_CFRBASAL,prox_rca_CFRBASAL,D2_CFRHYPEREMIE,D3_CFRHYPEREMIE,RVG1_CFRHYPEREMIE,RVG2_CFRHYPEREMIE,S1_CFRHYPEREMIE,bx_CFRHYPEREMIE,diagonal_CFRHYPEREMIE,dist_lad_CFRHYPEREMIE,dist_lcx_CFRHYPEREMIE,dist_rca_CFRHYPEREMIE,lad_CFRHYPEREMIE,lcx_CFRHYPEREMIE,leftmain_CFRHYPEREMIE,lima_or_svg_CFRHYPEREMIE,lvp_CFRHYPEREMIE,marg_d_CFRHYPEREMIE,mid_lad_CFRHYPEREMIE,mid_rca_CFRHYPEREMIE,om1_CFRHYPEREMIE,om2_CFRHYPEREMIE,om3_CFRHYPEREMIE,pda_CFRHYPEREMIE,posterolateral_CFRHYPEREMIE,prox_rca_CFRHYPEREMIE,D2_FFRHYPEREMIE,D3_FFRHYPEREMIE,RVG1_FFRHYPEREMIE,RVG2_FFRHYPEREMIE,S1_FFRHYPEREMIE,bx_FFRHYPEREMIE,diagonal_FFRHYPEREMIE,dist_lad_FFRHYPEREMIE,dist_lcx_FFRHYPEREMIE,dist_rca_FFRHYPEREMIE,lad_FFRHYPEREMIE,lcx_FFRHYPEREMIE,leftmain_FFRHYPEREMIE,lima_or_svg_FFRHYPEREMIE,lvp_FFRHYPEREMIE,marg_d_FFRHYPEREMIE,mid_lad_FFRHYPEREMIE,mid_rca_FFRHYPEREMIE,om1_FFRHYPEREMIE,om2_FFRHYPEREMIE,om3_FFRHYPEREMIE,pda_FFRHYPEREMIE,posterolateral_FFRHYPEREMIE,prox_rca_FFRHYPEREMIE,D2_FFRBASAL,D3_FFRBASAL,RVG1_FFRBASAL,RVG2_FFRBASAL,S1_FFRBASAL,bx_FFRBASAL,diagonal_FFRBASAL,dist_lad_FFRBASAL,dist_lcx_FFRBASAL,dist_rca_FFRBASAL,lad_FFRBASAL,lcx_FFRBASAL,leftmain_FFRBASAL,lima_or_svg_FFRBASAL,lvp_FFRBASAL,marg_d_FFRBASAL,mid_lad_FFRBASAL,mid_rca_FFRBASAL,om1_FFRBASAL,om2_FFRBASAL,om3_FFRBASAL,pda_FFRBASAL,posterolateral_FFRBASAL,prox_rca_FFRBASAL,acute_coronary_occlusion,pci_regions,PONTAGES,acute_inferior_mi,TITREEXAMEN,Indications,ACS,insuffisance_cardiaque,cardiogenic_shock,CABG,SpecificCharacterSet,SOPClassUID,AcquisitionDate,ContentDate,ConversionType,InstitutionName,StudyDescription,NameOfPhysiciansReadingStudy,PatientName,SeriesNumber,InstanceNumber,SamplesPerPixel,BitsAllocated,BitsStored,HighBit,PixelRepresentation,OriginalAttributesSequence,ModifiedAttributesSequence,AttributeModificationDateTime,ModifyingSystem,ReasonForTheAttributeModification,SeriesDate,Manufacturer,PlanarConfiguration,RequestedProcedureID,ImageType,AcquisitionTime,ContentTime,InstitutionAddress,StationName,ProcedureCodeSequence,StartTrim,StopTrim,RecommendedDisplayFrameRate,PatientAge,PatientSize,PatientWeight,KVP,DeviceSerialNumber,SoftwareVersions,ProtocolName,ExposureTime,XRayTubeCurrent,AveragePulseWidth,RadiationMode,ImageAndFluoroscopyAreaDoseProduct,IntensifierSize,FocalSpots,PositionerMotion,PatientPosition,DetectorDescription,DetectorID,AcquisitionNumber,PatientOrientation,ImagesInAcquisition,FrameIncrementPointer,PixelIntensityRelationship,WindowCenter,WindowWidth,VOILUTFunction,LossyImageCompression,RepresentativeFrameNumber,StudyIDIssuer,PerformedProcedureStepStartDate,PerformedProcedureStepStartTime,PerformedProcedureStepID,PerformedProcedureStepDescription,RequestAttributesSequence,PerformingPhysicianName,ReferencedSOPClassUID,ReferencedSOPInstanceUID,InstanceCreationDate,InstanceCreationTime,InstitutionalDepartmentName,ReferencedPerformedProcedureStepSequence,IrradiationEventUID,TableAngle,RequestedProcedureDescription,ScheduledProcedureStepDescription,ScheduledProcedureStepID,AcquisitionDateTime,FrameDelay,LossyImageCompressionMethod,ShutterShape,ShutterLeftVerticalEdge,ShutterRightVerticalEdge,ShutterUpperHorizontalEdge,ShutterLowerHorizontalEdge,DistanceSourceToEntrance,TableHorizontalRotationAngle,TableHeadTiltAngle,TableCradleTiltAngle,TableTopVerticalPosition,TableTopLongitudinalPosition,TableTopLateralPosition,OperatorsName,ReferencedStudySequence,ReferencedPatientSequence,XRayTubeCurrentInuA,AcquiredImageAreaDoseProduct,RequestedProcedureCodeSequence,ScheduledProtocolCodeSequence,ReferencedSeriesSequence,ShutterPresentationValue,SoftcopyVOILUTSequence,DisplayedAreaTopLeftHandCorner,DisplayedAreaBottomRightHandCorner,PresentationSizeMode,PresentationPixelAspectRatio,GraphicLayerSequence,GraphicLayer,GraphicLayerOrder,ContentLabel,PresentationCreationDate,PresentationCreationTime,PresentationLUTShape,PatientBirthTime,CollimatorShape,CollimatorLeftVerticalEdge,CollimatorRightVerticalEdge,CollimatorUpperHorizontalEdge,CollimatorLowerHorizontalEdge,IDPATIENT,TYPEEXAMEN,CATEGORIE,SOUSCATEGORIE,HEUREDEBUT,HEUREFIN,SALLE,NSEJOUR,FACTIVE,TYPEPROVENANCE,POIDS,TAILLE,HOSPITAL,Urgency,CREATININE,CK,HEMOGLOBINE,PLAQUETTES,TNI,TROPHS,ANGINA,ISCHEMIA,LOCALISATIONISCHEMIE,LV,DIABETES_HISTORY,HYPERTENSION,SMOKING,CAD_HISTORY,OTHERSURGERY,STROKE,VASCULAR,FIBRILLATION,TV,PM,LV.1,LVEF,RENAL,VASCULARACCESS,VASCULARACCESS2,FERMETUREART,FERMETUREART2,VEINACCESS,VEINACCESS2,FERMETUREVEIN,FERMETUREVEIN2,METHERGIN,LAO,RAO,STENTS,BMS,BALLONS,TRONC0,TRONC50,IVA50,CX50,CD50,TC50,LESIONRESTENOSE,STENTSACTIFRESTENOSE,STENTSRESTENOSE,PONTAGESMAMMAIRES50,PONTAGESSAPHENES50,DILATATIONS,DILATATIONSPONTAGESSUCCES,TCDILATATIONS,PONTAGESSAPHENES,LESIONSSTENTSACTIF,DILATATIONSPONTAGESECHEC,DILATATIONSSUCCES,DILATATIONSECHEC,LESIONSSTENTS,STENTSACTIFS,PONTAGESMAMMAIRES,PONTAGES50,BVS,CDBMS,CDBVS,CDDEB,CDDILATATIONS,CDSTENTS,CDSTENTSACTIF,CIRCRESTENOSE,CXBMS,CXBVS,CXDEB,CXDILATATIONS,CXSTENTS,CXSTENTSACTIF,DEB,DILATATIONSPONTAGES,DSTENTMAX,DSTENTMIN,DSTENTMOY,GRAFTRESTENOSE,GUIDEPRESSION,IVABMS,IVABVS,IVADEB,IVADILATATIONS,IVASTENTS,IVASTENTSACTIF,IVUS,LADRESTENOSE,LESIONS50,LESIONSDILATATIONS,LMRESTENOSE,LSTENT,LSTENTMAX,LSTENTMIN,PONTAGESBMS,PONTAGESBVS,PONTAGESDEB,PONTAGESDILATATIONS,PONTAGESPERMEABLES,PONTAGESSTENTS,PONTAGESSTENTSACTIFS,RCARESTENOSE,SYNTAX,TCBMS,TCBVS,TCDEB,TCSTENTS,TCSTENTSACTIF,TRONCRESTENOSE,Report,bypass_graft
0,[1.4491844e-05 2.4803728e-03 1.6874110e-06 1.3522274e-04 1.2590732e-05\n 5.2861782e-04 1.6547735e-06 5.2691407e-06 9.0157828e-06 9.9675262e-01\n 5.8418289e-05],9,,,10.0,75.0,20220101.0,24927.0,33910.0,19580211.0,MONOCHROME2,2.16.124.113611.1.118.1.1.5994023,'1.3.12.2.1107.5.4.5.135214.30000022010107492025500000000',/media/data1/ravram/DeepCORO/2022/68509394433809345732708154541135422799/2.16.124.113611.1.118.1.1.5994023/1.3.12.2.1107.5.4.5.135214.30000022010107492025500000000.dcm,/media/data1/ravram/MHI_CATH_DICOM_VIDEOS/2022/2.16.124.113611.1.118.1.1.5994023_1.3.12.2.1107.5.4.5.135214.30000022010107492025500000000.dcm.avi,False,30.0,-0.1,512.0,512.0,,1078.0,732.997649,1.470673,,GR,"[0.278875, 0.278875]",inference,601040.0,102859.0,HE202200000101,2022-01-01 03:35:00.0,XA,601040,STUDY,20220101,2022.0,1.3.12.2.1107.5.4.5.135214.30000022010107492025500000000,1. Aspirine à vie.,"Fraction d'éjection du ventricule gauche normale. Artères coronaires atheromateuses. Lésion peu sévère de l'artère interventriculaire antérieure proximale, englobant l'origine de la première diagonale (lésion de bifurcation). Lésion peu sévère de l'ostium de l'artère interventriculaire antérieure proximale. Lésion peu sévère de l'artère circonflexe proximale. Lésion peu sévère de l'ostium de la première marginale. Lésion peu sévère de l'artère coronaire droite proximale, englobant l'origine de l'artère interventriculaire postérieure (lésion de bifurcation).",100.0,CORONAROGRAPHIE,False,5.0,[0.0003176],right_dominant,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,20.0,0.0,-1.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,20.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,,,,,,,,,,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,the Left Main Coronary Artery (LMCA) has no significant stenosis.\nthe proximal LAD has mild stenosis (~20.0%).\nthe mid LAD has no significant stenosis.\nthe distal LAD has no significant stenosis.\nD2 branch has no significant stenosis.\nthe proximal LCX has mild stenosis (~20.0%).\nthe distal LCX has no significant stenosis.\nOM1 has mild stenosis (~20.0%).\nOM2 has no significant stenosis.\nthe proximal RCA has mild stenosis (~20.0%).\nthe mid RCA has no significant stenosis.\nthe distal RCA has no significant stenosis.\nthe PDA has no significant stenosis.\nthe posterolateral branch has no significant stenosis.\nRamus has no significant stenosis.\nThe coronary circulation is right_dominant.,0


In [6]:
output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
df_non_nan_reports = pd.read_csv(output_file_path, sep="α")

  df_non_nan_reports = pd.read_csv(output_file_path, sep="α")


In [27]:
# First, sort the dataframe by 'SeriesTime' and group by 'StudyInstanceUID'
df_sorted = df_non_nan_reports.assign(SeriesTime=pd.to_numeric(df_non_nan_reports.SeriesTime, errors='coerce')).sort_values(by='SeriesTime')

# Group by 'StudyInstanceUID' and keep the top 5 earliest 'SeriesTime'
df_top5 = df_sorted.groupby('StudyInstanceUID').head(5)

# Define a function to pick 3 rows with object_value == 5 and 2 rows with object_value == 9
def pick_values(group):
    # Filter rows where object_value == 5 and keep 3
    group_5 = group[group['object_value'] == 5].head(5)
    # Filter rows where object_value == 9 and keep 2
    group_9 = group[group['object_value'] == 9].head(3)
    # Concatenate the results
    return pd.concat([group_5, group_9])

# Apply the function to each group
df_final = df_top5.groupby('StudyInstanceUID').apply(pick_values).reset_index(drop=True)

  df_final = df_top5.groupby('StudyInstanceUID').apply(pick_values).reset_index(drop=True)


In [31]:
display(df_final.object_value.value_counts())

object_value
5    118585
9     48680
Name: count, dtype: int64

In [32]:
# Split 70% of patients for train and 30% for validation
unique_patients = df_final["CathReport_MRN"].drop_duplicates()
train_size = int(0.9 * len(unique_patients))
train_patients = unique_patients.sample(n=train_size, random_state=42)
val_patients = unique_patients.drop(train_patients.index)

# Keep only the sampled patients in the dataframe
df_sampled = df_final[
    df_final["CathReport_MRN"].isin(train_patients)
    | df_final["CathReport_MRN"].isin(val_patients)
]

# Sample 300 unique StudyInstanceUID from the already split dataset
#unique_study_ids = df_sampled["StudyInstanceUID"].drop_duplicates().sample(n=300, random_state=42)

# Keep only the sampled StudyInstanceUIDs in the dataframe
#df_sampled = df_sampled[df_sampled["StudyInstanceUID"].isin(unique_study_ids)]

# Assign split based on CathReport_MRN
df_sampled.loc[df_sampled["CathReport_MRN"].isin(train_patients), "Split"] = "train"
df_sampled.loc[df_sampled["CathReport_MRN"].isin(val_patients), "Split"] = "val"

# Save the dataframe with the sampled StudyInstanceUIDs to a new CSV file
output_sampled_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250108.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

display(df_sampled.Split.value_counts())

Split
train    150291
val       16974
Name: count, dtype: int64

In [35]:
# Display number of unique StudyInstanceUID by split and year
print("Number of unique StudyInstanceUID by split:")
print(df_sampled.groupby('Split')['StudyInstanceUID'].nunique())

# Extract year from date column and show breakdown
df_sampled['Year'] = pd.to_datetime(df_sampled['date'], format='%Y%m%d').dt.year
print("\nBreakdown by year:")
print(df_sampled.groupby('Year')['StudyInstanceUID'].nunique())

Number of unique StudyInstanceUID by split:
Split
train    31463
val       3568
Name: StudyInstanceUID, dtype: int64

Breakdown by year:
Year
2017    4629
2018    4737
2019    4175
2020    4173
2021    4384
2022    4572
2023    4224
2024    4137
Name: StudyInstanceUID, dtype: int64


In [30]:
output_sampled_file_path = pd.read_csv(
    "data/reports/reports_sampled_no_conclusion.csv", sep="α"
)
df_sampled = output_sampled_file_path.sample(96).reset_index()
# Save the dataframe with split information to a new CSV file
output_sampled_file_path = "data/reports/reports_sampled_no_conclusion_96.csv"
df_sampled.to_csv(output_sampled_file_path, sep="α", index=False)

  output_sampled_file_path = pd.read_csv(


FileNotFoundError: [Errno 2] No such file or directory: 'data/reports/reports_sampled_no_conclusion.csv'

## Example tokenization

In [None]:
df_sampled = pd.read_csv('data/reports/reports_with_alpha_separator_with_conclusion_and_more_details_20250108.csv', sep='α')

In [3]:
from transformers import AutoTokenizer, AutoModel

# Load PubMedBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Get a sample text from the dataframe
sample_text = df_sampled['Report'].iloc[0]

# Encode the text
encoded = tokenizer(
    sample_text,
    padding="max_length",
    max_length=512, 
    truncation=True,
    return_tensors="pt"
)

# Decode back to text to verify
decoded = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)


In [None]:
# Print full texts without truncation
pd.set_option('display.max_colwidth', None)
print("\nOriginal text (full):")
print(sample_text)
print("\nDecoded text (full):")
print(decoded)
print("\nEncoded tokens:")


