In [1]:
import numpy as np
import pandas as pd

# Show DataFrames without truncation: every row, every column, full cell text,
# and no fixed console width.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.width", None)
pd.set_option("display.max_colwidth", None)
# Import from the public IPython.display module; importing these names from
# IPython.core.display is deprecated and emits a warning.
from IPython.display import HTML, display

# Let the notebook container use the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))


def df_stats(df):
    """Print a per-column summary of ``df`` and return its first rows.

    The summary is printed once and lists, for every column, its name, the
    number of null values, the number of distinct values and its dtype.

    Args:
        df (pd.DataFrame): Frame to summarise.

    Returns:
        pd.DataFrame: ``df.head()``.
    """
    print("\n***** Shape: ", df.shape, " *****\n")

    df_stat_val = pd.DataFrame(
        {
            "Name": df.columns.values.tolist(),
            "Null": df.isnull().sum().values.tolist(),
            "Unique": df.nunique().values.tolist(),
            "Dtypes": df.dtypes.tolist(),
        }
    )

    # tabulate is optional: fall back to pandas' own text rendering without it.
    try:
        from tabulate import tabulate
    except ImportError:
        print(df_stat_val.to_string())
    else:
        print(tabulate(df_stat_val, headers="keys", tablefmt="psql"))

    return df.head()

  from IPython.core.display import HTML, display


In [2]:
import sys
import os
from pathlib import Path
import pandas as pd
import yaml

# Add the dataset_creation directory to the path (if running from parent directory)
if 'dataset_creation' not in sys.path:
    sys.path.append('dataset_creation')

# Import the dataset generation functions
from generate_dataset import (
    create_default_config,
    load_data,
    apply_hard_filters,
    generate_reports,
    sample_by_status,
    process_dataset,
    MAIN_STRUCTURE_MAP,
    DOMINANCE_MAP
)


# Inputs and outputs of this run, named once so they are easy to find and change.
config_path = "dataset_creation/config_template.yaml"
input_parquet_path = '/media/data1/datasets/DeepCoro/2b_CathReport_HEMO_MHI_MERGED_2017-2024_VIDEO_LEVEL.parquet'
reports_output_dir = 'reports/'

# Load config from yaml file. The encoding is explicit so non-ASCII values in the
# config are read the same way regardless of the platform's default encoding.
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)

print("Loaded configuration:")
print(yaml.dump(config, default_flow_style=False))

# Run the dataset generation on the parquet file with the loaded config.
# NOTE(review): the second argument is presumed to be the output directory — confirm
# against generate_dataset.process_dataset.
process_dataset(input_parquet_path, reports_output_dir, config)

INFO:generate_dataset:Loading data from /media/data1/datasets/DeepCoro/2b_CathReport_HEMO_MHI_MERGED_2017-2024_VIDEO_LEVEL.parquet


Loaded configuration:
apply_mappings: true
assign_status: true
filters:
  contrast_agent_class: 1
  main_structures:
  - Left Coronary
  - Right Coronary
  status:
  - diagnostic
output_settings:
  include_index: false
  separator: "\u03B1"
report_settings:
  coronary_specific: true
sampling:
  enabled: true
  label_column: status
  n_per_group: 9
train_test_split:
  enabled: true
  patient_column: CathReport_MRN
  random_state: 42
  save_separate_files: false
  test_ratio: 0.2
  train_ratio: 0.7
  val_ratio: 0.1



INFO:generate_dataset:Loaded 843499 records with 788 columns
INFO:generate_dataset:Created bypass_graft column from Conclusion
INFO:generate_dataset:Sorted data by StudyInstanceUID and SeriesTime for temporal ordering
INFO:generate_dataset:Assigning procedure status based on PCI timing...
INFO:generate_dataset:Status distribution: {'PCI': 360840, 'diagnostic': 334320, 'POST_PCI': 112612, 'unknown': 35727}
INFO:generate_dataset:Applying hard filters...
INFO:generate_dataset:Dropped External_Exam: 843499 -> 843499 rows
INFO:generate_dataset:Dropped bypass_graft: 738052 rows remaining
INFO:generate_dataset:Filtered stenosis columns: 738052 rows remaining
INFO:generate_dataset:Status filter applied: ['diagnostic']
INFO:generate_dataset:Main structure filter applied: ['Left Coronary', 'Right Coronary']
INFO:generate_dataset:Contrast agent filter applied: 1
INFO:generate_dataset:Dataset filtered from 843499 to 167134 rows
INFO:generate_dataset:Generating medical reports...
100%|██████████| 1

In [5]:
import pandas as pd
import numpy as np

# 'α' is more than one byte in UTF-8, which the C parser cannot use as a delimiter.
# pandas would fall back to the Python engine with a ParserWarning, so request that
# engine explicitly.
df_merged = pd.read_csv('reports/dataset_with_splits_20250801.csv', sep='α', engine='python')

  df_merged = pd.read_csv('reports/dataset_with_splits_20250801.csv', sep='α')


In [6]:
# Number of rows in each value of the Split column.
display(df_merged.Split.value_counts())
# First ten values of the Report column.
display(df_merged.Report.head(10))

Split
train    117120
test      33157
val       16857
Name: count, dtype: int64

0                                                                     \nThis is a left coronary artery.\nthe Left Main Coronary Artery (LMCA) has no significant stenosis.\nthe proximal LAD has no significant stenosis.\nthe mid LAD has no significant stenosis.\nthe distal LAD has no significant stenosis.\nD1 branch has no significant stenosis.\nD2 branch has no significant stenosis.\nthe distal LCX has no significant stenosis.\nOM1 has no significant stenosis.\nOM2 has no significant stenosis.\nRamus has no significant stenosis.\nleft posterolateral branch has no significant stenosis.\nthe LEFT PDA has no significant stenosis.\nleft posterolateral branch has no significant stenosis.\nThe coronary circulation is left dominant.
1                                                                     \nThis is a left coronary artery.\nthe Left Main Coronary Artery (LMCA) has no significant stenosis.\nthe proximal LAD has no significant stenosis.\nthe mid LAD has no significant stenosis.\nthe 

In [None]:
# Load the video-level parquet into df_coro.
# NOTE(review): this path is under /mnt/data1, while the process_dataset call earlier in
# this notebook reads the same file name from /media/data1 — confirm which mount is correct.
df_coro = pd.read_parquet('/mnt/data1/datasets/DeepCoro/2b_CathReport_HEMO_MHI_MERGED_2017-2024_VIDEO_LEVEL.parquet')

In [None]:
# Number of rows per FileName value.
# NOTE(review): display.max_rows is set to None at the top of the notebook, so this prints
# one line per distinct FileName — consider .head() if the output is too long.
display(df_merged.FileName.value_counts())

In [None]:
# First row of the dataset: report text with its angles and main structure class.
# No mid_lad_cto filter is applied here, so the label states what is actually shown.
print("First report with angles and main structure (unfiltered):")
display(df_merged[['Report','primary_angle','secondary_angle', 'main_structure_class']].head(n=1))

# Show reports where mid_lad_collateral == RCA
print("\nReport with mid_lad_collateral == RCA:")
display(df_merged.loc[df_merged['mid_lad_collateral'] == 'RCA', ['CathReport_MRN','Report']].head(5))

# Show reports whose mid_lad_bifurcation differs from the "no bifurcation lesion" value.
# The filter is on mid_lad_bifurcation (not the left main), so the label names that column.
# Rows where the column is NaN also pass this != comparison.
print("\nReport with mid_lad_bifurcation present:")
display(df_merged.loc[df_merged['mid_lad_bifurcation'] != 'Pas de lésion de bifurcation', ['Report']].head(5))


In [None]:
## Tokenization of the longest report
from transformers import AutoTokenizer, AutoModel

# Load PubMedBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Locate the report with the most characters and pull out its text
longest_report_idx = df_merged['Report'].str.len().idxmax()
sample_text = df_merged.loc[longest_report_idx, 'Report']
print("Longest report text:", sample_text, sep="\n")
print(f"\nLength: {len(sample_text)} characters")

# Tokenize into a fixed window of 512 tokens: shorter texts are padded, longer ones truncated
encode_options = dict(padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded = tokenizer(sample_text, **encode_options)

# Turn the token ids back into text to see what survives tokenization
decoded = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
print("\nDecoded text (after tokenization):", decoded, sep="\n")


In [None]:
# Flag bypass-graft cases: 1 when Conclusion contains "pontage" (case-insensitive), else 0.
# Rows with a missing Conclusion get 0 (na=False). The column is added to df_merged in place.
df_merged['bypass_graft'] = df_merged['Conclusion'].str.contains('pontage', case=False, na=False).astype(int)

In [None]:
# Destination CSV for the filtered reports; later cells write to it with the "α" separator.
output_file_path = "data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250801_RCA_LCA_merged_with_left_dominance_dependent_vessels.csv"

In [None]:
import os

output_dir = os.path.dirname(output_file_path)

# Create the output directory if needed. exist_ok avoids the check-then-create race,
# and the guard skips the call when the path has no directory component.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print(f"Initial dataframe length: {len(df_merged)}")

# Drop rows where 'External_Exam' is True
df_merged = df_merged[df_merged["External_Exam"] != True]
print(f"Length after dropping External_Exam: {len(df_merged)}")

# Drop rows flagged as bypass graft
df_merged = df_merged[df_merged["bypass_graft"] != 1]
print(f"Length after dropping bypass_graft: {len(df_merged)}")

# Keep only rows that have a report
df_non_nan_reports = df_merged.dropna(subset=["Report"])
print(f"Length after dropping NaN reports: {len(df_non_nan_reports)}")


# List of stenosis columns to check
stenosis_columns = [
    "prox_rca_stenosis", "mid_rca_stenosis", "dist_rca_stenosis",
    "left_main_stenosis", "prox_lad_stenosis", "mid_lad_stenosis", "dist_lad_stenosis",
    "D1_stenosis", "D2_stenosis", "prox_lcx_stenosis", "dist_lcx_stenosis",
    "om1_stenosis", "om2_stenosis", "bx_stenosis", "lvp_stenosis",
    "pda_stenosis", "posterolateral_stenosis"
]

# Boolean frame: True where a stenosis cell is NaN or -1.0 (column selection done once)
stenosis_values = df_non_nan_reports[stenosis_columns]
is_na_or_minus_one = stenosis_values.isna() | (stenosis_values == -1.0)

# Keep a row unless ALL of its stenosis columns are NaN or -1.0
mask = ~is_na_or_minus_one.all(axis=1)

df_non_nan_reports = df_non_nan_reports[mask]
print(f"Final length after filtering stenosis columns: {len(df_non_nan_reports)}")

df_non_nan_reports.to_csv(output_file_path, sep="α", index=False, header=True)

In [None]:
def assign_patient_splits(df_final, output_path, train_ratio=0.7, val_ratio=0.15, random_state=42):
    """
    Assign every patient to exactly one of train/val/test and save the labelled rows.

    Patients are the distinct values of ``CathReport_MRN``. All rows of a patient get
    the same ``Split`` value, so no patient is shared between splits. The test split
    receives the patients left over once train and val have been drawn.

    Args:
        df_final (pd.DataFrame): Input dataframe; must contain ``CathReport_MRN``
        output_path (str): Path where the output CSV file (separator "α") is written
        train_ratio (float): Fraction of patients assigned to training (default 0.7)
        val_ratio (float): Fraction of patients assigned to validation (default 0.15)
        random_state (int): Random seed for reproducibility (default 42)

    Returns:
        pd.DataFrame: Copy of the input rows with a ``Split`` column holding
        "train", "val" or "test". ``df_final`` itself is not modified.

    Raises:
        ValueError: If a ratio is negative, or train and val together would need
            more patients than exist.
    """
    if train_ratio < 0 or val_ratio < 0:
        raise ValueError("train_ratio and val_ratio must be non-negative")

    # A fresh positional index keeps the label-based drops below correct even when
    # df_final's own index has repeated labels. Sampling is positional, so the
    # patients drawn for a given seed are unchanged.
    unique_patients = df_final["CathReport_MRN"].drop_duplicates().reset_index(drop=True)
    train_size = int(train_ratio * len(unique_patients))
    val_size = int(val_ratio * len(unique_patients))
    if train_size + val_size > len(unique_patients):
        raise ValueError(
            f"train_ratio ({train_ratio}) and val_ratio ({val_ratio}) together "
            f"require more than the {len(unique_patients)} available patients"
        )

    # Draw train first, then val from what is left; the remainder becomes test
    train_patients = unique_patients.sample(n=train_size, random_state=random_state)
    remaining_patients = unique_patients.drop(train_patients.index)
    val_patients = remaining_patients.sample(n=val_size, random_state=random_state)
    test_patients = remaining_patients.drop(val_patients.index)

    # Explicit copy: writing the Split column must not touch (or warn about) df_final.
    # The isin filter keeps the same rows as the original three-way filter did.
    df_sampled = df_final[df_final["CathReport_MRN"].isin(unique_patients)].copy()

    # Assign split based on CathReport_MRN
    for split_name, patients in (
        ("train", train_patients),
        ("val", val_patients),
        ("test", test_patients),
    ):
        df_sampled.loc[df_sampled["CathReport_MRN"].isin(patients), "Split"] = split_name

    # Save the dataframe
    df_sampled.to_csv(output_path, sep="α", index=False)

    display(df_sampled.Split.value_counts())
    return df_sampled

# Split the filtered reports by patient: 70% train, 10% val, and the remaining patients test.
# The result is saved to output_file_path, the same path the filtered reports were
# written to in the previous cell, so that file is overwritten.
df_sampled = assign_patient_splits(
    df_final=df_non_nan_reports,
    output_path=output_file_path,
    train_ratio=0.7,
    val_ratio=0.10,
    random_state=42
)

In [None]:
# Count unique StudyInstanceUIDs per split (distinct studies, not rows)
split_counts = df_sampled.groupby('Split')['StudyInstanceUID'].nunique()
display(split_counts)

In [None]:
import pandas as pd
# Reload the saved split file. "α" is more than one byte in UTF-8, which the C parser
# cannot use as a delimiter, so the Python engine is requested explicitly instead of
# relying on pandas' fallback (which emits a ParserWarning).
df_sampled = pd.read_csv(output_file_path, sep="α", engine="python")

# Number of rows per object_value
display(df_sampled.object_value.value_counts())

In [None]:
# Sample up to 30 unique StudyInstanceUIDs. A seeded generator makes the saved subset
# reproducible, and the size is capped so the cell also runs with fewer than 30 studies.
sampled_study_ids = df_sampled["StudyInstanceUID"].unique()
rng = np.random.default_rng(42)
sampled_study_ids = rng.choice(
    sampled_study_ids, size=min(30, len(sampled_study_ids)), replace=False
)

# Keep only rows matching the sampled StudyInstanceUIDs
df_sampled = df_sampled[df_sampled["StudyInstanceUID"].isin(sampled_study_ids)]
display(df_sampled.Split.value_counts())
display(df_sampled.StudyInstanceUID.value_counts())
# NOTE(review): the file name says "200" but the sample above is 30 studies — confirm the intended name.
df_sampled.to_csv('data/reports/report_sampled_200.csv', sep='α', index=False)


In [None]:
import sys
import os
import numpy as np
## CHANGE THIS
dir2 = os.path.abspath("/volume/DicomVideoProcessing/downloadAvi")
dir1 = os.path.dirname(dir2)
# Make the parent of the downloadAvi directory importable
if dir1 not in sys.path:
    sys.path.append(dir1)

from downloadAvi import plot_avi

# Draw three random studies among those marked right dominant
is_right_dominant = df_non_nan_reports['coronary_dominance_consensus'] == 'right_dominant'
study_ids = df_non_nan_reports.loc[is_right_dominant, 'StudyInstanceUID'].unique()

sampled_studies = np.random.choice(study_ids, size=3, replace=False)

# For each sampled study with at least 3 videos, plot 9 of them
# (sampled with replacement when the study has fewer than 9)
for study_id in sampled_studies:
    study_videos = df_non_nan_reports[df_non_nan_reports['StudyInstanceUID'] == study_id]
    if len(study_videos) < 3:
        continue
    needs_replacement = len(study_videos) < 9
    study_sample = study_videos.sample(n=9, replace=needs_replacement)
    print(study_sample.FileName.nunique())
    plot_avi.sample_and_plot_middle_frames(
        study_sample,
        9,
        label_column='Report',
        path_column='FileName',
    )
        
        

## Example tokenization

In [None]:
# "α" is more than one byte in UTF-8, which the C parser cannot use as a delimiter, so
# the Python engine is requested explicitly instead of relying on pandas' fallback warning.
# NOTE(review): this file name carries the date 20250601, while output_file_path defined
# earlier in the notebook uses 20250801 — confirm this is the intended file.
df_sampled = pd.read_csv('data/reports/reports_with_alpha_separator_with_Calcifc_Stenosis_IFR_20250601_RCA_LCA_merged_with_left_dominance_dependent_vessels.csv', sep='α', engine='python')
# Keep only the rows labelled left dominant
df_sampled = df_sampled.loc[df_sampled["dominance_name"] == "left_dominant"]

In [None]:
from transformers import AutoTokenizer, AutoModel

# Load PubMedBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

# Use the report at position 1 of the filtered frame as the example
sample_text = df_sampled['Report'].iloc[1]
print(sample_text)

# Tokenize into a fixed window of 512 tokens: shorter texts are padded, longer ones truncated
encode_options = dict(padding="max_length", max_length=512, truncation=True, return_tensors="pt")
encoded = tokenizer(sample_text, **encode_options)

# Turn the token ids back into text (special tokens removed) to verify the round trip
decoded = tokenizer.decode(encoded['input_ids'][0], skip_special_tokens=True)
