In [73]:
import pandas as pd
from itertools import combinations

df = pd.read_csv("/vol/ideadata/ed52egek/pycharm/trichotomy/datasets/eight_cxr8.csv")
for split in ["TRAIN", "VAL", "TEST"]: 
    df_split = df[df["Split"] == split]
    
    def extract_subject_id(filename):
        return int(filename.split('/')[1].split('_')[0])

    # get a list of positive pairs
    pairs = []
    for _, group in df_split.groupby('subject_id'):
        file_names = group['FileName'].tolist()
        pairs.extend(list(combinations(file_names, 2)))

    # Convert to DataFrame or List
    positive_pairs_df = pd.DataFrame(pairs, columns=['FileName1', 'FileName2'])

    # get a list of negative pairs
    negative_pairs_x1 = positive_pairs_df["FileName1"]
    negative_pairs_x2 = []
    for i in range(len(negative_pairs_x1)): 
        x1_id = extract_subject_id(negative_pairs_x1[i])
        
        while True: 
            negative = df_split.sample(1) 
            x2_id = negative["subject_id"].item()
            if x2_id != x1_id: 
                negative_pairs_x2.append(negative["FileName"].item())
                break

    negative_pairs_df = pd.DataFrame({"FileName1": negative_pairs_x1, "FileName2": negative_pairs_x2})
    negative_pairs_df 

    # Quick logic to double check that we do not have leakage 
    # Function to extract subject_id from a FileName


    # Extract subject_id for both FileName1 and FileName2
    negative_pairs_df['SubjectID1'] = negative_pairs_df['FileName1'].apply(extract_subject_id)
    negative_pairs_df['SubjectID2'] = negative_pairs_df['FileName2'].apply(extract_subject_id)

    # Check for rows where SubjectID1 == SubjectID2
    violations = negative_pairs_df[negative_pairs_df['SubjectID1'] == negative_pairs_df['SubjectID2']]

    # Test result
    if violations.empty:
        print("Test passed: No rows with the same subject_id in FileName1 and FileName2.")
    else:
        print("Test failed: The following rows have the same subject_id:")
        print(violations)

    negative_pairs_df = negative_pairs_df.drop(columns={"SubjectID1", "SubjectID2"})

    positive_pairs_df["Label"] = 1.0 
    negative_pairs_df["Label"] = 0.0

    df_out = pd.concat([positive_pairs_df, negative_pairs_df]).reset_index(drop=True)

    with open(split + "_pairs.txt", "w") as fp: 
        for x in df_out.iterrows(): 
            fp.write(f"{x[1]['FileName1']}     {x[1]['FileName2']}     {x[1]['Label']}\n")

   Unnamed: 0                 FileName  Patient Age Patient Gender  \
0           0  images/00000001_000.png           57              M   

   subject_id  Split  No Finding  Atelectasis  Cardiomegaly  Consolidation  \
0           1  TRAIN           0            0             1              0   

   Edema  Pleural Effusion  Pneumonia  Pneumothorax  
0      0                 0          0             0  
Test passed: No rows with the same subject_id in FileName1 and FileName2.
       Unnamed: 0                 FileName  Patient Age Patient Gender  \
67309       67309  images/00000040_000.png           66              M   

       subject_id Split  No Finding  Atelectasis  Cardiomegaly  Consolidation  \
67309          40   VAL           1            0             0              0   

       Edema  Pleural Effusion  Pneumonia  Pneumothorax  
67309      0                 0          0             0  
Test passed: No rows with the same subject_id in FileName1 and FileName2.
       Unnamed: 0 

# Script to prepare txt file that Privacy Model uses as input logic

In [2]:
import numpy as np
np.loadtxt('./TRAIN_pairs.txt', dtype=str)

array([['images/00000001_000.png', 'images/00000001_001.png', '1.0'],
       ['images/00000005_000.png', 'images/00000005_001.png', '1.0'],
       ['images/00000005_000.png', 'images/00000005_002.png', '1.0'],
       ...,
       ['images/00030772_001.png', 'images/00029088_016.png', '0.0'],
       ['images/00030791_000.png', 'images/00008290_002.png', '0.0'],
       ['images/00030801_000.png', 'images/00012935_013.png', '0.0']],
      shape=(754938, 3), dtype='<U23')

# Mimic - different filename to subject id but everything else is the same 

In [7]:
import pandas as pd
from itertools import combinations

df = pd.read_csv("/vol/ideadata/ed52egek/pycharm/trichotomy/datasets/eight_mimic.csv")
for split in ["TRAIN", "VAL", "TEST"]: 
    df_split = df[df["Split"] == split]
    
    def extract_subject_id(filename):
        return float(filename.split('/')[2][1:])

    # get a list of positive pairs
    pairs = []
    for _, group in df_split.groupby('subject_id'):
        file_names = group['FileName'].tolist()
        pairs.extend(list(combinations(file_names, 2)))

    # Convert to DataFrame or List
    positive_pairs_df = pd.DataFrame(pairs, columns=['FileName1', 'FileName2'])

    # get a list of negative pairs
    negative_pairs_x1 = positive_pairs_df["FileName1"]
    negative_pairs_x2 = []
    for i in range(len(negative_pairs_x1)): 
        x1_id = extract_subject_id(negative_pairs_x1[i]) 
        while True: 
            negative = df_split.sample(1) 
            x2_id = negative["subject_id"].item()
            if x2_id != x1_id: 
                negative_pairs_x2.append(negative["FileName"].item())
                break

    negative_pairs_df = pd.DataFrame({"FileName1": negative_pairs_x1, "FileName2": negative_pairs_x2})
    negative_pairs_df 

    # Quick logic to double check that we do not have leakage 
    # Function to extract subject_id from a FileName


    # Extract subject_id for both FileName1 and FileName2
    negative_pairs_df['SubjectID1'] = negative_pairs_df['FileName1'].apply(extract_subject_id)
    negative_pairs_df['SubjectID2'] = negative_pairs_df['FileName2'].apply(extract_subject_id)

    # Check for rows where SubjectID1 == SubjectID2
    violations = negative_pairs_df[negative_pairs_df['SubjectID1'] == negative_pairs_df['SubjectID2']]

    # Test result
    if violations.empty:
        print("Test passed: No rows with the same subject_id in FileName1 and FileName2.")
    else:
        print("Test failed: The following rows have the same subject_id:")
        print(violations)

    negative_pairs_df = negative_pairs_df.drop(columns={"SubjectID1", "SubjectID2"})

    positive_pairs_df["Label"] = 1.0 
    negative_pairs_df["Label"] = 0.0

    df_out = pd.concat([positive_pairs_df, negative_pairs_df]).reset_index(drop=True)

    with open(split + "_pairs_mimic.txt", "w") as fp: 
        for x in df_out.iterrows(): 
            fp.write(f"{x[1]['FileName1']}     {x[1]['FileName2']}     {x[1]['Label']}\n")
df

Test passed: No rows with the same subject_id in FileName1 and FileName2.
Test passed: No rows with the same subject_id in FileName1 and FileName2.
Test passed: No rows with the same subject_id in FileName1 and FileName2.


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,dicom_id,subject_id,study_id,PerformedProcedureStepDescription,ViewPosition,Rows,Columns,StudyDate,...,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,impression,Finding Label,Finding Labels
0,0,0,1a1fe7e3-cbac5d93-b339aeda-86bb86b5-4f31e82e,19999987.0,58971208.0,CHEST (PORTABLE AP),AP,3056.0,2544.0,21451103.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Slight interval worsening of right lower lung ...,0.0,Atelectasis
1,1,1,7ba273af-3d290f8d-e28d0ab4-484b7a86-7fc12b08,19999987.0,58621812.0,CHEST (PORTABLE AP),AP,3056.0,2544.0,21451102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Appropriately positioned ET and NG tubes. Bib...,0.0,Atelectasis|Support Devices
2,2,2,3fcd0406-9b111603-feae7033-96632b3a-111333e5,19999733.0,57132437.0,CHEST (PA AND LAT),PA,3056.0,2544.0,21520708.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,No acute cardiothoracic process.,0.0,No Finding
3,3,3,428e2c18-5721d8f3-35a05001-36f3d080-9053b83c,19999733.0,57132437.0,CHEST (PA AND LAT),PA,3056.0,2544.0,21520708.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,No acute cardiothoracic process.,0.0,No Finding
4,4,4,53e9b6d0-5d5317f5-f1a4c031-01d40558-fd14a425,19999376.0,57540554.0,CHEST (PORTABLE AP),AP,3056.0,2544.0,21450731.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,No acute cardiopulmonary process. No evidence...,0.0,No Finding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119529,119529,119529,926ca783-1abb560b-2e4efb96-6f0b25d8-4c8ee5ca,10760670.0,59575239.0,0,PA,1720.0,1486.0,21900329.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1) Markedly improved right-sided pneumonia wit...,0.0,Lung Lesion|Lung Opacity|Pneumonia
119530,119530,119530,068144bb-0b4fa968-441e9426-44d8609a-2465f49e,10760670.0,56785501.0,CHEST (PORTABLE AP),AP,3056.0,2544.0,21921027.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiomediastinal silhouette is unchanged. Rig...,0.0,Consolidation|Lung Opacity
119531,119531,119531,6c3436b6-65eeb5bc-143c7787-8a551fb9-62dd5ce4,10760670.0,54827584.0,CHEST (PA AND LAT),AP,3056.0,2544.0,21900225.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,Pulmonary edema with small bilateral pleural e...,0.0,Consolidation|Edema|Lung Opacity|Pleural Effus...
119532,119532,119532,aa1de98c-5b2943ac-7b348d9e-23424c11-71dd667b,10760670.0,53468449.0,CHEST (PORTABLE AP),AP,3056.0,2544.0,21900226.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,Resolving pulmonary edema. Increasing density...,0.0,Edema|Lung Opacity|Pneumonia


# Chexpert - different filename to subject id but everything else is the same 

In [23]:
import pandas as pd
from itertools import combinations

df = pd.read_csv("/vol/ideadata/ed52egek/pycharm/trichotomy/datasets/eight_chexpert.csv")
for split in ["TRAIN", "VAL", "TEST"]: 
    df_split = df[df["Split"] == split]
    
    def extract_subject_id(filename):
        return float(filename.split('/')[2][7:])

    # get a list of positive pairs
    pairs = []
    for _, group in df_split.groupby('subject_id'):
        file_names = group['FileName'].tolist()
        pairs.extend(list(combinations(file_names, 2)))

    # Convert to DataFrame or List
    positive_pairs_df = pd.DataFrame(pairs, columns=['FileName1', 'FileName2'])

    # get a list of negative pairs
    negative_pairs_x1 = positive_pairs_df["FileName1"]
    negative_pairs_x2 = []
    for i in range(len(negative_pairs_x1)): 
        x1_id = extract_subject_id(negative_pairs_x1[i]) 
        while True: 
            negative = df_split.sample(1) 
            x2_id = float(negative["subject_id"].item()[7:])
            if x2_id != x1_id: 
                negative_pairs_x2.append(negative["FileName"].item())
                break

    negative_pairs_df = pd.DataFrame({"FileName1": negative_pairs_x1, "FileName2": negative_pairs_x2})
    negative_pairs_df 

    # Quick logic to double check that we do not have leakage 
    # Function to extract subject_id from a FileName


    # Extract subject_id for both FileName1 and FileName2
    negative_pairs_df['SubjectID1'] = negative_pairs_df['FileName1'].apply(extract_subject_id)
    negative_pairs_df['SubjectID2'] = negative_pairs_df['FileName2'].apply(extract_subject_id)

    # Check for rows where SubjectID1 == SubjectID2
    violations = negative_pairs_df[negative_pairs_df['SubjectID1'] == negative_pairs_df['SubjectID2']]

    # Test result
    if violations.empty:
        print("Test passed: No rows with the same subject_id in FileName1 and FileName2.")
    else:
        print("Test failed: The following rows have the same subject_id:")
        print(violations)

    negative_pairs_df = negative_pairs_df.drop(columns={"SubjectID1", "SubjectID2"})

    positive_pairs_df["Label"] = 1.0 
    negative_pairs_df["Label"] = 0.0

    df_out = pd.concat([positive_pairs_df, negative_pairs_df]).reset_index(drop=True)

    with open(split + "_pairs_chexpert.txt", "w") as fp: 
        for x in df_out.iterrows(): 
            fp.write(f"{x[1]['FileName1']}     {x[1]['FileName2']}     {x[1]['Label']}\n")
df

Test passed: No rows with the same subject_id in FileName1 and FileName2.
Test passed: No rows with the same subject_id in FileName1 and FileName2.
Test passed: No rows with the same subject_id in FileName1 and FileName2.


Unnamed: 0.1,Unnamed: 0,FileName,Sex,Age,Frontal/Lateral,AP/PA,No Finding,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,...,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,subject_id,Split
0,0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient00001,TRAIN
1,1,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient00003,TRAIN
2,2,CheXpert-v1.0/train/patient00004/study1/view1_...,Female,20,Frontal,PA,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient00004,TRAIN
3,3,CheXpert-v1.0/train/patient00005/study1/view1_...,Male,33,Frontal,PA,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient00005,TRAIN
4,4,CheXpert-v1.0/train/patient00005/study2/view1_...,Male,33,Frontal,AP,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,patient00005,TRAIN
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95798,95798,CheXpert-v1.0/train/patient64399/study1/view1_...,Female,62,Frontal,AP,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,patient64399,TEST
95799,95799,CheXpert-v1.0/train/patient64404/study1/view1_...,Female,83,Frontal,AP,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,patient64404,TEST
95800,95800,CheXpert-v1.0/train/patient64411/study1/view1_...,Female,65,Frontal,AP,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient64411,TEST
95801,95801,CheXpert-v1.0/train/patient64434/study1/view1_...,Female,29,Frontal,AP,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,patient64434,TEST


In [26]:
import os

# Specify the directory containing the text files
directory = "./image_pairs"  # Replace with your directory if not in the current one

# Define the splits to process
splits = ["TRAIN", "VAL", "TEST"]

# Iterate through each split
for split in splits:
    combined_lines = []
    output_filename = f"{split}_pairs_all__.txt"
    line_distribution = {}
    
    # Collect all files matching the split
    split_files = [f for f in os.listdir(directory) if f.startswith(f"{split}_pairs_") and f.endswith(".txt")]
    
    # Read and combine lines from each file
    for file in split_files:
        with open(os.path.join(directory, file), "r") as f:
            lines = f.readlines()
            combined_lines.extend(lines)
            line_distribution[file] = len(lines)
    
    # Write the combined lines to the new file
    with open(os.path.join(directory, output_filename), "w") as output_file:
        output_file.writelines(combined_lines)
    
    # Print the distribution and summary
    print(f"\nCreated {output_filename} with {len(combined_lines)} lines.")
    print("Line contribution from each file:")
    for file, count in line_distribution.items():
        print(f"  {file}: {count} lines")


Created TRAIN_pairs_all__.txt with 2933392 lines.
Line contribution from each file:
  TRAIN_pairs_all.txt: 1466696 lines
  TRAIN_pairs_chexpert.txt: 262662 lines
  TRAIN_pairs_cxr8.txt: 754938 lines
  TRAIN_pairs_mimic.txt: 449096 lines

Created VAL_pairs_all__.txt with 362044 lines.
Line contribution from each file:
  VAL_pairs_all.txt: 181022 lines
  VAL_pairs_chexpert.txt: 31940 lines
  VAL_pairs_crx8.txt: 86292 lines
  VAL_pairs_mimic.txt: 62790 lines

Created TEST_pairs_all__.txt with 800656 lines.
Line contribution from each file:
  TEST_pairs_all.txt: 400328 lines
  TEST_pairs_chexpert.txt: 125112 lines
  TEST_pairs_cxr8.txt: 101964 lines
  TEST_pairs_mimic.txt: 173252 lines
