Alright. This is a notebook to help me understand the flow of data through the CLIP model.
My goal is to create a small example CSV file that looks like this.

image_path,caption
/path/to/image1.jpg,This is a caption for image 1.
/path/to/image2.jpg,This is a caption for image 2.
/path/to/image3.jpg,This is a caption for image 3.

What do I want? 
dicom_paths / patient folder / patient / radiographic study / sentences / sentences length 

## Read report meta file

In [1]:
import pandas as pd

# Load the pre-processed report metadata table.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a
# trusted source; the absolute path is specific to one machine.
df = pd.read_pickle('/home/lsiefermann/xClip/utils/report-meta-v1.pkl')

In [2]:
# Give the key columns the names used by the split/label tables that are merged
# further down (subject_id / study_id). The raw identifiers start with a single
# prefix character, which is cut off before the remainder is cast to int.
df = (
    df
    .rename(columns={'patient': 'subject_id',
                     'radiographic_study': 'study_id'})
    .assign(
        subject_id=lambda frame: frame['subject_id'].str[1:].astype(int),
        study_id=lambda frame: frame['study_id'].str[1:].astype(int),
    )
)

In [3]:
# Inspect the report table after the key columns were renamed and cast to int.
df

Unnamed: 0,patient_folder,subject_id,study_id,FINDINGS,IMPRESSION,dicom_paths,REPORT,sentences,sentence_length,LETTER_COUNT,WORD_COUNT
126733,p10,10000032,50414267,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.,[files/p10/p10000032/s50414267/02aa804e-bde0af...,"There is no focal consolidation, pleural effus...","[There is no focal consolidation, pleural effu...","[9, 9, 5, 10, 6, 12, 4]",402,55
126731,p10,10000032,53189527,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.,[files/p10/p10000032/s53189527/2a2277a9-b0ded1...,"The cardiac, mediastinal and hilar contours ar...","[The cardiac, mediastinal and hilar contours a...","[8, 4, 3, 7, 10, 8, 4]",318,44
126730,p10,10000032,53911762,Single frontal view of the chest provided. ...,No acute intrathoracic process.,[files/p10/p10000032/s53911762/68b5c4b1-227d04...,Single frontal view of the chest provided. ...,"[Single frontal view of the chest provided., T...","[7, 8, 5, 15, 9, 4]",336,48
126732,p10,10000032,56699142,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.,[files/p10/p10000032/s56699142/ea030e7a-2e3b13...,"The lungs are clear of focal consolidation, pl...","[The lungs are clear of focal consolidation, p...","[11, 5, 5, 15, 4]",271,40
118061,p10,10000764,57375967,PA and lateral views of the chest provided. ...,"Focal consolidation at the left lung base, pos...",[files/p10/p10000764/s57375967/096052b7-d256dc...,PA and lateral views of the chest provided. ...,"[PA and lateral views of the chest provided., ...","[8, 5, 15, 5, 6, 9, 7, 12, 3]",499,70
...,...,...,...,...,...,...,...,...,...,...,...
173328,p19,19999442,58708861,ET tube ends 4.7 cm above the carina. NG tube...,,[files/p19/p19999442/s58708861/16b6c70f-6d36bd...,ET tube ends 4.7 cm above the carina. NG tube...,"[ET tube ends 4.7 cm above the carina., NG tub...","[8, 6, 5, 6, 7, 6]",237,38
162119,p19,19999733,57132437,"The lungs are clear, and the cardiomediastinal...",No acute cardiothoracic process.,[files/p19/p19999733/s57132437/3fcd0406-9b1116...,"The lungs are clear, and the cardiomediastinal...","[The lungs are clear, and the cardiomediastina...","[12, 8, 4]",163,24
174503,p19,19999987,55368167,There has been interval extubation and improve...,,[files/p19/p19999987/s55368167/58766883-376a15...,There has been interval extubation and improve...,[There has been interval extubation and improv...,"[14, 26, 34, 6, 5]",554,85
174501,p19,19999987,58621812,Portable supine AP view of the chest provided ...,Appropriately positioned ET and NG tubes. Bib...,[files/p19/p19999987/s58621812/7ba273af-3d290f...,Portable supine AP view of the chest provided ...,[Portable supine AP view of the chest provided...,"[21, 9, 4, 7, 5, 6, 2]",388,54


In [4]:
import pandas as pd

# Split assignment table of MIMIC-CXR-JPG; the columns used in this notebook are
# subject_id, study_id, dicom_id and split.
cxr_jpg_split = pd.read_csv(
    '/scratch1/MIMIC/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-split.csv.gz', compression='gzip')

# Sanity check: every study is expected to belong to exactly one split.
# The number of distinct splits per (subject_id, study_id) is computed in a single
# vectorised pass; only studies with more than one split are reported, in the
# same sorted key order a plain groupby iteration would produce.
splits_per_study = cxr_jpg_split.groupby(
    ['subject_id', 'study_id'])['split'].nunique()

for subject_id, study_id in splits_per_study[splits_per_study > 1].index:
    print(
        f"Subject ID: {subject_id}, Study ID: {study_id} has images in different splits.")

In [5]:
# Reduce the split table to one row per (subject_id, study_id, split): the
# dicom_id column is not needed any more, and rows that become identical once
# it is gone are collapsed to their first occurrence.
cxr_jpg_split = (
    cxr_jpg_split
    .drop(columns=['dicom_id'])
    .drop_duplicates(subset=['subject_id', 'study_id', 'split'], keep='first')
)

cxr_jpg_split

Unnamed: 0,study_id,subject_id,split
0,50414267,10000032,train
2,53189527,10000032,train
4,53911762,10000032,train
6,56699142,10000032,train
7,57375967,10000764,train
...,...,...,...
377103,58708861,19999442,train
377104,57132437,19999733,train
377107,55368167,19999987,train
377108,58621812,19999987,train


In [6]:
# Attach the split of each study to the report table. The left join keeps every
# report row; rows without a match in cxr_jpg_split end up with split = NaN.
df = pd.merge(df, cxr_jpg_split, on=['subject_id', 'study_id'], how='left')

In [7]:
# Number of report rows per split (value_counts leaves out rows whose split is NaN).
df['split'].value_counts()

split
train       211662
test          2920
validate      1718
Name: count, dtype: int64

In [8]:
# Look at the DICOM paths of the first row; iloc[0] selects by position, not by index label.
df.iloc[0]['dicom_paths']

['files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.dcm',
 'files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.dcm']

In [9]:
# Derive the JPG paths of every study from its DICOM paths by swapping the
# '.dcm' part of each path for '.jpg'.
df['jpg_paths'] = df['dicom_paths'].map(
    lambda dicom_list: [dicom.replace('.dcm', '.jpg') for dicom in dicom_list])

In [10]:
# Look at the JPG paths of the first row; iloc[0] selects by position, not by index label.
df.iloc[0]['jpg_paths']

['files/p10/p10000032/s50414267/02aa804e-bde0afdd-112c0b34-7bc16630-4e384014.jpg',
 'files/p10/p10000032/s50414267/174413ec-4ec4c1f7-34ea26b7-c5f994f8-79ef1962.jpg']

In [11]:
# Inspect the frame again; it now also carries the split and jpg_paths columns.
df

Unnamed: 0,patient_folder,subject_id,study_id,FINDINGS,IMPRESSION,dicom_paths,REPORT,sentences,sentence_length,LETTER_COUNT,WORD_COUNT,split,jpg_paths
0,p10,10000032,50414267,"There is no focal consolidation, pleural effus...",No acute cardiopulmonary process.,[files/p10/p10000032/s50414267/02aa804e-bde0af...,"There is no focal consolidation, pleural effus...","[There is no focal consolidation, pleural effu...","[9, 9, 5, 10, 6, 12, 4]",402,55,train,[files/p10/p10000032/s50414267/02aa804e-bde0af...
1,p10,10000032,53189527,"The cardiac, mediastinal and hilar contours ar...",No acute cardiopulmonary abnormality.,[files/p10/p10000032/s53189527/2a2277a9-b0ded1...,"The cardiac, mediastinal and hilar contours ar...","[The cardiac, mediastinal and hilar contours a...","[8, 4, 3, 7, 10, 8, 4]",318,44,train,[files/p10/p10000032/s53189527/2a2277a9-b0ded1...
2,p10,10000032,53911762,Single frontal view of the chest provided. ...,No acute intrathoracic process.,[files/p10/p10000032/s53911762/68b5c4b1-227d04...,Single frontal view of the chest provided. ...,"[Single frontal view of the chest provided., T...","[7, 8, 5, 15, 9, 4]",336,48,train,[files/p10/p10000032/s53911762/68b5c4b1-227d04...
3,p10,10000032,56699142,"The lungs are clear of focal consolidation, pl...",No acute cardiopulmonary process.,[files/p10/p10000032/s56699142/ea030e7a-2e3b13...,"The lungs are clear of focal consolidation, pl...","[The lungs are clear of focal consolidation, p...","[11, 5, 5, 15, 4]",271,40,train,[files/p10/p10000032/s56699142/ea030e7a-2e3b13...
4,p10,10000764,57375967,PA and lateral views of the chest provided. ...,"Focal consolidation at the left lung base, pos...",[files/p10/p10000764/s57375967/096052b7-d256dc...,PA and lateral views of the chest provided. ...,"[PA and lateral views of the chest provided., ...","[8, 5, 15, 5, 6, 9, 7, 12, 3]",499,70,train,[files/p10/p10000764/s57375967/096052b7-d256dc...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
216295,p19,19999442,58708861,ET tube ends 4.7 cm above the carina. NG tube...,,[files/p19/p19999442/s58708861/16b6c70f-6d36bd...,ET tube ends 4.7 cm above the carina. NG tube...,"[ET tube ends 4.7 cm above the carina., NG tub...","[8, 6, 5, 6, 7, 6]",237,38,train,[files/p19/p19999442/s58708861/16b6c70f-6d36bd...
216296,p19,19999733,57132437,"The lungs are clear, and the cardiomediastinal...",No acute cardiothoracic process.,[files/p19/p19999733/s57132437/3fcd0406-9b1116...,"The lungs are clear, and the cardiomediastinal...","[The lungs are clear, and the cardiomediastina...","[12, 8, 4]",163,24,train,[files/p19/p19999733/s57132437/3fcd0406-9b1116...
216297,p19,19999987,55368167,There has been interval extubation and improve...,,[files/p19/p19999987/s55368167/58766883-376a15...,There has been interval extubation and improve...,[There has been interval extubation and improv...,"[14, 26, 34, 6, 5]",554,85,train,[files/p19/p19999987/s55368167/58766883-376a15...
216298,p19,19999987,58621812,Portable supine AP view of the chest provided ...,Appropriately positioned ET and NG tubes. Bib...,[files/p19/p19999987/s58621812/7ba273af-3d290f...,Portable supine AP view of the chest provided ...,[Portable supine AP view of the chest provided...,"[21, 9, 4, 7, 5, 6, 2]",388,54,train,[files/p19/p19999987/s58621812/7ba273af-3d290f...


In [12]:
# One frame per split value; rows whose split is NaN end up in none of them.
split_column = df['split']
df_train = df.loc[split_column.eq('train')]
df_val = df.loc[split_column.eq('validate')]
df_test = df.loc[split_column.eq('test')]

# Report the size of each partition.
print(f"Number of rows in df_train: {len(df_train)}")
print(f"Number of rows in df_val: {len(df_val)}")
print(f"Number of rows in df_test: {len(df_test)}")

Number of rows in df_train: 211662
Number of rows in df_val: 1718
Number of rows in df_test: 2920


## Creating the MIMIC 5x200 dataset for zero-shot image classification

First, I would like to check whether this dataset can be built from the test split alone.

In [13]:
# Per-study label table, keyed by subject_id / study_id. In the displayed output the
# label cells hold 1.0, 0.0, -1.0 or are empty (NaN).
# NOTE(review): presumably 1 = positive, 0 = negative, -1 = uncertain and NaN = not
# mentioned, as produced by the CheXpert labeler — confirm against the dataset docs.
cxr_jpg_chexpert = pd.read_csv(
    '/scratch1/MIMIC/physionet.org/files/mimic-cxr-jpg/2.0.0/mimic-cxr-2.0.0-chexpert.csv.gz', compression='gzip')
cxr_jpg_chexpert

Unnamed: 0,subject_id,study_id,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
0,10000032,50414267,,,,,,,,,1.0,,,,,
1,10000032,53189527,,,,,,,,,1.0,,,,,
2,10000032,53911762,,,,,,,,,1.0,,,,,
3,10000032,56699142,,,,,,,,,1.0,,,,,
4,10000764,57375967,,,1.0,,,,,,,,,-1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227822,19999442,58708861,,,,,,,,,1.0,,,,,1.0
227823,19999733,57132437,,,,,,,,,1.0,,,,,
227824,19999987,55368167,1.0,-1.0,,,,,0.0,,,0.0,,,0.0,
227825,19999987,58621812,1.0,,,,,,,,,,,,,1.0


In [14]:
# Every column after the first two (subject_id, study_id) is a label column.
all_labels = list(cxr_jpg_chexpert.columns[2:])

In [15]:
# Show the collected label names.
all_labels

['Atelectasis',
 'Cardiomegaly',
 'Consolidation',
 'Edema',
 'Enlarged Cardiomediastinum',
 'Fracture',
 'Lung Lesion',
 'Lung Opacity',
 'No Finding',
 'Pleural Effusion',
 'Pleural Other',
 'Pneumonia',
 'Pneumothorax',
 'Support Devices']

In [16]:
def get_eval_labels_occurences(df, labels, chexpert=None):
    """Join ``df`` with the label table and keep the studies with a single positive label.

    Parameters
    ----------
    df : pandas.DataFrame
        Report rows; must contain the ``subject_id`` and ``study_id`` columns.
    labels : list of str
        Names of the label columns to inspect.
    chexpert : pandas.DataFrame, optional
        Label table that is inner-joined on ``subject_id`` / ``study_id``.
        Defaults to the notebook-level ``cxr_jpg_chexpert`` frame.

    Returns
    -------
    pandas.DataFrame
        The joined rows in which none of ``labels`` equals -1 and the values of
        ``labels`` sum to exactly 1. Missing label values are returned as 0;
        missing values in all other columns are left untouched.

    Side effects
    ------------
    Prints the row counts after each filtering step and the value counts of
    every label in the result.
    """
    if chexpert is None:
        chexpert = cxr_jpg_chexpert
    labels = list(labels)
    key_cols = ['subject_id', 'study_id']

    eval_dataset = pd.merge(df, chexpert, on=key_cols, how='inner')

    # Replace NaN by 0 in the label columns only. A frame-wide fillna(0) would
    # also write the integer 0 into missing non-label cells (e.g. report text).
    label_cols = [
        col for col in eval_dataset.columns
        if col in labels or (col in chexpert.columns and col not in key_cols)
    ]
    if label_cols:
        eval_dataset[label_cols] = eval_dataset[label_cols].fillna(0)

    print(f"Number of rows in eval: {len(eval_dataset)}")

    # Drop every row in which at least one of the requested labels is -1.
    eval_dataset = eval_dataset[(eval_dataset[labels] != -1).all(axis=1)]

    # With the -1 rows gone, a label sum of 1 selects the rows whose requested
    # labels add up to a single positive entry.
    filtered_df = eval_dataset[eval_dataset[labels].sum(axis=1) == 1]

    print(f"Number of rows in eval without -1 values: {len(eval_dataset)}")
    print(f"Number of rows in filtered_df: {len(filtered_df)}")

    # Display how often each label value occurs in the filtered rows.
    for label in labels:
        print(f"{label}: {filtered_df[label].value_counts()}")

    return filtered_df

In [17]:
# Count the single-label studies per label in the test split; the returned frame is displayed.
get_eval_labels_occurences(df_test, all_labels)

Number of rows in eval: 2920
Number of rows in eval without -1 values: 2047
Number of rows in filtered_df: 816
Atelectasis: Atelectasis
0.0    788
1.0     28
Name: count, dtype: int64
Cardiomegaly: Cardiomegaly
0.0    746
1.0     70
Name: count, dtype: int64
Consolidation: Consolidation
0.0    809
1.0      7
Name: count, dtype: int64
Edema: Edema
0.0    755
1.0     61
Name: count, dtype: int64
Enlarged Cardiomediastinum: Enlarged Cardiomediastinum
0.0    809
1.0      7
Name: count, dtype: int64
Fracture: Fracture
0.0    796
1.0     20
Name: count, dtype: int64
Lung Lesion: Lung Lesion
0.0    802
1.0     14
Name: count, dtype: int64
Lung Opacity: Lung Opacity
0.0    760
1.0     56
Name: count, dtype: int64
No Finding: No Finding
1.0    426
0.0    390
Name: count, dtype: int64
Pleural Effusion: Pleural Effusion
0.0    758
1.0     58
Name: count, dtype: int64
Pleural Other: Pleural Other
0.0    809
1.0      7
Name: count, dtype: int64
Pneumonia: Pneumonia
0.0    771
1.0     45
Name: count

Unnamed: 0,patient_folder,subject_id,study_id,FINDINGS,IMPRESSION,dicom_paths,REPORT,sentences,sentence_length,LETTER_COUNT,...,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices
2,p10,10046166,50051329,Lateral view somewhat limited due to overlying...,No evidence of acute cardiopulmonary process.,[files/p10/p10046166/s50051329/427446c1-881f5c...,Lateral view somewhat limited due to overlying...,[Lateral view somewhat limited due to overlyin...,"[9, 6, 9, 18, 6, 4, 8, 14, 6]",537,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,p10,10046166,51738740,,No acute intrathoracic process.,[files/p10/p10046166/s51738740/3a8a17fc-3cd357...,No acute intrathoracic process.,[ No acute intrathoracic process.],[4],32,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,p10,10046166,53492798,Frontal and lateral radiographs of the chest r...,No acute cardiopulmonary process.,[files/p10/p10046166/s53492798/18f0fd6d-f513af...,Frontal and lateral radiographs of the chest r...,[Frontal and lateral radiographs of the chest ...,"[29, 22, 16, 9, 12, 7, 11, 4]",750,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,p10,10046166,57379357,Frontal and lateral views of the chest were ob...,No radiographic findings to suggest pneumonia.,[files/p10/p10046166/s57379357/6e511483-c7e160...,Frontal and lateral views of the chest were ob...,[Frontal and lateral views of the chest were o...,"[9, 30, 21, 7, 14, 6]",603,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,p10,10046166,57977208,"In comparison with the study of ___, there is ...",,[files/p10/p10046166/s57977208/e2856783-ffa5ec...,"In comparison with the study of ___, there is ...","[In comparison with the study of ___, there is...","[13, 12]",161,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2910,p19,19991135,51777681,PA and lateral radiographs of the chest were a...,1. No significant interval change. 2. Post-...,[files/p19/p19991135/s51777681/3272470c-530109...,PA and lateral radiographs of the chest were a...,[PA and lateral radiographs of the chest were ...,"[9, 24, 13, 21, 8, 6, 5, 13, 6, 1, 4, 1, 8]",825,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2911,p19,19991135,54103833,AP single view of the chest has been obtained ...,Stable chest findings as seen on portable foll...,[files/p19/p19991135/s54103833/6ce54ac9-077864...,AP single view of the chest has been obtained ...,[AP single view of the chest has been obtained...,"[15, 17, 23, 14, 14]",557,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2914,p19,19991135,56587463,,"In comparison to ___, no relevant change is se...",[files/p19/p19991135/s56587463/56112caf-112b95...,"In comparison to ___, no relevant change is s...","[ In comparison to ___, no relevant change is ...","[9, 4, 2, 2, 8]",168,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2915,p19,19991135,56918032,"In comparison with study of ___, there is agai...",,[files/p19/p19991135/s56918032/2d0477bb-67599f...,"In comparison with study of ___, there is agai...","[In comparison with study of ___, there is aga...","[20, 9, 14]",257,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Attach the label columns to the training rows (inner join on the study keys)
# and replace every NaN of the merged frame with 0.
df_train = df_train.merge(
    cxr_jpg_chexpert, how='inner', on=['subject_id', 'study_id']
).fillna(0)

In [19]:
# Attach the label columns to the test rows (inner join on the study keys)
# and replace every NaN of the merged frame with 0.
df_test = df_test.merge(
    cxr_jpg_chexpert, how='inner', on=['subject_id', 'study_id']
).fillna(0)

In [20]:
# Attach the label columns to the validation rows (inner join on the study keys)
# and replace every NaN of the merged frame with 0.
df_val = df_val.merge(
    cxr_jpg_chexpert, how='inner', on=['subject_id', 'study_id']
).fillna(0)

In [21]:
# The five labels that make up the evaluation set.
interested_labels = [
    'Atelectasis',
    'Cardiomegaly',
    'Edema',
    'Pleural Effusion',
    'Pleural Other',
]

# Start from an empty frame; the selected rows are collected into it later.
eval_set = pd.DataFrame()

In [22]:
# Work on copies: the selection loop below removes rows from train_df, and
# df_train / df_test should stay as they are.
train_df = df_train.copy()
test_df = df_test.copy()

In [23]:
# Build the evaluation set: for every label of interest collect N_PER_LABEL studies
# in which that label is the only positive one. Rows are taken from the test split
# first; if it holds too few, the remainder is sampled from the train split and
# removed from train_df so that those studies are not trained on.
# Note: train_df shrinks on every run of this cell; re-create the working copies
# (test_df / train_df) before re-running it.
N_PER_LABEL = 200   # target number of studies per label
SAMPLE_SEED = 42    # fixed seed so the sampled rows are reproducible


def _single_label_rows(frame, label):
    """Return the rows where `label` is 1, the label columns sum to 1 and none is negative."""
    label_values = frame[frame.columns.intersection(all_labels)]
    is_exclusive = (label_values.sum(axis=1) == 1) & (
        label_values.min(axis=1) >= 0)
    return frame[(frame[label] == 1) & is_exclusive]


# Collect the per-label selections and concatenate once at the end, so that
# re-running the cell rebuilds eval_set instead of appending to an old one.
selected_parts = []

for label in interested_labels:
    label_rows_test = _single_label_rows(test_df, label)

    print(f"Found {len(label_rows_test)} instances of {label} in the test set.")

    if len(label_rows_test) < N_PER_LABEL:
        # Not enough test rows: top up from the train set.
        shortfall = N_PER_LABEL - len(label_rows_test)

        print(
            f"Shortfall of {shortfall} instances for {label}. Looking in train set...")

        label_rows_train = _single_label_rows(train_df, label)

        print(
            f"Found {len(label_rows_train)} instances of {label} in the train set.")

        # Fail with a clear message instead of the generic sampling error.
        if len(label_rows_train) < shortfall:
            raise ValueError(
                f"Only {len(label_rows_train)} train instances of {label} available, "
                f"but {shortfall} are needed.")

        label_rows_train = label_rows_train.sample(
            n=shortfall, replace=False, random_state=SAMPLE_SEED)

        # The sampled rows now belong to the evaluation set only.
        train_df = train_df.drop(label_rows_train.index)

        label_rows = pd.concat([label_rows_test, label_rows_train])
    else:
        # Enough test rows: sample the target number from them.
        label_rows = label_rows_test.sample(
            n=N_PER_LABEL, replace=False, random_state=SAMPLE_SEED)

    print(f"Selected {len(label_rows)} instances for {label}.")

    selected_parts.append(label_rows)

# Fresh 0..n-1 indices for the evaluation set and for the reduced training set.
eval_set = (pd.concat(selected_parts).reset_index(drop=True)
            if selected_parts else pd.DataFrame())
train_df = train_df.reset_index(drop=True)

print(f"Evaluation set size: {len(eval_set)}")
print(f"Train set size after removing selected instances: {len(train_df)}")

Found 28 instances of Atelectasis in the test set.
Shortfall of 172 instances for Atelectasis. Looking in train set...
Found 3981 instances of Atelectasis in the train set.
Selected 200 instances for Atelectasis.
Found 70 instances of Cardiomegaly in the test set.
Shortfall of 130 instances for Cardiomegaly. Looking in train set...
Found 5549 instances of Cardiomegaly in the train set.
Selected 200 instances for Cardiomegaly.
Found 61 instances of Edema in the test set.
Shortfall of 139 instances for Edema. Looking in train set...
Found 2139 instances of Edema in the train set.
Selected 200 instances for Edema.
Found 58 instances of Pleural Effusion in the test set.
Shortfall of 142 instances for Pleural Effusion. Looking in train set...
Found 3436 instances of Pleural Effusion in the train set.
Selected 200 instances for Pleural Effusion.
Found 7 instances of Pleural Other in the test set.
Shortfall of 193 instances for Pleural Other. Looking in train set...
Found 236 instances of Ple

In [24]:
# Drop the unnecessary columns
train = train_df[['subject_id', 'study_id', 'REPORT', 'sentences', 'jpg_paths', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
                  'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']]
val = df_val[['subject_id', 'study_id', 'REPORT', 'sentences', 'jpg_paths', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
              'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']]
MIMIC_5x200 = eval_set[['subject_id', 'study_id', 'REPORT', 'sentences', 'jpg_paths', 'Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema', 'Enlarged Cardiomediastinum',
                        'Fracture', 'Lung Lesion', 'Lung Opacity', 'No Finding', 'Pleural Effusion', 'Pleural Other', 'Pneumonia', 'Pneumothorax', 'Support Devices']]

In [26]:
# Persist the three frames as pickle files (no CSV is written here).
for frame, target_path in (
    (train, '/home/lsiefermann/open_clip_based_thesis/myUtils/train_mimic.pkl'),
    (val, '/home/lsiefermann/open_clip_based_thesis/myUtils/val_mimic.pkl'),
    (MIMIC_5x200, '/home/lsiefermann/open_clip_based_thesis/myUtils/eval_mimic_5x200.pkl'),
):
    frame.to_pickle(target_path)

## Create a small test dataset with one random image and one random sentence per study

In [29]:
import pandas as pd
import random

# Small subsets for quick experiments: the first 5000 training rows and the
# first 1000 validation rows, taken in order (not sampled at random).
df_val_sampled = val.iloc[:1000].copy()
df_train_sampled = train.iloc[:5000].copy()

In [30]:
# Persist the reduced subsets as CSV (current working directory) and as pickle.
# The CSV files are written from the sampled frames, so that their content
# matches the "5000" / "1000" in the file names and the pickles next to them.
df_train_sampled.to_csv('train_5000_mimic.csv', index=False)
df_train_sampled.to_pickle(
    '/home/lsiefermann/open_clip_based_thesis/myUtils/train_5000_mimic.pkl')

df_val_sampled.to_csv('val_1000_mimic.csv', index=False)
df_val_sampled.to_pickle(
    '/home/lsiefermann/open_clip_based_thesis/myUtils/val_1000_mimic.pkl')

In [None]:
random.seed(422)  # fixed seed so the random picks are reproducible


def _pick_random_entry(candidates):
    """Return one random element of a non-empty list; None for anything else."""
    if isinstance(candidates, list) and candidates:
        return random.choice(candidates)
    return None


pair_columns = ['jpg_paths', 'sentences']

# Reduce each list-valued cell to a single random entry. The processing order
# (train before val, jpg_paths before sentences) determines which random numbers
# each cell receives, so it is kept fixed.
for column in pair_columns:
    df_train_sampled[column] = df_train_sampled[column].apply(
        _pick_random_entry)
for column in pair_columns:
    df_val_sampled[column] = df_val_sampled[column].apply(_pick_random_entry)

# Keep only the image/sentence pair and renumber the rows from 0.
df_train_sampled = df_train_sampled[pair_columns].reset_index(drop=True)
df_val_sampled = df_val_sampled[pair_columns].reset_index(drop=True)