### Purpose: compare the VinDr and CheXpert probes to help me figure out which probe's outputs to feed into the prompt

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from pathlib import Path


# PROBE ORDER IS SIGNIFICANT: each list is in the label order that was used to
# train the corresponding probe, and probe outputs are mapped back to names by
# position. Do not reorder, insert or remove entries.
vindr_pathologies = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Clavicle fracture",
    "Consolidation",
    "Emphysema",
    "Enlarged PA",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Lung cavity",
    "Lung cyst",
    "Mediastinal shift",
    "Nodule/Mass",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis",
    "Rib fracture",
    "Other lesion",
    "No finding",
]

cheXpert_pathologies = [
    'No Finding',
    'Enlarged Cardiomediastinum',
    'Cardiomegaly',
    'Lung Opacity',
    'Lung Lesion',
    'Edema',
    'Consolidation',
    'Pneumonia',
    'Atelectasis',
    'Pneumothorax',
    'Pleural Effusion',
    'Pleural Other',
    'Fracture',
    'Support Devices',
]

In [2]:
class BinaryMultiPathologyPresenceDataset(Dataset):
    """Dataset over a dataframe of stored embeddings and binary pathology labels.

    Indexing yields ``(image_id, embeddings, labels)``:

    * ``image_id``   - the row's ``'image_id'`` value, returned unchanged.
    * ``embeddings`` - the row's ``layer`` column converted to a float tensor.
      The stored value is converted as-is; no dimension is squeezed or
      reshaped here, so any leading singleton dimension is kept.
    * ``labels``     - float tensor with one entry per name in
      ``pathology_columns`` (in that order), each cast through ``int`` first.
    """

    def __init__(self, dataframe, pathology_columns, layer="post_layer_norm"):
        # `layer` names the dataframe column holding the embeddings
        # (e.g. "post_layer_norm" or "q_former").
        self.layer = layer
        self.pathology_columns = pathology_columns
        self.dataframe = dataframe

    def __len__(self):
        # One sample per dataframe row.
        return len(self.dataframe)

    def __getitem__(self, idx):
        record = self.dataframe.iloc[idx]

        # Embedding for the configured layer, as float32.
        features = torch.tensor(record[self.layer], dtype=torch.float)

        # Labels are forced to int (so e.g. 1.0 / True become 1) before being
        # stored as floats.
        targets = torch.tensor(
            list(map(int, record[self.pathology_columns].values)),
            dtype=torch.float,
        )

        return record['image_id'], features, targets

class LinearClassifier(nn.Module):
    """Linear probe: a single ``nn.Linear`` from ``num_features`` to ``num_classes``.

    ``forward`` returns the raw linear output; no activation is applied here.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # Attribute name `linear` determines the state-dict keys
        # ("linear.weight", "linear.bias").
        self.linear = nn.Linear(num_features, num_classes)

    def forward(self, x):
        # For a [batch_size, num_features] input the result is
        # [batch_size, num_classes].
        return self.linear(x)

In [7]:
# Directory that holds the trained probe checkpoints.
PROBE_CHECKPOINT_DIR = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection")

# Input width the probes are constructed with (used for both checkpoints).
POST_LAYER_NORM_DIM = 1408

# VinDr probe: one output per entry in `vindr_pathologies`.
vindr_layer_norm_weights = PROBE_CHECKPOINT_DIR / "post_layer_norm_best_vindr_model.pth"
vindr_layer_norm_model = LinearClassifier(POST_LAYER_NORM_DIM, len(vindr_pathologies))
# map_location="cpu" lets the checkpoint load on machines without CUDA even if
# it was saved from a GPU; the model itself is kept on the CPU in this notebook.
vindr_layer_norm_model.load_state_dict(torch.load(vindr_layer_norm_weights, map_location="cpu"))


# CheXpert probe: one output per entry in `cheXpert_pathologies`.
chexpert_layer_norm_weights = PROBE_CHECKPOINT_DIR / "post_layer_norm_best_chexpert_model.pth"
chexpert_layer_norm_model = LinearClassifier(POST_LAYER_NORM_DIM, len(cheXpert_pathologies))
# Left as the cell's last expression so the key-matching report is displayed.
chexpert_layer_norm_model.load_state_dict(torch.load(chexpert_layer_norm_weights, map_location="cpu"))

<All keys matched successfully>

In [22]:
def generate_model_outputs_on_test_data(model,layer, test_data, pathologies, probabilities = False, shuffle = False, return_dict = False):
    """Run a probe over every row of ``test_data`` and collect its outputs per image.

    Args:
        model: module called on the flattened embeddings; it is switched to
            eval mode and its output is passed through a sigmoid.
        layer: name of the ``test_data`` column that holds the embeddings.
        test_data: dataframe with an ``'image_id'`` column and the ``layer`` column.
        pathologies: label names, in the same order as the model's outputs.
        probabilities: if True, return sigmoid probabilities instead of the
            set of labels whose probability exceeds 0.5.
        shuffle: if True, iterate over a randomly permuted copy of ``test_data``.
        return_dict: only used when ``probabilities`` is True; if True the
            probabilities are returned as ``{pathology: probability}`` instead
            of a plain list.

    Returns:
        dict mapping image_id to one of:
          * ``{pathology: probability}``  (probabilities and return_dict),
          * ``[probability, ...]``         (probabilities only),
          * ``{pathology, ...}``           (default) - the labels above 0.5. When
            none is above 0.5 the set holds a single "no finding" label, spelled
            as it appears in ``pathologies`` (e.g. VinDr's "No finding") and
            falling back to "No Finding" if ``pathologies`` has no such entry.
    """
    model.eval()

    if shuffle:
        test_data = test_data.sample(frac=1).reset_index(drop=True)

    # Use the dataset's own spelling of the "no finding" label so the fallback
    # is consistent with the label names in `pathologies`.
    no_finding_label = next(
        (name for name in pathologies if name.lower() == "no finding"),
        "No Finding",
    )

    image_id_to_predictions = {}
    for image_id, embeddings in zip(test_data['image_id'], test_data[layer]):
        embeddings = torch.tensor(embeddings, dtype=torch.float)

        # Inference only: never build an autograd graph, in either output mode.
        with torch.no_grad():
            # Keep the first dimension and flatten the rest.
            outputs = model(embeddings.view(embeddings.size(0), -1))
            scores = torch.sigmoid(outputs)

        if probabilities:
            values = scores.squeeze().tolist()
            if return_dict:
                values = {name: values[i] for i, name in enumerate(pathologies)}
            image_id_to_predictions[image_id] = values
            continue

        # Threshold at 0.5 and keep the names of the activated outputs.
        flags = (scores > 0.5).squeeze().tolist()
        activated = {name for i, name in enumerate(pathologies) if flags[i]}
        image_id_to_predictions[image_id] = activated if activated else {no_finding_label}

    return image_id_to_predictions

In [8]:
# Collated VinDr test-set frame (embeddings plus pathology labels).
test_all_pathology_path = Path(
    "/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/VinDr/no_tuning_4934e91451945c8218c267aae9c34929a7677829/collated_test_all.pkl"
)
test_all_pathology = pd.read_pickle(test_all_pathology_path)

# Image ids belonging to the "test" part of the three-way test-set split,
# one id per line.
VinDr_test_test_split_path = Path(
    "/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/test_set_three_splits/VinDr_test_test_split.txt"
)
test_test_image_ids = VinDr_test_test_split_path.read_text().splitlines()

# Keep only the rows whose image_id is listed in that split file.
test_test_pathology_df = test_all_pathology.loc[
    test_all_pathology["image_id"].isin(test_test_image_ids)
]

In [9]:
# Collated CheXpert-small test frame.
cheXpert_test_df = pd.read_pickle(
    Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/CheXpert-small/collated_test_df.pkl")
)

In [10]:
# List the columns of the filtered VinDr test-split dataframe.
print(test_test_pathology_df.columns)

Index(['image_id', 'patch_embeddings', 'post_layer_norm', 'q_former',
       'language_projection', 'Clavicle fracture', 'No finding',
       'Calcification', 'Lung Opacity', 'Nodule/Mass', 'Pneumothorax',
       'Atelectasis', 'Infiltration', 'Consolidation', 'Aortic enlargement',
       'Other lesion', 'Emphysema', 'Lung cavity', 'Lung cyst',
       'Pulmonary fibrosis', 'ILD', 'Cardiomegaly', 'Pleural effusion',
       'Pleural thickening', 'Rib fracture', 'Enlarged PA',
       'Mediastinal shift', 'Clavicle fracture position',
       'No finding position', 'Calcification position',
       'Lung Opacity position', 'Nodule/Mass position',
       'Pneumothorax position', 'Atelectasis position',
       'Infiltration position', 'Consolidation position',
       'Aortic enlargement position', 'Other lesion position',
       'Emphysema position', 'Lung cavity position', 'Lung cyst position',
       'Pulmonary fibrosis position', 'ILD position', 'Cardiomegaly position',
       'Pleural effu

In [11]:
# List the columns of the CheXpert test dataframe.
print(cheXpert_test_df.columns)

Index(['image_id', 'patch_embeddings', 'post_layer_norm', 'q_former',
       'language_projection', 'No Finding', 'Enlarged Cardiomediastinum',
       'Cardiomegaly', 'Lung Opacity', 'Lung Lesion', 'Edema', 'Consolidation',
       'Pneumonia', 'Atelectasis', 'Pneumothorax', 'Pleural Effusion',
       'Pleural Other', 'Fracture', 'Support Devices'],
      dtype='object')


In [14]:
# Randomly select n rows from the CheXpert dataframe that are labelled "No Finding".
# The draw is seeded so that re-running the notebook inspects the same images.
n = 10
NO_FINDING_SAMPLE_SEED = 0
cheXpert_test_no_finding = cheXpert_test_df[cheXpert_test_df["No Finding"] == 1].sample(
    n=n, random_state=NO_FINDING_SAMPLE_SEED
)

In [23]:
# Score the sampled CheXpert "No Finding" rows with each probe, keeping the
# per-pathology probabilities as {pathology: probability} dicts keyed by image id.
vindr_layer_norm_no_finding_predictions = generate_model_outputs_on_test_data(
    model=vindr_layer_norm_model,
    layer="post_layer_norm",
    test_data=cheXpert_test_no_finding,
    pathologies=vindr_pathologies,
    probabilities=True,
    return_dict=True,
)
chexpert_layer_norm_no_finding_predictions = generate_model_outputs_on_test_data(
    model=chexpert_layer_norm_model,
    layer="post_layer_norm",
    test_data=cheXpert_test_no_finding,
    pathologies=cheXpert_pathologies,
    probabilities=True,
    return_dict=True,
)


In [30]:
def _print_probe_report(probe_name, predictions, threshold):
    """Print one probe's probabilities (sorted by pathology name) and its activations.

    Pathologies whose probability is strictly above ``threshold`` are printed
    one per line; if there are none, "No Finding" is printed instead.
    """
    print(probe_name)
    ordered = dict(sorted(predictions.items()))
    print(ordered)
    activated = [name for name, probability in ordered.items() if probability > threshold]
    for name in activated:
        print(name)
    if not activated:
        print("No Finding")


# For both probes, print the predictions for each sampled image id.
threshold = 0.5
for image_id in vindr_layer_norm_no_finding_predictions:
    # The sampled images come from the CheXpert test frame, so label them as such
    # (the header previously said "VinDr", which named the wrong dataset).
    print(f"CheXpert No Finding Image ID: {image_id}")
    _print_probe_report("VinDr probe", vindr_layer_norm_no_finding_predictions[image_id], threshold)
    _print_probe_report("CheXpert probe", chexpert_layer_norm_no_finding_predictions[image_id], threshold)

    print("\n")


VinDr No Finding Image ID: patient65013/study1/view3_lateral.jpg
VinDr probe
{'Aortic enlargement': 0.10446276515722275, 'Atelectasis': 0.07806508988142014, 'Calcification': 0.13275589048862457, 'Cardiomegaly': 0.22468256950378418, 'Clavicle fracture': 0.05900351330637932, 'Consolidation': 0.024012327194213867, 'Emphysema': 0.033105723559856415, 'Enlarged PA': 0.04055171087384224, 'ILD': 0.41254639625549316, 'Infiltration': 0.003186184214428067, 'Lung Opacity': 0.2535903751850128, 'Lung cavity': 0.0066094291396439075, 'Lung cyst': 0.003683160524815321, 'Mediastinal shift': 0.07937994599342346, 'No finding': 0.10060049593448639, 'Nodule/Mass': 0.042468417435884476, 'Other lesion': 0.48892971873283386, 'Pleural effusion': 0.09210673719644547, 'Pleural thickening': 0.5887652039527893, 'Pneumothorax': 0.07990998774766922, 'Pulmonary fibrosis': 0.1406436264514923, 'Rib fracture': 0.04850078001618385}
Pleural thickening
CheXpert probe
{'Atelectasis': 0.21724015474319458, 'Cardiomegaly': 0.14