In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from pathlib import Path


# IMPORTANT: the order of each list below is the order that was used to train
# the probes, so output index i of a probe corresponds to entry i here.
# Do not reorder, rename, or re-case any entry.
vindr_pathologies = [
    "Aortic enlargement",
    "Atelectasis",
    "Calcification",
    "Cardiomegaly",
    "Clavicle fracture",
    "Consolidation",
    "Emphysema",
    "Enlarged PA",
    "ILD",
    "Infiltration",
    "Lung Opacity",
    "Lung cavity",
    "Lung cyst",
    "Mediastinal shift",
    "Nodule/Mass",
    "Pleural effusion",
    "Pleural thickening",
    "Pneumothorax",
    "Pulmonary fibrosis",
    "Rib fracture",
    "Other lesion",
    "No finding",
]


cheXpert_pathologies = [
    "No Finding",
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

Dataset-agnostic code

In [5]:
class BinaryMultiPathologyPresenceDataset(Dataset):
    """Dataset exposing one dataframe row as ``(image_id, embeddings, labels)``.

    Args:
        dataframe: table indexed positionally via ``.iloc``; each row must
            provide an ``'image_id'`` entry, the embedding column named by
            ``layer``, and every column listed in ``pathology_columns``.
        pathology_columns: label column names; their order fixes the order of
            the returned label vector.
        layer: name of the embedding column to read. Defaults to
            ``"post_layer_norm"``; ``"q_former"`` is the other column used in
            this notebook.
    """

    def __init__(self, dataframe, pathology_columns, layer="post_layer_norm"):
        self.dataframe = dataframe
        self.pathology_columns = pathology_columns
        self.layer = layer

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image_id, embeddings, labels)`` for the row at position ``idx``."""
        sample = self.dataframe.iloc[idx]

        # The stored embedding is converted to a float tensor as-is; no
        # dimension is squeezed or reshaped here (an earlier note described
        # q_former embeddings as [1, 128, 768] -- any leading axis is kept).
        features = torch.tensor(sample[self.layer], dtype=torch.float)

        # Labels are cast to int first, then stored as a float tensor.
        targets = torch.tensor(
            [int(flag) for flag in sample[self.pathology_columns].values],
            dtype=torch.float,
        )

        return sample['image_id'], features, targets

class LinearClassifier(nn.Module):
    """Linear probe: a single fully connected layer producing raw logits.

    Args:
        num_features: size of the (flattened) input feature vector.
        num_classes: number of output logits.
    """

    def __init__(self, num_features, num_classes):
        super().__init__()
        # The attribute name `linear` determines the state-dict keys
        # ("linear.weight", "linear.bias"), so it must not be renamed.
        self.linear = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, num_features] to [batch_size, num_classes]."""
        return self.linear(x)

In [6]:
def generate_model_outputs_on_test_data(model,layer, test_data, pathologies):
    """Run a probe over every row of ``test_data`` and collect predicted labels.

    Args:
        model: module mapping a [1, num_features] float tensor to one logit
            per entry of ``pathologies``. It is switched to eval mode.
        layer: name of the ``test_data`` column holding the embedding.
        test_data: dataframe with an ``'image_id'`` column and the ``layer``
            column. Each embedding's first axis is kept and the remaining
            axes are flattened before being fed to ``model``.
        pathologies: label names, in the same order as the model outputs.

    Returns:
        dict mapping each image_id to the set of labels whose sigmoid
        probability exceeds 0.5. If no label fires, the set holds the
        "no finding" label exactly as spelled in ``pathologies`` (matched
        case-insensitively), or ``"No Finding"`` when ``pathologies`` has no
        such entry. Rows sharing an image_id overwrite earlier ones.

    Raises:
        ValueError: if the number of model outputs for a row differs from
            ``len(pathologies)``, which would otherwise mislabel predictions.
    """
    model.eval()

    # The label vocabularies differ in casing ("No finding" vs "No Finding"),
    # so reuse the caller's own spelling for the empty-prediction fallback.
    no_finding_label = next(
        (name for name in pathologies if str(name).strip().lower() == "no finding"),
        "No Finding",
    )

    image_id_to_predictions = {}

    with torch.no_grad():
        # Iterating the two columns avoids building a Series per row.
        for image_id, embeddings in zip(test_data['image_id'], test_data[layer]):
            # as_tensor accepts lists, arrays, and existing tensors alike.
            embeddings = torch.as_tensor(embeddings, dtype=torch.float)

            # Keep the leading axis as the batch axis and flatten the rest.
            outputs = model(embeddings.reshape(embeddings.size(0), -1))

            # Flatten explicitly: a bare squeeze() would turn a single-class
            # output into a scalar that cannot be indexed.
            is_positive = (torch.sigmoid(outputs) > 0.5).reshape(-1).tolist()
            if len(is_positive) != len(pathologies):
                raise ValueError(
                    f"Model produced {len(is_positive)} outputs for image_id "
                    f"{image_id!r} but {len(pathologies)} pathologies were given"
                )

            predictions = {
                name for name, flag in zip(pathologies, is_positive) if flag
            }
            if not predictions:
                predictions = {no_finding_label}

            image_id_to_predictions[image_id] = predictions

    return image_id_to_predictions

def write_predictions_to_csv(image_id_to_predictions, output_file):
    """Write one ``image_id,label1,label2,...`` line per image to ``output_file``.

    Args:
        image_id_to_predictions: mapping from image_id to an iterable of label
            strings. Sets are written in sorted order so the file is identical
            across runs (set iteration order of strings is not stable between
            interpreter sessions). Ordered iterables are written in the order
            given.
        output_file: destination path; an existing file is overwritten.

    Note:
        Labels are joined with plain commas and nothing is quoted, so rows
        have a variable number of fields.
    """
    with open(output_file, 'w') as f:
        for image_id, predictions in image_id_to_predictions.items():
            if isinstance(predictions, (set, frozenset)):
                predictions = sorted(predictions)
            joined = ",".join(predictions)
            f.write(f"{image_id},{joined}\n")

VinDr Generation (on VinDr data)

In [7]:
# Collated VinDr test embeddings/labels (presumably a pickled DataFrame; it is
# indexed by the "image_id" column below).
# NOTE: read_pickle unpickles arbitrary objects -- only load trusted files.
test_all_pathology_path = Path(
    "/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/VinDr/no_tuning_4934e91451945c8218c267aae9c34929a7677829/collated_test_all.pkl"
)
test_all_pathology = pd.read_pickle(test_all_pathology_path)

# Text file listing one image id per line for the split evaluated here.
VinDr_test_test_split_path = Path(
    "/vol/biomedic3/bglocker/ugproj2324/nns20/datasets/VinDr-CXR/test_set_three_splits/VinDr_test_test_split.txt"
)
with VinDr_test_test_split_path.open('r') as f:
    test_test_image_ids = f.read().splitlines()

# Keep only the rows whose image_id appears in that split file.
test_test_pathology_df = test_all_pathology.loc[
    test_all_pathology["image_id"].isin(test_test_image_ids)
]

In [76]:
# Linear probes trained on VinDr labels, one per embedding layer.
# map_location="cpu" lets checkpoints saved from a GPU load on CPU-only
# machines; the probes are evaluated on CPU tensors in this notebook.
# NOTE: torch.load unpickles the file -- only load trusted checkpoints.
layer_norm_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/post_layer_norm_best_vindr_model.pth")
vindr_layer_norm_model = LinearClassifier(1408, len(vindr_pathologies))
vindr_layer_norm_model.load_state_dict(torch.load(layer_norm_weights, map_location="cpu"))

# 98304 = 128 * 768, presumably the flattened q_former embedding -- TODO confirm.
q_former_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/q_former_best_vindr_model.pth")
vindr_q_former_model = LinearClassifier(98304, len(vindr_pathologies))
vindr_q_former_model.load_state_dict(torch.load(q_former_weights, map_location="cpu"))

<All keys matched successfully>

In [None]:
# VinDr probes evaluated on the VinDr test split, one CSV per embedding layer.
vindr_layer_norm_predictions = generate_model_outputs_on_test_data(
    model=vindr_layer_norm_model,
    layer="post_layer_norm",
    test_data=test_test_pathology_df,
    pathologies=vindr_pathologies,
)
write_predictions_to_csv(
    image_id_to_predictions=vindr_layer_norm_predictions,
    output_file="vindr_layer_norm_predictions.csv",
)

vindr_q_former_predictions = generate_model_outputs_on_test_data(
    model=vindr_q_former_model,
    layer="q_former",
    test_data=test_test_pathology_df,
    pathologies=vindr_pathologies,
)
write_predictions_to_csv(
    image_id_to_predictions=vindr_q_former_predictions,
    output_file="vindr_q_former_predictions.csv",
)



In [10]:
# Cross-dataset check: the CheXpert-trained probe applied to VinDr embeddings.
# Predictions are therefore expressed in CheXpert label names.
# map_location="cpu" lets a checkpoint saved from a GPU load on CPU-only machines.
# NOTE: torch.load unpickles the file -- only load trusted checkpoints.
cheXpert_layer_norm_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/post_layer_norm_best_chexpert_model.pth")
chexpert_layer_norm_model = LinearClassifier(1408, len(cheXpert_pathologies))
chexpert_layer_norm_model.load_state_dict(torch.load(cheXpert_layer_norm_weights, map_location="cpu"))

chexpert_model_vindr_predictions = generate_model_outputs_on_test_data(chexpert_layer_norm_model,"post_layer_norm", test_test_pathology_df, cheXpert_pathologies)
write_predictions_to_csv(chexpert_model_vindr_predictions, "chexpert_model_vindr_predictions.csv")

CheXpert Generation

In [12]:
# Collated CheXpert test embeddings/labels.
# NOTE: read_pickle unpickles arbitrary objects -- only load trusted files.
cheXpert_test_df = pd.read_pickle(
    "/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/embeddings/CheXpert-small/collated_test_df.pkl"
)

In [81]:
# Linear probes trained on CheXpert labels, one per embedding layer.
# map_location="cpu" lets checkpoints saved from a GPU load on CPU-only
# machines; the probes are evaluated on CPU tensors in this notebook.
# NOTE: torch.load unpickles the file -- only load trusted checkpoints.
chexpert_layer_norm_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/post_layer_norm_best_chexpert_model.pth")
chexpert_layer_norm_model = LinearClassifier(1408, len(cheXpert_pathologies))
chexpert_layer_norm_model.load_state_dict(torch.load(chexpert_layer_norm_weights, map_location="cpu"))

# 98304 = 128 * 768, presumably the flattened q_former embedding -- TODO confirm.
chexpert_q_former_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/q_former_best_chexpert_model.pth")
chexpert_q_former_model = LinearClassifier(98304, len(cheXpert_pathologies))
chexpert_q_former_model.load_state_dict(torch.load(chexpert_q_former_weights, map_location="cpu"))



<All keys matched successfully>

In [82]:
# CheXpert probes evaluated on the CheXpert test set, one CSV per embedding layer.
chexpert_layer_norm_predictions = generate_model_outputs_on_test_data(
    model=chexpert_layer_norm_model,
    layer="post_layer_norm",
    test_data=cheXpert_test_df,
    pathologies=cheXpert_pathologies,
)
write_predictions_to_csv(
    image_id_to_predictions=chexpert_layer_norm_predictions,
    output_file="chexpert_layer_norm_predictions.csv",
)

chexpert_q_former_predictions = generate_model_outputs_on_test_data(
    model=chexpert_q_former_model,
    layer="q_former",
    test_data=cheXpert_test_df,
    pathologies=cheXpert_pathologies,
)
write_predictions_to_csv(
    image_id_to_predictions=chexpert_q_former_predictions,
    output_file="chexpert_q_former_predictions.csv",
)


In [13]:
# Cross-dataset check: the VinDr-trained probe applied to CheXpert embeddings.
# Predictions are therefore expressed in VinDr label names.
# map_location="cpu" lets a checkpoint saved from a GPU load on CPU-only machines.
# NOTE: torch.load unpickles the file -- only load trusted checkpoints.
layer_norm_weights = Path("/vol/biomedic3/bglocker/ugproj2324/nns20/CheXagent/model_inspection/post_layer_norm_best_vindr_model.pth")
vindr_layer_norm_model = LinearClassifier(1408, len(vindr_pathologies))
vindr_layer_norm_model.load_state_dict(torch.load(layer_norm_weights, map_location="cpu"))

vindr_model_chexpert_predictions = generate_model_outputs_on_test_data(vindr_layer_norm_model,"post_layer_norm", cheXpert_test_df, vindr_pathologies)
write_predictions_to_csv(vindr_model_chexpert_predictions, "chexpert_predictions_by_vindr_model.csv")