In [1]:
import os
from IPython.display import display, clear_output
from ipywidgets import widgets
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt
import cv2
import json
import pytesseract # for OCR
import pandas as pd
import torch
from open_clip import create_model_from_pretrained, get_tokenizer
from concurrent.futures import ThreadPoolExecutor

In [2]:
# Load the pre-filtered metadata table; its `image_path` and `caption_path`
# columns supply the file paths used in the cells below.
# NOTE(review): the head() output shows an "Unnamed: 0" column, which suggests
# the CSV was saved with its index — `index_col=0` would avoid it; confirm
# nothing relies on that column before changing.
filtered_csv_path = '/cs/labs/tomhope/yuvalbus/pmc/pythonProject/pythonFilesGuy/filtered_df.csv'
filtered_df = pd.read_csv(filtered_csv_path)

In [3]:
# Hub identifier of the BiomedCLIP checkpoint. Both the model and the tokenizer
# are loaded from this id, so it is spelled out once to keep the two in sync.
BIOMEDCLIP_HUB_ID = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# create_model_from_pretrained returns the model together with the image
# transform (`preprocess`) that is applied to every image in this notebook.
biomedclip_model, preprocess = create_model_from_pretrained(BIOMEDCLIP_HUB_ID)
biomedclip_model.to(device)
biomedclip_model.eval()  # evaluation mode: the notebook only computes embeddings
tokenizer = get_tokenizer(BIOMEDCLIP_HUB_ID)

In [31]:
# Display the image transform returned alongside the model.
# NOTE(review): execution count 31 — this cell was run after the cells below it.
preprocess

Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=True)
    CenterCrop(size=(224, 224))
    <function _convert_to_rgb at 0x7f2c661b8220>
    ToTensor()
    Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])
)

In [4]:
# Show which device was selected for the model.
device

device(type='cuda')

In [5]:
# Peek at the first two metadata rows to check the available columns.
filtered_df.head(2)

Unnamed: 0,image_path,caption_path,patient_uid,pmc_id,article_path,unique_articles_sim_patients,caption_text,caption_id
0,PMC8167975/8167975_1/8167975_1_1.jpg,PMC8167975/8167975_1/8167975_1_1.txt,8167975-1,8167975,https://ncbi.nlm.nih.gov/pmc/articles/PMC8167975,['5563556-1'],"Fundus photograph showed a yellowish white, pe...",8167975_1_1
1,PMC8167975/8167975_1/8167975_1_2.jpg,PMC8167975/8167975_1/8167975_1_2.txt,8167975-1,8167975,https://ncbi.nlm.nih.gov/pmc/articles/PMC8167975,['5563556-1'],B-scan demonstrated focal subretinal calcifica...,8167975_1_2


In [6]:
# Root directory that the relative image/caption paths in `filtered_df` are
# joined onto in the next two cells.
prefix = '/cs/labs/tomhope/yuvalbus/pmc/pythonProject/data2'

In [7]:
# Absolute path of every image, in the same row order as `filtered_df`.
image_path_list = [os.path.join(prefix, suffix) for suffix in filtered_df['image_path']]

In [8]:
# Absolute path of every caption file, in the same row order as `filtered_df`
# (so index k here lines up with index k of `image_path_list`).
text_path_list = [os.path.join(prefix, suffix) for suffix in filtered_df['caption_path']]

In [9]:
# Read a single caption (row 111) for a one-example check of the model.
# NOTE(review): `captions` holds one string despite the plural name.
with open(text_path_list[111], 'r', encoding='utf-8') as f:
    captions = f.read()


In [10]:
# Show the caption text that was just read.
captions

'Photomicrograph showing large pleomorphic cells admixed with mixed inflammatory cells (H and E, ×200)'

In [11]:
# Tokenize the single caption and move the token tensor to the model's device.
tokens = tokenizer(captions).to(device)

# Encode the caption without tracking gradients; `txt_emb` has shape (1, 512)
# according to the `txt_emb.shape` output at the end of the notebook.
with torch.no_grad():
    txt_emb = biomedclip_model.encode_text(tokens)

In [36]:
# NOTE(review): execution count 36 — this cell was run out of order. It binds
# `img` to the preprocessed first image with no batch dimension added; the next
# cell rebinds `img`, so on a top-to-bottom run this result is discarded.
img = preprocess(Image.open(image_path_list[0]))

In [12]:
# Embed the image at row 111, i.e. the one paired with the caption read earlier.
# The context manager guarantees the file handle is released even if
# preprocessing raises or the format keeps the file open after decoding.
with Image.open(image_path_list[111]) as pil_image:
    img = preprocess(pil_image)
# Add a leading batch dimension (giving the (1, 3, 224, 224) shape shown by the
# `img.shape` cell) and move the tensor to the model's device.
img = img.unsqueeze(0).to(device)
# Encode without tracking gradients.
with torch.no_grad():
    img_emb = biomedclip_model.encode_image(img)

In [13]:
def compute_cosine_similarity_matrix(embeddings_i, embeddings_j):
    """Return the pairwise cosine-similarity matrix between two embedding sets.

    Args:
        embeddings_i: 2-D tensor of shape (n_i, d), one embedding per row.
        embeddings_j: 2-D tensor of shape (n_j, d), one embedding per row.

    Returns:
        Tensor of shape (n_i, n_j); entry [a, b] is the cosine similarity
        between row a of ``embeddings_i`` and row b of ``embeddings_j``.
    """
    # F.normalize divides each row by max(L2 norm, eps), so an all-zero row
    # becomes zeros instead of the NaNs produced by a plain `x / x.norm()`.
    embeddings_i_norm = torch.nn.functional.normalize(embeddings_i, p=2, dim=1)
    embeddings_j_norm = torch.nn.functional.normalize(embeddings_j, p=2, dim=1)
    # Dot products of unit-length rows are exactly the cosine similarities.
    return torch.mm(embeddings_i_norm, embeddings_j_norm.t())


In [14]:
def read_caption(file_path):
    """Return the UTF-8 text stored at ``file_path``.

    Best-effort: any failure while opening or reading is reported on stdout
    and an empty string is returned instead of raising.
    """
    text = ""
    try:
        with open(file_path, 'r', encoding='utf-8') as caption_file:
            text = caption_file.read()
    except Exception as e:
        # Keep going on failure; the caller receives "" for this file.
        print(f"Error reading {file_path}: {e}")
    return text

def read_captions_parallel(paths, max_workers=8):
    """Read many caption files concurrently, preserving input order.

    Args:
        paths: Sequence of caption file paths.
        max_workers: Size of the thread pool used for the reads.

    Returns:
        List of strings, one per entry of ``paths`` and in the same order.
        An entry is "" when retrieving that file's result raised.
    """
    # Pre-fill with the failure value; successful reads overwrite their slot.
    captions = [""] * len(paths)
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Submit everything up front so the reads overlap, then collect the
        # results in submission order.
        jobs = [pool.submit(read_caption, path) for path in paths]
        for idx, job in enumerate(jobs):
            try:
                captions[idx] = job.result()
            except Exception as e:
                print(f"Error reading caption at index {idx}: {e}")
    return captions

In [15]:
# Load the captions of the first 400 rows.
# NOTE(review): the 400 here must match the slice used for the images below.
text_list = read_captions_parallel(text_path_list[:400])

In [16]:
# Tokenize all loaded captions at once and move them to the model's device.
# NOTE(review): this rebinds `tokens`, which earlier held the single caption.
tokens = tokenizer(text_list).to(device)

# Encode the whole caption batch without tracking gradients.
with torch.no_grad():
    text_ftrs = biomedclip_model.encode_text(tokens)


In [17]:
def _load_and_preprocess(img_path):
    """Open one image, apply `preprocess` to it, and close the file."""
    # The context manager releases the file handle even if preprocessing raises.
    with Image.open(img_path) as pil_image:
        return preprocess(pil_image)


# Preprocess the first 400 images (same rows as the captions in `text_list`).
preprocessed_imgs = [_load_and_preprocess(img_path) for img_path in image_path_list[:400]]


In [18]:
# Stack the per-image tensors into a single batch on the model's device.
imgs_batch = torch.stack(preprocessed_imgs).to(device)

# Encode the whole image batch in one call, without tracking gradients.
with torch.no_grad():
    imgs_ftrs = biomedclip_model.encode_image(imgs_batch)


In [19]:
# Caption-by-image similarity matrix: sim_mat[i, j] compares caption i with
# image j.
sim_mat = compute_cosine_similarity_matrix(text_ftrs, imgs_ftrs)

In [20]:
# Re-display the single caption (row 111) used for the one-pair check below.
captions

'Photomicrograph showing large pleomorphic cells admixed with mixed inflammatory cells (H and E, ×200)'

In [21]:
# NOTE(review): scratch variable — `a` is not used by any other visible cell.
a = [text_list[0], text_list[1]]

In [22]:
# Length of the tokenized batch; 400 matches the number of captions loaded.
len(tokens)

400

In [23]:
# Show the first loaded caption in full.
text_list[0]

'Fundus photograph showed a yellowish white, peripapillary and sharply demarcated choroidal lesion involving the macula (white arrows)'

In [24]:
# Cosine similarity of the single caption/image pair (row 111).
torch.cosine_similarity(txt_emb, img_emb)

tensor([0.4504], device='cuda:0')

In [25]:
# Same check with the arguments swapped; the recorded value is identical.
torch.cosine_similarity(img_emb, txt_emb)

tensor([0.4504], device='cuda:0')

In [26]:
# Same check through torch.nn.functional; the recorded value is identical.
torch.nn.functional.cosine_similarity(txt_emb, img_emb)

tensor([0.4504], device='cuda:0')

In [27]:
# torch.nn.functional variant with the arguments swapped; same recorded value.
torch.nn.functional.cosine_similarity(img_emb, txt_emb)

tensor([0.4504], device='cuda:0')

In [28]:
# Cross-check the matrix helper on the same pair: it returns the same value as
# a 1x1 matrix.
compute_cosine_similarity_matrix(img_emb, txt_emb)

tensor([[0.4504]], device='cuda:0')

In [29]:
# Shape of the batched single-image input tensor.
img.shape

torch.Size([1, 3, 224, 224])

In [30]:
# Shape of the single-caption text embedding.
txt_emb.shape

torch.Size([1, 512])