In [6]:
# %pip install --upgrade chromadb
# %pip install pillow
# %pip install open-clip-torch
# %pip install matplotlib
%pip install transformers

Defaulting to user installation because normal site-packages is not writeable
Collecting transformers
  Using cached transformers-4.49.0-py3-none-any.whl (10.0 MB)
Installing collected packages: transformers
Successfully installed transformers-4.49.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
import sys
print(sys.executable)


/bin/python3.11


In [2]:
import PIL
print(PIL.__version__)


11.1.0


In [3]:
!pip show pillow


Name: pillow
Version: 11.1.0
Summary: Python Imaging Library (Fork)
Home-page: 
Author: 
Author-email: "Jeffrey A. Clark" <aclark@aclark.net>
License: MIT-CMU
Location: /home/raiyan00/.local/lib/python3.10/site-packages
Requires: 
Required-by: maestro, matplotlib, qwen-vl-utils, supervision, torchvision


In [1]:
import chromadb
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
from chromadb.utils.data_loaders import ImageLoader
from matplotlib import pyplot as plt

In [2]:
# Create database file at folder "chroma" or load into client if exists.
chroma_client = chromadb.PersistentClient(path="chroma")

# Instantiate image loader helper.
image_loader = ImageLoader()

# Instantiate multimodal embedding function.
multimodal_ef = OpenCLIPEmbeddingFunction()

# Create the collection, aka vector database. Or, if database already exist, then use it. Specify the model that we want to use to do the embedding.
multimodal_db = chroma_client.get_or_create_collection(name="multimodal_db", embedding_function=multimodal_ef, data_loader=image_loader)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
import json
from glob import glob
from PIL import Image

# Assuming OpenCLIPEmbeddingFunction is already defined/imported
embedding_function = OpenCLIPEmbeddingFunction()  # Initialize your embedding function

# Define the directory paths
image_directory = '/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/'
json_directory = '/mnt/data1/raiyan/breast_cancer/VLMs-for-Mammograms/GROUND-TRUTH-REPORTS'

# Initialize lists
ids = []
uris = []
metadatas = []
image_embeddings = []

# Initialize counter
processed_count = 0  

# Loop through images
for img_path in glob(os.path.join(image_directory, '*.png')):  
    img_name = os.path.basename(img_path)  
    json_path = os.path.join(json_directory, img_name.replace('.png', '.json'))  

    if os.path.exists(json_path):
        with open(json_path, 'r') as json_file:
            metadata = json.load(json_file)

        # Extract metadata fields
        metadata_entry = {
            'Breast_Composition': metadata.get('Breast_Composition', 'N/A'),
            'BIRADS': metadata.get('BIRADS', 'N/A'),
            'Findings': metadata.get('Findings', 'N/A')
        }

        # Load image
        image = Image.open(img_path).convert("RGB")

        # Generate image embedding using OpenCLIPEmbeddingFunction
        embedding = embedding_function.get_embedding(image)  # Assuming this method exists

        # Store details
        ids.append(img_name)
        uris.append(img_path)
        metadatas.append(metadata_entry)
        image_embeddings.append(embedding)  # Store embedding

        # Increment counter and print progress
        processed_count += 1
        print(f"Indexed {processed_count}: {img_name}")

# Add records to the multimodal database
multimodal_db.add(
    ids=ids,
    uris=uris,
    data=image_embeddings,  # Store OpenCLIP embeddings here
    metadatas=metadatas
)

print(f"Total images indexed: {processed_count}")


AttributeError: 'OpenCLIPEmbeddingFunction' object has no attribute 'get_embedding'

In [5]:
import os
import json
import torch
import torchvision.transforms as transforms
from glob import glob
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load CLIP model
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Define the directory paths
image_directory = '/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/'
json_directory = '/mnt/data1/raiyan/breast_cancer/VLMs-for-Mammograms/GROUND-TRUTH-REPORTS'

# Initialize lists
ids = []
uris = []
metadatas = []
image_embeddings = []

# Loop through images
for img_path in glob(os.path.join(image_directory, '*.png')):  
    img_name = os.path.basename(img_path)  
    json_path = os.path.join(json_directory, img_name.replace('.png', '.json'))  

    if os.path.exists(json_path):
        with open(json_path, 'r') as json_file:
            metadata = json.load(json_file)

        # Extract metadata fields
        metadata_entry = {
            'Breast_Composition': metadata.get('Breast_Composition', 'N/A'),
            'BIRADS': metadata.get('BIRADS', 'N/A'),
            'Findings': metadata.get('Findings', 'N/A')
        }

        # Load and preprocess the image
        image = Image.open(img_path).convert("RGB")
        inputs = processor(images=image, return_tensors="pt")  # Preprocess for CLIP
        
        # Generate image embedding
        with torch.no_grad():
            embedding = model.get_image_features(**inputs)  # Extract features
            embedding = embedding.squeeze().cpu().numpy()  # Convert to NumPy array
        
        # Store details
        ids.append(img_name)
        uris.append(img_path)
        metadatas.append(metadata_entry)
        image_embeddings.append(embedding)  # Store embedding

# Add records to the multimodal database
multimodal_db.add(
    ids=ids,
    uris=uris,
    data=image_embeddings,  # Store embeddings here
    metadatas=metadatas
)


ModuleNotFoundError: No module named 'transformers'

In [9]:
import os
import json
from glob import glob

# Define the directory paths
image_directory = '/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/'  # Replace with the directory containing images
json_directory = '/mnt/data1/raiyan/breast_cancer/VLMs-for-Mammograms/GROUND-TRUTH-REPORTS'    # Replace with the directory containing JSON files

# Initialize empty lists for ids, uris, and metadatas
ids = []
uris = []
metadatas = []

# Initialize counters
image_count = 0
json_found_count = 0
json_missing_count = 0

# Loop through all image files in the directory
for img_path in glob(os.path.join(image_directory, '*.png')):  # Adjust extension if needed
    img_name = os.path.basename(img_path)  # Get the image file name (e.g., 'lion.jpg')
    json_path = os.path.join(json_directory, img_name.replace('.png', '.json'))  # Construct the JSON file path
    
    # Check if the corresponding JSON file exists
    image_count += 1  # Increment image count for each processed image
    
    if os.path.exists(json_path):
        json_found_count += 1
        # Read metadata from the JSON file
        with open(json_path, 'r') as json_file:
            metadata = json.load(json_file)
        
        # Extracting specific metadata fields
        # You can adjust the structure of this part to fit your JSON content
        metadata_entry = {
            'Breast_Composition': metadata.get('Breast_Composition', 'N/A'),
            'BIRADS': metadata.get('BIRADS', 'N/A'),
            'Findings': metadata.get('Findings', 'N/A')
        }
        
        # Add the details to the lists
        ids.append(img_name)  # Use image file name as the ID
        uris.append(img_path)  # Absolute path of the image
        metadatas.append(metadata_entry)  # Add metadata as a dictionary (not a list)
        print(f"Processed {img_name} (JSON found)")  # Print when JSON is found for an image
        # Print img_name, img_path, and the metadata entry when JSON is found
        print(f"Processed {img_name} (JSON found)")
        print(f"Image Path: {img_path}")
        print(f"Metadata: {metadata_entry}")
        print("=" * 50)  # Just to separate the outputs for readability
    else:
        json_missing_count += 1
        print(f"Processed {img_name} (JSON missing)")  # Print when JSON is missing for an image


# Add the records to the multimodal database
multimodal_db.add(
    ids=ids,
    uris=uris,
    metadatas=metadatas
)


Processed IMG315.png (JSON found)
Processed IMG315.png (JSON found)
Image Path: /mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG315.png
Metadata: {'Breast_Composition': 'fibro fatty and glandular breast parenchyma (ACR-C).', 'BIRADS': 4, 'Findings': 'Well-defined lobulated soft opacity seen in the upper quadrant of the breast suggests high suspicious malignant lesion (BIRADS 4c)\nskin nipple and pectoral muscle appear normal.\nno abnormal microcalcification seen.\ntwo enlarged axillary nodes seen'}
Processed IMG156.png (JSON found)
Processed IMG156.png (JSON found)
Image Path: /mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG156.png
Metadata: {'Breast_Composition': 'dense fibro glandular breast parenchyma (ACR D).', 'BIRADS': 3, 'Findings': 'a well-defined soft opacity seen in the inner quadrant of the breast suggests probably a benign lesion (BIRADS 3).\nskin and nipple appear normal.\nA tiny benign-looking calcifi

ERROR:chromadb.db.mixins.embeddings_queue:Exception occurred invoking consumer for subscription ee664ce6a05d429e895dd24e39ce76ffto topic persistent://default/default/50e9fbfa-2e90-4608-8240-45afd8b89adc attempt to write a readonly database
ERROR:chromadb.db.mixins.embeddings_queue:Exception occurred invoking consumer for subscription ee664ce6a05d429e895dd24e39ce76ffto topic persistent://default/default/50e9fbfa-2e90-4608-8240-45afd8b89adc attempt to write a readonly database
ERROR:chromadb.db.mixins.embeddings_queue:Exception occurred invoking consumer for subscription ee664ce6a05d429e895dd24e39ce76ffto topic persistent://default/default/50e9fbfa-2e90-4608-8240-45afd8b89adc attempt to write a readonly database
ERROR:chromadb.db.mixins.embeddings_queue:Exception occurred invoking consumer for subscription ee664ce6a05d429e895dd24e39ce76ffto topic persistent://default/default/50e9fbfa-2e90-4608-8240-45afd8b89adc attempt to write a readonly database
ERROR:chromadb.db.mixins.embeddings_queu

OperationalError: attempt to write a readonly database

In [10]:
# Simple function to print the results of a query.
# The 'results' is a dict {ids, distances, data, ...}
# Each item in the dict is a 2d list.
def print_query_results(query_list: list, query_results: dict)->None:
    result_count = len(query_results['ids'][0])

    for i in range(len(query_list)):
        print(f'Results for query: {query_list[i]}')

        for j in range(result_count):
            id       = query_results["ids"][i][j]
            distance = query_results['distances'][i][j]
            data     = query_results['data'][i][j]
            document = query_results['documents'][i][j]
            metadata = query_results['metadatas'][i][j]
            uri      = query_results['uris'][i][j]

            print(f'id: {id}, distance: {distance}, metadata: {metadata}, uri: {uris}') 

            # Display image, the physical file must exist at URI.
            # (ImageLoader loads the image from file)
            print(f'data: {uri}')
            plt.imshow(data)
            plt.axis("off")
            plt.show()

In [11]:
# It is possible to submit multiple queries at the same time, just add to the list.  
query_texts = ['breast']

# Query vector db
query_results = multimodal_db.query(
    query_texts = query_texts,
    n_results=5,
    include=[  'metadatas', 'ids', 'uris'],
    where={'BIRADS':'3'}
)

print_query_results(query_texts, query_results)

ValueError: Expected include item to be one of documents, embeddings, metadatas, distances, uris, data, got ids in query.

In [None]:
# Query the collection to retrieve some records and their metadata
query_results = multimodal_db.query(
    query_embeddings=None,  # You don't need embeddings for just retrieving metadata
    n_results=5,  # Specify the number of results to retrieve
    include=["ids", "metadatas", "documents", "uris"]  # Include metadata along with other fields
)

# Print metadata for the first few records
for i in range(len(query_results["ids"])):
    print(f"Record {i + 1}:")
    print(f"ID: {query_results['ids'][i]}")  # Print the ID of the record
    print(f"Metadata: {query_results['metadatas'][i]}")  # Print metadata
    print("=" * 50)


ValueError: At least one of one of embeddings, documents, images, uris must be provided in query.

In [18]:
# Query the first 5 records
query_results = multimodal_db.query(
    query_embeddings=None,  # We're not querying for specific embeddings here
    n_results=5,  # Number of results to retrieve
    where={"Breast_Composition": "Fatty"},  # Filter by metadata field
    include=["ids", "metadatas", "documents", "uris"]
)


ValueError: At least one of one of embeddings, documents, images, uris must be provided in query.

In [15]:
# Query the collection to retrieve records where the URI matches a specific value
query_results = multimodal_db.query(
    query_uris=["/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG002.png"],  # List of URIs to search for
    n_results=5,  # Specify the number of results to retrieve
    include=["metadatas", "documents", "uris"],  # Include metadata along with other fields
    where={"uris": "/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG002.png"}  # Filter by URI
)

# Print metadata for the filtered records
for i in range(len(query_results["ids"])):
    print(f"Record {i + 1}:")
    print(f"ID: {query_results['ids'][i]}")  # Print the ID of the record
    print(f"Metadata: {query_results['metadatas'][i]}")  # Print metadata
    print(f"URI: {query_results['uris'][i]}")  # Print URI
    print("=" * 50)


Record 1:
ID: []
Metadata: []
URI: []


In [21]:
import os
import json
from glob import glob
from PIL import Image
import torch
import open_clip
from torchvision import transforms
import chromadb

chroma_client = chromadb.PersistentClient(path="./chroma_db")  # Persistent storage
multimodal_db = chroma_client.get_or_create_collection(name="image_embeddings")


# Load OpenCLIP model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = open_clip.create_model("ViT-B/32", pretrained="laion2b_s34b_b79k").to(device)

# ✅ Remove tokenizer (not needed for vision-only models)

# Preprocessing for OpenCLIP (manual transform)
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize to model input size
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.481, 0.457, 0.408), std=(0.268, 0.261, 0.275))  # CLIP normalization
])





# Function to generate image embeddings
def get_image_embedding(image):
    image = preprocess(image).unsqueeze(0).to(device)
    with torch.no_grad():
        embedding = model.encode_image(image)
    return embedding.cpu().numpy().flatten().tolist()  # Convert to list of floats

# Define directories
image_directory = '/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/'
json_directory = '/mnt/data1/raiyan/breast_cancer/VLMs-for-Mammograms/GROUND-TRUTH-REPORTS'

# Initialize lists
ids, uris, metadatas, image_embeddings = [], [], [], []

# Process images
processed_count = 0  
for img_path in glob(os.path.join(image_directory, '*.png')):  
    img_name = os.path.basename(img_path)  
    json_path = os.path.join(json_directory, img_name.replace('.png', '.json'))  

    if os.path.exists(json_path):
        with open(json_path, 'r') as json_file:
            metadata = json.load(json_file)

        metadata_entry = {
            'Breast_Composition': metadata.get('Breast_Composition', 'N/A'),
            'BIRADS': metadata.get('BIRADS', 'N/A'),
            'Findings': metadata.get('Findings', 'N/A')
        }

        # Load and process image
        image = Image.open(img_path).convert("RGB")
        embedding = get_image_embedding(image)

        # Store details
        ids.append(img_name)
        uris.append(img_path)
        metadatas.append(metadata_entry)
        image_embeddings.append(embedding)

        processed_count += 1
        print(f"Indexed {processed_count}: {img_name}")
        
# Inspect the data
for idx, img_path in enumerate(uris):
    print(f"URI {idx}: {img_path}")
    
for idx, embedding in enumerate(image_embeddings):
    print(f"Embedding {idx}: {embedding[:5]}")  # Print first 5 values of each embedding


# Store in ChromaDB
multimodal_db.update(
    ids=ids,
    uris=uris,
    embeddings=image_embeddings,  # ChromaDB requires embeddings
    metadatas=metadatas
)


print(f"Total images indexed: {processed_count}")


Indexed 1: IMG315.png
Indexed 2: IMG156.png
Indexed 3: IMG282.png
Indexed 4: IMG062.png
Indexed 5: IMG145.png
Indexed 6: IMG103.png
Indexed 7: IMG458.png
Indexed 8: IMG414.png
Indexed 9: IMG107.png
Indexed 10: IMG029.png
Indexed 11: IMG148.png
Indexed 12: IMG319.png
Indexed 13: IMG353.png
Indexed 14: IMG407.png
Indexed 15: IMG209.png
Indexed 16: IMG018.png
Indexed 17: IMG422.png
Indexed 18: IMG510.png
Indexed 19: IMG496.png
Indexed 20: IMG379.png
Indexed 21: IMG099.png
Indexed 22: IMG453.png
Indexed 23: IMG069.png
Indexed 24: IMG033.png
Indexed 25: IMG278.png
Indexed 26: IMG158.png
Indexed 27: IMG266.png
Indexed 28: IMG012.png
Indexed 29: IMG451.png
Indexed 30: IMG084.png
Indexed 31: IMG081.png
Indexed 32: IMG286.png
Indexed 33: IMG179.png
Indexed 34: IMG086.png
Indexed 35: IMG481.png
Indexed 36: IMG336.png
Indexed 37: IMG295.png
Indexed 38: IMG074.png
Indexed 39: IMG174.png
Indexed 40: IMG172.png
Indexed 41: IMG380.png
Indexed 42: IMG335.png
Indexed 43: IMG415.png
Indexed 44: IMG433.p

In [35]:
# Function to retrieve top 3 similar images from ChromaDB
def retrieve_similar_images(query_image_path):
    # Load and preprocess the query image
    query_image = Image.open(query_image_path).convert("RGB")
    query_embedding = get_image_embedding(query_image)
    
    # Query ChromaDB for the most similar embeddings
    results = multimodal_db.query(
        query_embeddings=[query_embedding],  # Query using the passed image embedding
        n_results=3  # Retrieve top 3 results
    )
    
    # Inspect and log the full results to understand their structure
    print("Full Query Results:")
    print(results)
    
    print("ID: ", results['ids'])
    print("Documents: ", results['documents'])
    print("Metadatas: ", results['metadatas'])
    # print("URIS: ", results['uris'])
    # print("Embeddings: ", results['embeddings'])


    
    # Display the top 3 similar images and their metadata
    for idx, result in enumerate(results['documents']):
        # Extract metadata
        metadata = results['metadatas'][idx]
        
        # Print the image and metadata
        print(f"Similar Image {idx+1}:")
        print(f"  Image Path: {result}")  # The path of the similar image
        
        # Ensure metadata is a dictionary and access its attributes
        if isinstance(metadata, dict):
            print(f"  Breast Composition: {metadata.get('Breast_Composition', 'N/A')}")
            print(f"  BIRADS: {metadata.get('BIRADS', 'N/A')}")
            print(f"  Findings: {metadata.get('Findings', 'N/A')}")
        else:
            print("  Metadata format error, expected a dictionary.")
        
        print("-" * 40)

# Example usage
query_image_path = "/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG001.png"  # Path to the image you want to query
retrieve_similar_images(query_image_path)


Full Query Results:
{'ids': [['IMG001.png', 'IMG403.png', 'IMG094.png']], 'embeddings': None, 'documents': [[None, None, None]], 'uris': None, 'data': None, 'metadatas': [[{'BIRADS': 3, 'Breast_Composition': 'predominantly fibro fatty breast parenchyma (ACR B)', 'Findings': 'irregular ill-defined soft opacity with microcalcifications suggests malignant lesion (BIRADS-5)\nsmall well defined soft nodular opacity -- benign lesion (BIRADS-3)\nbenign and vascular calcifications\nSkin and nipple - no abnormality\nNo axillary adenopathy'}, {'BIRADS': 3, 'Breast_Composition': 'fibro fatty and glandular breast parenchyma (ACR B)', 'Findings': 'an ill-defined soft opacity seen in the upper quadrant of the breast suggest probably a benign lesion \n(BIRADS 3)\n\nbenign-looking calcifications seen in the breast. \n\nskin, nipple, and pectoral muscle appear normal. \n\nfew small benign-looking axillary nodes seen. \n\nBIRADS:3'}, {'BIRADS': 1, 'Breast_Composition': 'dense glandular breast parenchyma

In [39]:


# Extracting the IDs and metadata and storing them in the required format.
formatted_data = []

for img_id, metadata in zip(results['ids'][0], results['metadatas'][0]):
    formatted_entry = {
        "IMG-ID": img_id,
        "BREAST-COMPOSITION": metadata['Breast_Composition'].replace('\n', ' ').strip(),
        "BIRADS": str(metadata['BIRADS']),
        "FINDINGS": metadata['Findings'].replace('\n', ' ').strip()
    }
    formatted_data.append(formatted_entry)

# Storing the results in separate variables
entry_1 = f"""{{
    "IMG-ID": "{formatted_data[0]['IMG-ID']}",
    "BREAST-COMPOSITION": "{formatted_data[0]['BREAST-COMPOSITION']}",
    "BIRADS": "{formatted_data[0]['BIRADS']}",
    "FINDINGS": "{formatted_data[0]['FINDINGS']}"
}}"""

entry_2 = f"""{{
    "IMG-ID": "{formatted_data[1]['IMG-ID']}",
    "BREAST-COMPOSITION": "{formatted_data[1]['BREAST-COMPOSITION']}",
    "BIRADS": "{formatted_data[1]['BIRADS']}",
    "FINDINGS": "{formatted_data[1]['FINDINGS']}"
}}"""

entry_3 = f"""{{
    "IMG-ID": "{formatted_data[2]['IMG-ID']}",
    "BREAST-COMPOSITION": "{formatted_data[2]['BREAST-COMPOSITION']}",
    "BIRADS": "{formatted_data[2]['BIRADS']}",
    "FINDINGS": "{formatted_data[2]['FINDINGS']}"
}}"""

# Now the formatted strings are stored in entry_1, entry_2, and entry_3
print(entry_3)



{
    "IMG-ID": "IMG094.png",
    "BREAST-COMPOSITION": "dense glandular breast parenchyma, which lowers the sensitivity of mammograms and may obscure small lesions (ACR D).",
    "BIRADS": "1",
    "FINDINGS": "benign-looking coarse calcification in the lower quadrant.   skin, nipple, and pectoral muscle appear normal. benign-looking axillary adenopathy."
}


AttributeError: 'list' object has no attribute 'length'

In [14]:
# Example usage
query_image_path = "/mnt/data1/raiyan/breast_cancer/datasets/dmid/pixel_level_annotations/png_images/IMG002.png"  # Path to the image you want to query
retrieve_similar_images(query_image_path)

Similar Image 1:
  Image Path: [None, None, None]
  Metadata format error, expected a dictionary.
----------------------------------------


In [25]:
# Example check: retrieve a few entries from ChromaDB to inspect
print(multimodal_db.get(ids=['IMG001.png']))


{'ids': ['IMG001.png'], 'embeddings': None, 'documents': [None], 'uris': None, 'data': None, 'metadatas': [{'BIRADS': 3, 'Breast_Composition': 'predominantly fibro fatty breast parenchyma (ACR B)', 'Findings': 'irregular ill-defined soft opacity with microcalcifications suggests malignant lesion (BIRADS-5)\nsmall well defined soft nodular opacity -- benign lesion (BIRADS-3)\nbenign and vascular calcifications\nSkin and nipple - no abnormality\nNo axillary adenopathy'}], 'included': [<IncludeEnum.documents: 'documents'>, <IncludeEnum.metadatas: 'metadatas'>]}
