In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import random
import torchvision.transforms as transforms
import matplotlib.pyplot as plt

from torch.utils.data import DataLoader, Dataset
from torchvision.datasets import MNIST
from torch import optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Image preprocessing used by the pair dataset: convert each image to a tensor.
transform = transforms.Compose([transforms.ToTensor()])

# Load both MNIST splits, downloading them into ./data when they are not there yet.
# No transform is attached here; `transform` is handed to the pair wrapper instead.
mnist_train = MNIST('./data', train=True, download=True)
mnist_test = MNIST('./data', train=False, download=True)

In [3]:
class SiameseDataset(Dataset):
    """Serve random image pairs built from a labelled ``(image, label)`` dataset.

    Each item uses the sample at ``index`` as the anchor and pairs it with a
    randomly drawn partner that is, with equal probability, from the same class
    or from a different class.

    Args:
        data: Indexable, sized collection whose items unpack as ``(image, label)``.
        transform: Optional callable applied to both images of a pair.

    Note:
        Partners are found by rejection sampling, so ``__getitem__`` never
        returns if no suitable partner exists (e.g. a different-class partner
        is requested from a dataset that contains a single class).
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform

    def __len__(self):
        """Return the number of anchors, i.e. the size of the wrapped dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(imgA, imgB, pair_label)`` for the anchor at ``index``.

        ``pair_label`` is a float32 tensor of shape ``(1,)`` holding ``1.0`` when
        the two labels differ and ``0.0`` when they match.
        """
        imgA, labelA = self.data[index]

        # Coin flip: True -> partner from the same class, False -> a different one.
        want_same_class = bool(random.randint(0, 1))

        # Draw first, test afterwards. Seeding the loop with a sentinel label
        # instead would skip the draw whenever a real label equals the sentinel,
        # leaving the partner image undefined.
        while True:
            imgB, labelB = random.choice(self.data)
            if bool(labelB == labelA) == want_same_class:
                break

        if self.transform:
            imgA = self.transform(imgA)
            imgB = self.transform(imgB)

        pair_label = torch.tensor([(labelA != labelB)], dtype=torch.float32)

        return imgA, imgB, pair_label

In [4]:
# Wrap each MNIST split so that indexing yields (imageA, imageB, pair_label) triples;
# both splits share the same tensor-conversion transform.
siamese_train = SiameseDataset(data=mnist_train, transform=transform)
siamese_test = SiameseDataset(data=mnist_test, transform=transform)

In [5]:
class SiameseNetwork(nn.Module):
    """Twin network: one shared encoder applied independently to two inputs.

    The encoder is three conv/ReLU/max-pool stages followed by a three-layer
    fully connected head that outputs a 2-dimensional embedding. The head
    expects ``256 * 3 * 3`` flattened features from the convolutional stack.
    """

    def __init__(self):
        super().__init__()

        # Convolutional feature extractor; every stage halves the spatial size.
        conv_layers = [
            nn.Conv2d(1, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=5, stride=1, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2, stride=2),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Projection head mapping flattened features to the embedding space.
        head_layers = [
            nn.Linear(256 * 3 * 3, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, 256),
            nn.ReLU(inplace=True),
            nn.Linear(256, 2),
        ]
        self.fc = nn.Sequential(*head_layers)

    def forward_once(self, x):
        """Embed a single batch: conv stack, flatten per sample, then the head."""
        features = self.cnn(x)
        flattened = features.view(features.size(0), -1)
        return self.fc(flattened)

    def forward(self, inputA, inputB):
        """Embed both batches with the same weights and return the two embeddings."""
        return self.forward_once(inputA), self.forward_once(inputB)

In [6]:
class ContrastiveLoss(torch.nn.Module):
    """Contrastive loss over pairs of embeddings.

    With ``d`` the Euclidean distance between the two embeddings of a pair and
    ``y`` its label (``0`` = same class, ``1`` = different class), each pair
    contributes ``(1 - y) * d**2 + y * max(margin - d, 0)**2``; the result is
    the mean over all pairs.

    Args:
        margin: Distance beyond which different-class pairs stop being penalised.
    """

    def __init__(self, margin=2.0):
        super().__init__()
        self.margin = margin

    def forward(self, outputA, outputB, y):
        """Return the mean contrastive loss for the given embeddings and labels."""
        # keepdim=True keeps a trailing axis of size 1 on the distance tensor.
        distance = F.pairwise_distance(outputA, outputB, keepdim=True)

        # Same-class pairs are pulled together.
        pull_term = (1 - y) * distance.pow(2)
        # Different-class pairs are pushed apart until they clear the margin.
        push_term = y * torch.clamp(self.margin - distance, min=0.0).pow(2)

        return (pull_term + push_term).mean()

In [7]:
# Model, objective and optimiser for the siamese experiment.
model = SiameseNetwork()
criterion = ContrastiveLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Shuffled mini-batches of 64 pairs, loaded in the main process (num_workers=0).
train_dataloader = DataLoader(siamese_train, batch_size=64, shuffle=True, num_workers=0)

In [None]:
# Train for five passes over the pair dataset.
for epoch in range(5):
    total_loss = 0

    for batch_a, batch_b, pair_labels in train_dataloader:
        # Gradients accumulate across backward() calls, so clear them per batch.
        optimizer.zero_grad()

        emb_a, emb_b = model(batch_a, batch_b)
        batch_loss = criterion(emb_a, emb_b, pair_labels)

        batch_loss.backward()
        optimizer.step()

        # Running sum of the per-batch losses; it is not averaged over batches.
        total_loss += batch_loss.item()

    print(f"Epoch {epoch}; Loss {total_loss}")

Epoch 0; Loss 286.3716132491827
Epoch 1; Loss 105.9401145670563


## CLIP (Contrastive Language-Image Pre-Training)

### Model Training 

In [None]:
from datasets import load_dataset

# Load the "train" split of the image/caption demo dataset.
data = load_dataset("jamescalam/image-text-demo", split="train")

In [None]:
# Peek at one caption/image pair. A notebook renders only a cell's final expression,
# so the caption is printed explicitly and the image is left last for rich display.
# Indexing the row first avoids pulling the whole column just to read one entry.
sample = data[3]
print(sample['text'])
sample['image']

In [None]:
from transformers import CLIPProcessor, CLIPModel
import torch

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# The model and its processor are loaded from the same checkpoint id.
model_id = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_id)
processor = CLIPProcessor.from_pretrained(model_id)

model.to(device)

In [None]:
# Encode every caption and image of the dataset as one padded batch of tensors
# placed on the same device as the model.
text, images = data['text'], data['image']

inputs = processor(text=text, images=images, padding=True, return_tensors="pt").to(device)

print(inputs.keys())

In [None]:
# Inference only: disabling autograd avoids keeping a computation graph alive for
# every embedding produced by this forward pass.
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.keys())


In [None]:
# Pull the text and image embeddings out of the model output and show their shapes.
text_emb, image_emb = outputs.text_embeds, outputs.image_embeds

print(text_emb.shape, image_emb.shape, sep="\n")


In [35]:
# Scale every embedding to unit length so that a dot product is a cosine similarity.
text_emb = text_emb / text_emb.norm(dim=1, keepdim=True)
image_emb = image_emb / image_emb.norm(dim=1, keepdim=True)

# Rows index captions, columns index images. NumPy conversion needs a CPU tensor
# that is detached from autograd, so copy back from the GPU when one is in use.
cos_sim = torch.mm(text_emb, image_emb.T).detach().cpu().numpy()

In [None]:
import matplotlib.pyplot as plt

# Heatmap of the text-vs-image cosine similarities. `cos_sim` was built as
# text_emb @ image_emb.T, so rows (y axis) are captions and columns (x axis) images.
fig, ax = plt.subplots()
heatmap = ax.imshow(cos_sim)
ax.set_xlabel("image index")
ax.set_ylabel("text index")
ax.set_title("Text-image cosine similarity")
fig.colorbar(heatmap, ax=ax, label="cosine similarity")
plt.show()

## Text to Image Retrieval

In [37]:
query_text = "Dog running on grass"
top_k = 3

# Tokenize the query and move the tensors to the model's device.
inputs = processor(text=[query_text], padding=True, return_tensors="pt").to(device)

# Embed the query text, then scale the embedding to unit length.
text_features = model.get_text_features(**inputs)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# Similarity of the single query (row 0) against every image embedding.
similarity = torch.mm(text_features, image_emb.T)

# Keep the best matches, never asking for more than the dataset holds.
values, indices = similarity[0].topk(min(top_k, len(data)))

In [None]:
# One panel per retrieved match. Sizing by the number of hits avoids empty panels
# when the dataset holds fewer than `top_k` items, and squeeze=False keeps `axes`
# two-dimensional so that a single match is still indexable.
fig, axes = plt.subplots(1, len(indices), figsize=(15, 3), squeeze=False)

# Convert to plain Python numbers so dataset indexing and formatting do not depend
# on tensor semantics (or on which device the tensors live).
for ax, idx, score in zip(axes[0], indices.tolist(), values.detach().tolist()):
    match = data[idx]  # fetch only the matched row

    # Print text and score
    print(f"{match['text']}: {score:.3f}")

    # Display image
    ax.imshow(match['image'])
    ax.axis('off')
    ax.set_title(f"Score: {score:.3f}")

plt.tight_layout()
plt.show()

## Image to Image Retrieval

In [39]:
# Use the first image of the dataset as the query.
query_image = data['image'][0]

# Preprocess the query image and move the tensors to the model's device.
inputs = processor(images=query_image, padding=True, return_tensors="pt").to(device)

# Embed the query image, then scale the embedding to unit length.
image_features = model.get_image_features(**inputs)
image_features = image_features / image_features.norm(dim=-1, keepdim=True)

# Similarity of the single query (row 0) against every image embedding.
similarity = torch.mm(image_features, image_emb.T)

# Keep the best matches; `top_k` is the value set in the text-retrieval cell.
values, indices = similarity[0].topk(min(top_k, len(data)))

In [None]:
# One panel per retrieved match. Sizing by the number of hits avoids empty panels
# when the dataset holds fewer than `top_k` items, and squeeze=False keeps `axes`
# two-dimensional so that a single match is still indexable.
fig, axes = plt.subplots(1, len(indices), figsize=(15, 3), squeeze=False)

# Convert to plain Python numbers so dataset indexing and formatting do not depend
# on tensor semantics (or on which device the tensors live).
for ax, idx, score in zip(axes[0], indices.tolist(), values.detach().tolist()):
    match = data[idx]  # fetch only the matched row

    # Print text and score
    print(f"{match['text']}: {score:.3f}")

    # Display image
    ax.imshow(match['image'])
    ax.axis('off')
    ax.set_title(f"Score: {score:.3f}")

plt.tight_layout()
plt.show()

## Zero-shot classification

In [41]:
from io import BytesIO

from PIL import Image
import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Bound the wait so a stalled connection cannot hang the notebook, and fail on an
# HTTP error status instead of handing an error page to the image decoder.
http_response = requests.get(url, timeout=30)
http_response.raise_for_status()
image = Image.open(BytesIO(http_response.content))

In [None]:
# Bare last expression: the notebook renders the downloaded image inline.
image

In [42]:
# Score one image against two candidate captions (zero-shot classification).
# The model was moved to `device`, so its inputs must be moved there as well;
# leaving them on the CPU fails with a device mismatch on a GPU run.
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"],
    images=image,
    return_tensors="pt",
    padding=True,
).to(device)

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# Image-vs-prompt similarity logits.
# NOTE(review): presumably one row per image and one column per prompt — confirm
# against the CLIP model documentation.
logits_per_image = outputs.logits_per_image

# Softmax along dim 1 turns each row of logits into a probability distribution.
probs = torch.softmax(logits_per_image, dim=1)

print(probs)

## Multi-modal prompting

In [None]:
import ollama
from IPython.display import Markdown, display

# Single-turn request: the message list contains one user question.
response1 = ollama.chat(
    model='llama3.2-vision',
    messages=[{"role": "user", "content": "Who wrote the book Lord of the Rings?"}],
)

# Render the reply as Markdown instead of raw text.
display(Markdown(response1.message.content))

In [None]:
# Follow-up turn: the first question and the model's own answer are replayed ahead
# of the new question, so the request carries the earlier exchange with it.
response2 = ollama.chat(
    model='llama3.2-vision',
    messages=[
        {"role": "user", "content": "Who wrote the book Lord of the Rings?"},
        {"role": "assistant", "content": response1.message.content},
        {"role": "user", "content": "What other books has the author written?"},
    ],
)

display(Markdown(response2.message.content))

In [None]:
class Conversation:
    """Chat session that stores its own message history.

    Every call to ``generate`` sends the complete history to the model, then
    records the model's reply, so later questions are asked with the earlier
    turns attached.

    Args:
        system: Optional system prompt; when non-empty it becomes the first
            message of the history.
    """

    def __init__(self, system=""):
        # History starts with the system prompt if one was given, otherwise empty.
        self.messages = [{"role": "system", "content": system}] if system else []

    def generate(self, user_question):
        """Ask ``user_question`` and return the chat response object.

        The question is stored under the "user" role before the request, and the
        reply text is stored under the "assistant" role after it.
        """
        self.messages.append({"role": "user", "content": user_question})

        reply = ollama.chat(model='llama3.2-vision', messages=self.messages)

        self.messages.append({"role": "assistant", "content": reply.message.content})
        return reply

In [None]:
# define conversation
system_message = "You are a terse expert in high fantasy literature."
conv = Conversation(system_message)

# generate response from query
response = conv.generate("Who wrote the book Lord of the Rings?")

# display response
display(Markdown(response2.message.content))

In [None]:
# Inspect the history accumulated so far: the system prompt, the user question and
# the assistant reply recorded by `generate`.
print(conv.messages)

### Specify images in the prompt

In [None]:
import ollama

# Attach one image file to the user message through the 'images' field.
response = ollama.chat(
    model='llama3.2-vision',
    messages=[
        {'role': 'user', 'content': 'Describe this image.', 'images': ['dogs.png']},
    ],
)

display(Markdown(response.message.content))

In [None]:
import ollama

# Same prompt as before, now with two image files attached to the one message.
response = ollama.chat(
    model='llama3.2-vision',
    messages=[
        {'role': 'user', 'content': 'Describe this image.', 'images': ['dogs1.png', 'dogs2.png']},
    ],
)

display(Markdown(response.message.content))

## A realistic OCR use case

In [None]:
import ollama

image_path = "image.png"

# Build the prompt from adjacent string literals. A triple-quoted string indented
# to line up with the call would carry that indentation into the text that is sent
# to the model.
ocr_prompt = (
    "Analyze the text in the provided image. "
    "Extract all readable content and present it in a structured Markdown format "
    "that is clear, concise, and well-organized. "
    "Ensure proper formatting (e.g., headings, lists, or code blocks) as necessary "
    "to represent the content effectively."
)

response = ollama.chat(
    model='llama3.2-vision',
    messages=[{'role': 'user', 'content': ocr_prompt, 'images': [image_path]}],
)

# The reply is requested as Markdown; print it as plain text to see the raw markup.
print(response.message.content)

## Tool Calling

### Demo

In [9]:
import yfinance as yf
import ollama

In [10]:
def get_stock_price(ticker: str) -> float:
    """Get the most recent closing price for a stock ticker.

    Args:
        ticker: Stock ticker symbol, e.g. 'AAPL' for Apple.

    Returns:
        float: Closing price from the last row of the one-day price history.

    Raises:
        IndexError: If no price data is returned for the ticker.
    """
    # NOTE(review): when this function is passed through `tools=[...]`, the ollama
    # client is expected to build the tool description from the signature and the
    # docstring above — confirm against the ollama-python documentation.
    closes = yf.Ticker(ticker).history(period='1d')['Close']

    # Give an empty result a readable error instead of an opaque out-of-bounds one.
    if closes.empty:
        raise IndexError(f"No price data returned for ticker {ticker!r}")

    # Return a built-in float, as annotated, rather than a NumPy scalar.
    return float(closes.iloc[-1])

In [None]:
# Ask a question the model cannot answer by itself and offer it the Python function
# as a tool (the function reference is passed directly).
response = ollama.chat(
    model='llama3.2',
    tools=[get_stock_price],
    messages=[{'role': 'user', 'content': 'What is the stock price of Apple?'}],
)

In [None]:
# Dump the response as a dict to inspect its fields.
print(dict(response))

In [None]:
# Registry mapping the tool names the model may request to local callables.
available_functions = {'get_stock_price': get_stock_price}

# `tool_calls` can be empty/None when no tool was requested, hence the `or []`.
for tool in response.message.tool_calls or []:
    function_to_call = available_functions.get(tool.function.name)

    # Unknown tool name: report it and move on to the next request.
    if not function_to_call:
        print('Function not found:', tool.function.name)
        continue

    # Known tool: show the arguments, then run the function with them.
    print('Arguments:', tool.function.arguments)
    print('Function output:', function_to_call(**tool.function.arguments))