# Project Demo

In [8]:
# imports and compute-device selection
import sys
from PIL import Image
import requests
import torch
from torchvision import transforms
from torchvision.transforms.functional import InterpolationMode
from models.blip import blip_decoder
from collections import defaultdict

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


In [19]:
def load_demo_image(img_path, image_size, device):
    """Load an image, show a half-size preview, and return it as a model input tensor.

    Args:
        img_path: Path of the image file to open.
        image_size: Target side length in pixels; the image is resized to
            (image_size, image_size) with bicubic interpolation.
        device: torch device the resulting tensor is moved to.

    Returns:
        The normalized image tensor with a leading dimension added by
        ``unsqueeze(0)``, placed on ``device``.
    """
    # convert() returns a new, fully loaded image, so the file handle can be
    # released as soon as the conversion is done.
    with Image.open(img_path) as img_file:
        raw_image = img_file.convert('RGB')

    # Half-size preview in the notebook output; clamp each side so that a
    # 1-pixel dimension does not shrink to 0.
    # `display` is expected to come from the notebook (IPython) namespace.
    w, h = raw_image.size
    display(raw_image.resize((max(1, w // 2), max(1, h // 2))))

    transform = transforms.Compose([
        transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
        transforms.ToTensor(),
        # Per-channel mean and std handed to Normalize.
        transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
    ])
    return transform(raw_image).unsqueeze(0).to(device)

# Image Captioning
Generate captions for UI screenshots using baseline and fine-tuned BLIP models.

In [5]:
# Image side length in pixels (384 is also the size used when captioning).
image_size = 384
# Path to one sample screenshot.
img_path = '/Users/michaelmbajwa/Desktop/DL_Project/Image_Testing/501.jpg'

In [12]:
# Base locations shared by the entries below; edit these to relocate the demo.
DOWNLOADS_DIR = '/Users/michaelmbajwa/Downloads'
PROJECT_DIR = '/Users/michaelmbajwa/Desktop/DL_Project'
IMAGE_DIR = PROJECT_DIR + '/Image_Testing'
# Every run directory listed below keeps its checkpoint at this relative location.
BEST_CHECKPOINT = 'output/Caption_coco/checkpoint_best.pth'

# paths to the finetuned models and baseline BLIP models
paths = [
    DOWNLOADS_DIR + '/model_base_14M.pth',
    PROJECT_DIR + '/BLIP_w:_ViT-B/BLIP_14M_25_EPOCHS/' + BEST_CHECKPOINT,
    DOWNLOADS_DIR + '/model_base.pth',
    PROJECT_DIR + '/BLIP_w:_ViT-B/BLIP_25_EPOCHS/' + BEST_CHECKPOINT,
    DOWNLOADS_DIR + '/model_base_capfilt_large.pth',
    PROJECT_DIR + '/BLIP_w:_ViT-B_CapFilt-L/BLIP_25_EPOCH/' + BEST_CHECKPOINT,
    PROJECT_DIR + '/BLIP_w:_ViT-B_CapFilt-L/BLIP_100_EPOCHS/' + BEST_CHECKPOINT,
    DOWNLOADS_DIR + '/model_large.pth',
    PROJECT_DIR + '/BLIP_w:_ViT-L/BLIP_25_EPOCHS/' + BEST_CHECKPOINT,
    PROJECT_DIR + '/BLIP_w:_ViT-L/BLIP_100_EPOCHS/' + BEST_CHECKPOINT,
]

# Paths to images for testing
images = [
    IMAGE_DIR + '/501.jpg',
    IMAGE_DIR + '/505.jpg',
    IMAGE_DIR + '/15697.jpg',
    IMAGE_DIR + '/17832.jpg',
    IMAGE_DIR + '/71268.jpg',
    IMAGE_DIR + '/72164.jpg',
    IMAGE_DIR + '/72201.jpg',
    IMAGE_DIR + '/IMG_D89B530EB834-1.jpeg',
    DOWNLOADS_DIR + '/PHOTO-2022-06-01-15-05-54.jpg',
]

In [None]:
# Caption every test image with every checkpoint.
# `output` maps image path -> {checkpoint path -> value returned by model.generate}.

# Preprocess (and preview) each image once instead of once per checkpoint.
image_tensors = {
    path: load_demo_image(img_path=path, image_size=image_size, device=device)
    for path in images
}
output = {path: {} for path in images}

# Checkpoints form the outer loop so each one is read from disk a single time
# and then reused for all images.
for model_path in paths:
    # Checkpoints under a 'ViT-L' directory and the downloaded model_large.pth
    # are built with the large ViT; everything else uses the base ViT.
    is_large = 'ViT-L' in model_path or model_path.endswith('/model_large.pth')
    model = blip_decoder(
        pretrained=model_path,
        image_size=image_size,
        vit='large' if is_large else 'base',
    )
    model.eval()
    model = model.to(device)

    with torch.no_grad():
        for img_path, image in image_tensors.items():
            # beam search
            caption = model.generate(image, sample=False, num_beams=3, max_length=20, min_length=5)
            output[img_path][model_path] = caption

    # Drop the reference before the next checkpoint is constructed so two
    # models are not kept alive at the same time.
    del model

In [23]:
import json

# Write the nested {image path: {checkpoint path: caption}} results to disk as JSON.
results_file = "/Users/michaelmbajwa/Downloads/results.json"
with open(results_file, "w") as fh:
    json.dump(output, fh)