In [2]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Install ftfy, regex and tqdm, then CLIP itself straight from its GitHub repository.
# NOTE(review): none of these installs is version-pinned — consider pinning for reproducibility.
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[?25l[K     |█████                           | 10 kB 28.8 MB/s eta 0:00:01[K     |██████████▏                     | 20 kB 24.7 MB/s eta 0:00:01[K     |███████████████▎                | 30 kB 11.1 MB/s eta 0:00:01[K     |████████████████████▍           | 40 kB 8.9 MB/s eta 0:00:01[K     |█████████████████████████▌      | 51 kB 4.2 MB/s eta 0:00:01[K     |██████████████████████████████▋ | 61 kB 4.6 MB/s eta 0:00:01[K     |████████████████████████████████| 64 kB 1.8 MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41933 sha256=9743b4c389378a42e208f9e8fc9f8a384b37f9e81d4cc40abb87a14a6cd8a121
  Stored in directory: /root/.cache/pip/wheels/19/f5/38/273eb3b5e76dfd850619312f693716ac4518b498f5ffb6f56d
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-6.0.3
Colle

In [4]:
import numpy as np
import torch
import pickle
import itertools
import os
import cv2
from PIL import Image
from torch import nn
from torch.nn import functional as F
from torch.cuda.amp import GradScaler, autocast

from torch.utils.data import TensorDataset, DataLoader

print("Torch version:", torch.__version__)

assert torch.__version__.split(".") >= ["1", "7", "1"], "PyTorch 1.7.1 or later is required"

Torch version: 1.9.0+cu111


In [5]:
import clip

# Display the model names that this CLIP installation can load.
clip.available_models()

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']

In [21]:
# Load the ViT-B/32 CLIP model together with the image transform that goes with it,
# then move the model to the GPU and switch it to eval mode.
model, preprocess = clip.load("ViT-B/32")
model.cuda().eval()

# Pull out a few architecture constants for the summary printed below.
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", format(sum(p.numel() for p in model.parameters()), ","))
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [22]:
# CLIP ships some layers with explicit fp16 parameters. Automatic mixed-precision
# training expects fp32 master weights, so those layers are cast back here.
def convert_weights(model: nn.Module):
    """Cast the applicable parameters of `model` to fp32, in place.

    Touched per submodule: weight/bias of Conv1d, Conv2d and Linear layers, the
    projection weights and biases of MultiheadAttention, and any
    `text_projection` / `proj` attribute that is not None. Returns None.
    """
    attention_attrs = (
        "in_proj_weight",
        "q_proj_weight",
        "k_proj_weight",
        "v_proj_weight",
        "in_proj_bias",
        "bias_k",
        "bias_v",
    )
    projection_attrs = ("text_projection", "proj")

    def _to_fp32(module):
        if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            module.weight.data = module.weight.data.float()
            if module.bias is not None:
                module.bias.data = module.bias.data.float()

        if isinstance(module, nn.MultiheadAttention):
            for attr_name in attention_attrs:
                tensor = getattr(module, attr_name)
                if tensor is None:
                    continue
                tensor.data = tensor.data.float()

        for attr_name in projection_attrs:
            # A missing attribute and an attribute set to None are both skipped.
            projection = getattr(module, attr_name, None)
            if projection is not None:
                projection.data = projection.data.float()

    model.apply(_to_fp32)

# Cast the loaded CLIP model's applicable parameters to fp32 (modifies `model` in place).
convert_weights(model)

In [7]:
def preprocess_img(path, size=(256, 256)):
  """Read an image with OpenCV, resize it and scale pixel values to [0, 1].

  Args:
    path: Path of the image file to read.
    size: Target size passed to cv2.resize; defaults to (256, 256).

  Returns:
    The resized image as a float32 array divided by 255.

  Raises:
    FileNotFoundError: If OpenCV could not read an image from `path`.
  """
  img = cv2.imread(path)
  if img is None:
    # cv2.imread reports a missing/unreadable file by returning None instead of
    # raising; fail here with the offending path rather than inside cv2.resize.
    raise FileNotFoundError(f"Could not read image: {path}")
  img = cv2.resize(img, size)
  return img.astype(np.float32) / 255

def read_pickle(fn):
    """Unpickle and return the object stored in the file at `fn`."""
    with open(fn, mode="rb") as stream:
        payload = pickle.load(stream)
    return payload
  
DATASET_DIR = "/content/drive/MyDrive/coursework/mmml/DecorAssistant/dataset/text_data/"
IMAGES_DIR = "/content/drive/MyDrive/coursework/mmml/DecorAssistant/dataset/images/all_items/"

# Each pickle below holds one lookup table; the examples show the expected layout.

# room image url -> room category,
#   e.g. 'ikea-town-and-country__1364308377063-s4.jpg': 'Living Room'
room_categories = read_pickle(os.path.join(DATASET_DIR, "categories_dict.p"))

# item image ID -> item category,
#   e.g. '291.292.29': 'Footstool'
item_categories = read_pickle(os.path.join(DATASET_DIR, "categories_images_dict.p"))

# item image ID -> dict of properties,
#   e.g. '202.049.06': {'color': 'Grey,black', 'desc': '...', 'img': 'images/objects/202.049.06.jpg',
#                       'name': 'GURLI', 'size': '120x180 cm', 'type': 'Throw'}
item_property = read_pickle(os.path.join(DATASET_DIR, "products_dict.p"))

# item image url -> {description, name},
#   e.g. '/static/images/902.592.50.jpg': {'desc': '...', 'name': 'GSER'}
item_to_description = read_pickle(os.path.join(DATASET_DIR, "img_to_desc.p"))

# item image url -> list of room image urls the item appears in,
#   e.g. 'images/001.509.85.jpg': ['images/room_scenes/ikea-wake-up-and-grow__1364335362013-s4.jpg', ...]
item_to_rooms_map = read_pickle(os.path.join(DATASET_DIR, "item_to_room.p"))

# room image url -> list of item categories,
#   e.g. 'ikea-work-from-home-in-perfect-harmony__1364319311386-s4.jpg': ['desk', 'chair']
room_to_item_categories = read_pickle(os.path.join(DATASET_DIR, "room_to_items.p"))

# Some simple preprocessing: describe every item by its type followed by its description.
item_to_info = {
    item_key: props["type"] + " " + props["desc"]
    for item_key, props in item_property.items()
}

# Invert the item -> rooms mapping into room -> items. Both sides are keyed by
# bare IDs: the last path component of the url with the ".jpg" suffix removed.
room_to_items = {}

for item_url, room_url_list in item_to_rooms_map.items():
  item_id = item_url.split("/")[-1].split(".jpg")[0]

  for room_url in room_url_list:
    room_id = room_url.split("/")[-1].split(".jpg")[0]
    # setdefault creates the list the first time a room is seen and appends in
    # the same step. The previous if/else created the list but only appended in
    # the else branch, silently dropping the first item of every room.
    room_to_items.setdefault(room_id, []).append(item_id)

# Every unordered pair of items that share a room counts as a positive pair.
all_positive_pairs = [
    pair
    for items_in_room in room_to_items.values()
    for pair in itertools.combinations(items_in_room, 2)
]


# Work on a fixed 150-pair slice of the positives.
train_pairs = all_positive_pairs[500:650]
# NOTE(review): validation aliases the training pairs, so it is not a held-out set.
val_pairs = train_pairs

In [8]:
def _load_clip_images(item_ids):
    """Open each item's JPEG under IMAGES_DIR, apply `preprocess` and stack the results."""
    return torch.stack([
        preprocess(Image.open(IMAGES_DIR + item_id + ".jpg"))
        for item_id in item_ids
    ])


# Split every (premise, hypothesis) pair into two parallel ID lists.
image_premise_id_list = [pair[0] for pair in train_pairs]
image_hypothesis_id_list = [pair[1] for pair in train_pairs]

X_image_premise = _load_clip_images(image_premise_id_list)
X_image_hypothesis = _load_clip_images(image_hypothesis_id_list)

# Every pair receives the same label row [0, 1].
y = np.array([[0, 1]] * len(train_pairs))

# Text side: look up each item's description and tokenize it with CLIP's tokenizer.
premise_texts = [item_to_info[item_id] for item_id in image_premise_id_list]
hypothesis_texts = [item_to_info[item_id] for item_id in image_hypothesis_id_list]

X_text_premise = clip.tokenize(premise_texts)
X_text_hypothesis = clip.tokenize(hypothesis_texts)

In [10]:
BATCH_SIZE = 32

# One dataset per modality; each bundles premise, hypothesis and the labels from `y`.
text_train_data = TensorDataset(X_text_premise, X_text_hypothesis, torch.from_numpy(y))
img_train_data = TensorDataset(X_image_premise, X_image_hypothesis, torch.from_numpy(y))

# Validation simply aliases the training datasets.
text_val_data, img_val_data = text_train_data, img_train_data

# No shuffling is requested, so the text and image loaders yield samples in the same order.
text_train_loader, img_train_loader = (
    DataLoader(dataset, batch_size=BATCH_SIZE)
    for dataset in (text_train_data, img_train_data)
)
text_val_loader, img_val_loader = (
    DataLoader(dataset, batch_size=BATCH_SIZE)
    for dataset in (text_val_data, img_val_data)
)

# Number of batches per loader.
print(len(text_train_loader), len(img_train_loader))
print(len(text_val_loader), len(img_val_loader))

5 5
5 5


In [29]:
# Push a single example through each encoder to see the embedding shapes.
print(model.encode_image(X_image_premise[:1].cuda()).shape)
print(model.encode_text(X_text_premise[:1].cuda()).shape)

torch.Size([1, 512])
torch.Size([1, 512])


In [23]:
class CLIPIKEA(nn.Module):
    """Pair classifier built on top of a CLIP model's text and image embeddings.

    The text and image embeddings of two items are concatenated and passed
    through a small two-layer head that produces `n_out` logits.

    Args:
        clip_model: Object exposing `encode_text` and `encode_image`.
        embedding_dim: Size of each embedding returned by `clip_model`.
        n_out: Number of output logits.
        hidden_dim: Width of the hidden layer of the head (default 256).
    """

    def __init__(self, clip_model, embedding_dim, n_out, hidden_dim=256):
        super().__init__()

        self.clip_model = clip_model
        # Four embeddings (two texts + two images) are concatenated before the head.
        self.combined_fc1 = nn.Linear(embedding_dim * 4, hidden_dim)
        self.output_fc = nn.Linear(hidden_dim, n_out)

    def forward(self, txt_1, txt_2, img_1, img_2):
        """Return logits of shape (batch, n_out) for a batch of item pairs.

        Args:
            txt_1, txt_2: Inputs for `clip_model.encode_text`, one per item of the pair.
            img_1, img_2: Inputs for `clip_model.encode_image`, one per item of the pair.
        """
        # Autocast is explicitly switched off while the CLIP encoders run; only
        # the head below is subject to any autocast context set by the caller.
        with autocast(enabled=False):
            txt_emb_1 = self.clip_model.encode_text(txt_1)
            txt_emb_2 = self.clip_model.encode_text(txt_2)
            img_emb_1 = self.clip_model.encode_image(img_1)
            img_emb_2 = self.clip_model.encode_image(img_2)

        # Concatenate along the feature dimension: (batch, 4 * embedding_dim).
        all_emb = torch.cat((txt_emb_1, txt_emb_2, img_emb_1, img_emb_2), 1)
        x_comb = F.relu(self.combined_fc1(all_emb))
        return self.output_fc(x_comb)

In [24]:
# The head emits one logit per label column.
output_size = y.shape[1]
print(output_size)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# Wrap CLIP (512-dimensional embeddings) with the pair-classification head.
full_model = CLIPIKEA(model, 512, output_size).to(device)

lr = 0.001
# BCE on logits; nn.MultiLabelSoftMarginLoss was the alternative tried earlier.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(
    params=full_model.parameters(),
    lr=lr,
    weight_decay=1e-5,
)

2
cuda


In [28]:
epochs = 1
grad_clip = 5

# Scale gradients to use fp16 training
scaler = GradScaler()


def _soft_accuracy(logits, labels):
    """Return 1 - mean(|sigmoid(logits) - labels|) as a Python float.

    This is a soft agreement score between predicted probabilities and labels,
    not a thresholded accuracy.
    """
    return 1.0 - torch.abs(torch.sigmoid(logits.float()) - labels.float()).mean().item()


full_model.train()
for i in range(epochs):
    total_acc_train = 0.0
    total_loss_train = 0.0

    # The text and image loaders are iterated in lock-step; each yields
    # (premise, hypothesis, labels) for the same pairs.
    for text_batch, img_batch in zip(text_train_loader, img_train_loader):
        txt_1, txt_2, labels = (t.to(device) for t in text_batch)
        # The image dataset carries its own copy of the labels; only the images are used.
        img_1, img_2, _ = img_batch
        img_1, img_2 = img_1.to(device), img_2.to(device)
        labels = labels.float()

        full_model.zero_grad()
        with autocast():
            output = full_model(txt_1, txt_2, img_1, img_2)
            # No squeeze(): output and labels are both (batch, n_out). Squeezing
            # would drop the batch dimension for a single-sample batch and make
            # the shapes disagree.
            loss = criterion(output, labels)

        scaler.scale(loss).backward()

        # Gradients have to be unscaled before their norm can be clipped.
        scaler.unscale_(optimizer)
        nn.utils.clip_grad_norm_(full_model.parameters(), grad_clip)
        scaler.step(optimizer)
        scaler.update()

        # Accumulate plain floats so no graph or device tensor is kept alive.
        total_acc_train += _soft_accuracy(output.detach(), labels)
        total_loss_train += loss.item()

    train_acc = total_acc_train / len(text_train_loader)
    train_loss = total_loss_train / len(text_train_loader)

    # NOTE(review): the validation loaders are built from the training data, so
    # these numbers are not a held-out estimate.
    full_model.eval()
    total_acc_val = 0.0
    total_loss_val = 0.0
    with torch.no_grad():
        for text_batch, img_batch in zip(text_val_loader, img_val_loader):
            txt_1, txt_2, labels = (t.to(device) for t in text_batch)
            img_1, img_2, _ = img_batch
            img_1, img_2 = img_1.to(device), img_2.to(device)
            labels = labels.float()

            output = full_model(txt_1, txt_2, img_1, img_2)
            total_loss_val += criterion(output, labels).item()
            total_acc_val += _soft_accuracy(output, labels)
    val_acc = total_acc_val / len(text_val_loader)
    val_loss = total_loss_val / len(text_val_loader)
    print(f'Epoch {i+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    full_model.train()
    torch.cuda.empty_cache()

Epoch 1: train_loss: 0.0000 train_acc: 1.0000 | val_loss: 0.0000 val_acc: 1.0000
