In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install ftfy regex tqdm
! pip install git+https://github.com/openai/CLIP.git

Collecting ftfy
  Downloading ftfy-6.0.3.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 2.0 MB/s eta 0:00:011
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-6.0.3-py3-none-any.whl size=41933 sha256=f8b8ed4a3f3b8ca7bee1bb49fadc9e96e6783eeffba1d534a6da51ada7104fce
  Stored in directory: /root/.cache/pip/wheels/19/f5/38/273eb3b5e76dfd850619312f693716ac4518b498f5ffb6f56d
Successfully built ftfy
Installing collected packages: ftfy
Successfully installed ftfy-6.0.3
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-opzxh8dq
  Running command git clone -q https://github.com/openai/CLIP.git /tmp/pip-req-build-opzxh8dq
Building wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369090 sha256=f7f07a21eb168142460

In [1]:
import numpy as np
import torch
import pickle
import itertools
import os
import random
import cv2
from PIL import Image
from torch import nn
from torch.nn import functional as F
from torch.cuda.amp import GradScaler, autocast
from sklearn.model_selection import train_test_split
import tqdm
import matplotlib.pyplot as plt
import emblaze

from torch.utils.data import Dataset, TensorDataset, DataLoader

print("Torch version:", torch.__version__)

assert torch.__version__.split(".") >= ["1", "7", "1"], "PyTorch 1.7.1 or later is required"

Torch version: 1.7.1


In [2]:
import clip

clip.available_models()

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'ViT-B/32', 'ViT-B/16']

In [3]:
model, preprocess = clip.load("ViT-B/32")
input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{np.sum([int(np.prod(p.shape)) for p in model.parameters()]):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

Model parameters: 151,277,313
Input resolution: 224
Context length: 77
Vocab size: 49408


In [4]:
# CLIP has some layers explicitly parameterized using fp16 values. We need to
# convert them back to fp32 in order to use automatic mixed-precision training
def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp32"""

    def _convert_weights_to_fp32(l):
        if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            l.weight.data = l.weight.data.float()
            if l.bias is not None:
                l.bias.data = l.bias.data.float()

        if isinstance(l, nn.MultiheadAttention):
            for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
                tensor = getattr(l, attr)
                if tensor is not None:
                    tensor.data = tensor.data.float()

        for name in ["text_projection", "proj"]:
            if hasattr(l, name):
                attr = getattr(l, name)
                if attr is not None:
                    attr.data = attr.data.float()

    model.apply(_convert_weights_to_fp32)

convert_weights(model)

In [6]:
import gc
gc.collect()
torch.cuda.empty_cache()

# Dataset Preprocessing

In [7]:
BASE_DIR = "../dataset/"
DATASET_DIR = BASE_DIR + "text_data/"
IMAGES_DIR = BASE_DIR + "images/all_items/"
POSITIVE_SIZE = None # We might only use a subset of the positive pairs
TRAIN_TEST_RATIO = 0.33

In [8]:
def preprocess_img(path):
  img = cv2.imread(path)
  img = cv2.resize(img, (256, 256))
  img = img.astype(np.float32) / 255
  return img

def read_pickle(fn):
	with open(fn, "rb") as f:
		return pickle.load(f)

In [9]:
def random_negative_sampling(products, all_positive_pairs, count=None, random_state=None):
  selected_negative_pairs = []
  if random_state is not None: random.seed(random_state)
  while len(selected_negative_pairs) < (count or len(all_positive_pairs)):
    random_pair = tuple(random.sample(products, 2))
    if random_pair in all_positive_pairs:
      continue
    else:
      selected_negative_pairs.append(random_pair)
  return selected_negative_pairs
  
# {room image url -> string of room category}; e.g.: 'ikea-town-and-country__1364308377063-s4.jpg': 'Living Room'
room_categories = read_pickle(DATASET_DIR + "categories_dict.p")
# {item image ID -> string of item category}; e.g.: '291.292.29': 'Footstool',
item_categories = read_pickle(DATASET_DIR + "categories_images_dict.p")
# {item image id -> dict of descriptions}; e.g. '202.049.06': {'color': 'Grey,black','desc': 'View more product information Concealed press studs keep the quilt in place','img': 'images/objects/202.049.06.jpg','name': 'GURLI','size': '120x180 cm','type': 'Throw'},
item_property = read_pickle(DATASET_DIR + "products_dict.p")
# {item image url -> {description, name}}; e.g: '/static/images/902.592.50.jpg': {'desc': 'The high pile dampens sound and provides a soft surface to walk on.','name': 'GSER'},
item_to_description = read_pickle(DATASET_DIR + "img_to_desc.p")
# {item image url -> list of corresponding room image url}; e.g.: 'images/001.509.85.jpg': ['images/room_scenes/ikea-wake-up-and-grow__1364335362013-s4.jpg','images/room_scenes/ikea-wake-up-and-grow-1364335370196.jpg'],
item_to_rooms_map = read_pickle(DATASET_DIR + "item_to_room.p")
# {room image url -> list of items}; e.g.: 'ikea-work-from-home-in-perfect-harmony__1364319311386-s4.jpg': ['desk','chair']
room_to_item_categories = read_pickle(DATASET_DIR + "room_to_items.p")

# Some simple preprossing
item_to_info = {key : value["type"] + " " +
                             value["desc"]
                       for key, value in item_property.items()} # remove view more info

room_to_items = {}

for item_url, room_url_list in item_to_rooms_map.items():
  item_id = item_url.split("/")[-1].split(".jpg")[0]
  if not os.path.exists(IMAGES_DIR + item_id + ".jpg"):
      print(item_url + " does not exist")
      continue

  for room_url in room_url_list:
    room_id = room_url.split("/")[-1].split(".jpg")[0]
    if room_id not in room_to_items:
      room_to_items[room_id] = [item_id]
    else:
      room_to_items[room_id].append(item_id)

all_positive_pairs = set()
for room, item_id_list in room_to_items.items():
  pairs_for_current_room = list(itertools.combinations(room_to_items[room], 2)) # n choose 2
  all_positive_pairs |= set(pairs_for_current_room)

# if POSITIVE_SIZE is not None:
#     sampled_positives = all_positive_pairs[:POSITIVE_SIZE] # Uncomment to subsample
# else:
#     sampled_positives = all_positive_pairs
# all_pairs = sampled_positives + random_negative_sampling(all_positive_pairs, count=len(sampled_positives))
# y = np.array([1 for _ in range(len(all_positive_pairs))] + 
#              [0 for _ in range(len(all_positive_pairs))])
# train_pairs, val_pairs, y_train, y_val = train_test_split(all_pairs, y, test_size=TRAIN_TEST_RATIO, random_state=517)

# # Shuffle now so batches are not all positive or all negative
# train_indices = np.random.permutation(np.arange(len(train_pairs)))
# train_pairs = [train_pairs[i] for i in train_indices]
# y_train = y_train[train_indices]

# val_indices = np.random.permutation(np.arange(len(val_pairs)))
# val_pairs = [val_pairs[i] for i in val_indices]
# y_val = y_val[val_indices]

# len(train_pairs), len(val_pairs), y_train[:10], y_val[:10]

images/890.333.75.jpg does not exist
images/991.333.98.jpg does not exist
images/990.612.97.jpg does not exist


In [10]:
# To read the validation sets only
with open(BASE_DIR + "val_data.pkl", "rb") as file:
    val_pairs, y_val = pickle.load(file)

In [32]:
# Optional subsampling
SUBSAMPLE_SIZE = 32
val_pairs = val_pairs[:SUBSAMPLE_SIZE]
y_val = y_val[:SUBSAMPLE_SIZE]

In [11]:
class FurnitureImagePairsDataset(Dataset):
    """Dataset containing pairs of furniture items."""

    def __init__(self, image_path, pairs, labels):
        """
        Args:
            image_path (string): Path to the directory containing images.
            pairs (list of tuples of strings): Pairs of image IDs to be used as training samples.
            labels (array of integers): Labels for the training samples.
        """
        super(FurnitureImagePairsDataset, self).__init__()
        self.image_ids = list(set(x for pair in pairs for x in pair))
        self.index_mapping = {image_id: i for i, image_id in enumerate(self.image_ids)}
        self.images = [preprocess(Image.open(image_path + image_id + ".jpg")) for image_id in tqdm.tqdm(self.image_ids)]
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if isinstance(idx, (list, tuple)):
            x1, x2, y = zip(*[self[i] for i in idx])
            return torch.stack(x1), torch.stack(x2), torch.from_numpy(np.array(y))

        pair = self.pairs[idx]
        return self.images[self.index_mapping[pair[0]]], self.images[self.index_mapping[pair[1]]], self.labels[idx]

In [13]:
product_ids = sorted(list(set(x for pair in val_pairs for x in pair)))
val_images = torch.stack([preprocess(Image.open(IMAGES_DIR + image_id + ".jpg")) for image_id in tqdm.tqdm(product_ids)])

def tokenize(text):
  try:
      return clip.tokenize(text)
  except:
      return clip.tokenize(' '.join(text.split()[:50]))

val_texts = torch.cat([tokenize(item_to_info[id]) for id in product_ids], 0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 718/718 [00:05<00:00, 136.57it/s]


In [14]:
BATCH_SIZE = 32

val_data = TensorDataset(val_images, val_texts)
val_loader = DataLoader(val_data, batch_size=BATCH_SIZE)

print(len(val_loader))

23


# Ranking Evaluation

Build a dataset where a random sample of "premise" products is compared against all "hypothesis" products in the validation set.

In [12]:
# Build pairs to rank

NUM_QUERIES = 50

val_products = sorted(list(set(x for pair in val_pairs for x in pair)))

np.random.seed(1234)
premise_products = np.random.choice(val_products, size=min(NUM_QUERIES, len(val_products)), replace=False)
hypothesis_products = val_products

ranking_pairs = list(itertools.product(premise_products, hypothesis_products))
print(len(ranking_pairs), "pairs,", len(hypothesis_products), "products per query")

35900 pairs, 718 products per query


In [13]:
# Get the ground-truth

ground_truth_map = {}
for item_url, room_url_list in item_to_rooms_map.items():
    item_id = item_url.split("/")[-1].split(".jpg")[0]
    if item_id not in premise_products: continue

    for room_url in room_url_list:
        room_id = room_url.split("/")[-1].split(".jpg")[0]
        ground_truth_map[item_id] = ground_truth_map.get(item_id, set()) | set(room_to_items[room_id])
ground_truth_lists = [ground_truth_map[item_id] for item_id in premise_products]

# plt.hist([len(x) for x in ground_truth_lists], bins=np.arange(0, max(len(x) for x in ground_truth_lists), 5));

In [14]:
X_rank_image = FurnitureImagePairsDataset(IMAGES_DIR, ranking_pairs, np.zeros(len(ranking_pairs)))

def tokenize(text):
  try:
      return clip.tokenize(text)
  except:
      return clip.tokenize(' '.join(text.split()[:50]))

X_rank_text_premise = torch.cat([tokenize(item_to_info[id]) for id, _ in ranking_pairs], 0)
X_rank_text_hypothesis = torch.cat([tokenize(item_to_info[id]) for _, id in ranking_pairs], 0)
X_rank_text_premise.size()

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 718/718 [00:05<00:00, 136.92it/s]


torch.Size([35900, 77])

In [15]:
BATCH_SIZE = 32

img_ranking_data = X_rank_image # TensorDataset(torch.from_numpy(X_val_image_premise), torch.from_numpy(X_val_image_hypothesis), torch.from_numpy(y_val))
text_ranking_data = TensorDataset(X_rank_text_premise, X_rank_text_hypothesis, torch.zeros(len(ranking_pairs)))

text_ranking_loader = DataLoader(text_ranking_data, batch_size=BATCH_SIZE)
img_ranking_loader = DataLoader(img_ranking_data, batch_size=BATCH_SIZE)

print(len(text_ranking_loader), len(img_ranking_loader))

1122 1122


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [18]:
model.eval()
ranking_results = []
with torch.no_grad():
    for lstm, cnn in tqdm.tqdm(zip(text_ranking_loader, img_ranking_loader), total=len(text_ranking_loader)):
        lstm_inp1, lstm_inp2, _ = lstm
        cnn_inp1, cnn_inp2, _ = cnn
        lstm_inp1, lstm_inp2 = lstm_inp1.to(device), lstm_inp2.to(device)
        cnn_inp1, cnn_inp2 = cnn_inp1.to(device), cnn_inp2.to(device)
        emb_1 = torch.cat((model.encode_image(cnn_inp1), model.encode_text(lstm_inp1)), 1)
        emb_2 = torch.cat((model.encode_image(cnn_inp2), model.encode_text(lstm_inp2)), 1)
        output = torch.sum(emb_1 * emb_2, axis=1) / (torch.linalg.norm(emb_1, 2, 1) * torch.linalg.norm(emb_2, 2, 1))
        ranking_results.append(output.cpu().numpy())
ranking_results = np.concatenate(ranking_results).reshape(len(premise_products), len(hypothesis_products))
print(ranking_results.shape)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1122/1122 [02:11<00:00,  8.51it/s]

(50, 718)





In [19]:
import math
class Evaluator:
    def __init__(self, GroundTruth):
      self.GroundTruth = GroundTruth

    def NDCG_Eval(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      DCGScore = 0
      result = []
      for i, item in enumerate(sortedRankResult[:topk]):
        if item[0] in self.GroundTruth:
          result.append((item, i))
      DCGScore = sum([item[0][1]/math.log(item[1]+2, 2) for item in result])
      IDCGScore = sum([1/math.log(i+2,2) for i in range(topk)])
      NDCG = DCGScore / IDCGScore

      return NDCG
    
    def Score_Eval(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      return sum(i[1] for i in sortedRankResult[:topk] if i[0] in self.GroundTruth) / topk
    
    def Precision(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      topkresult = sortedRankResult[:topk]
      return len([i for i in sortedRankResult[:topk] if i[0] in self.GroundTruth]) / len(topkresult)

    def Recall(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      topkresult = sortedRankResult[:topk]
      return len([i for i in sortedRankResult[:topk] if i[0] in self.GroundTruth]) / len(self.GroundTruth)
    
    def FValue(self, rankresult, topk):
      precision = self.Precision(rankresult, topk)
      recall = self.Recall(rankresult, topk)
      return 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

In [21]:
# Now use the evaluator
TOP_K = 10

ndcg = np.zeros(len(premise_products))
score = np.zeros(len(premise_products))
precision = np.zeros(len(premise_products))
recall = np.zeros(len(premise_products))
fvalue = np.zeros(len(premise_products))
for i, (ground_truth, rankings) in enumerate(zip(ground_truth_lists, ranking_results)):
    evaluator = Evaluator(ground_truth)
    rankings = {product: output for product, output in zip(hypothesis_products, rankings)}
    ndcg[i] = evaluator.NDCG_Eval(rankings, TOP_K)
    score[i] = evaluator.Score_Eval(rankings, TOP_K)
    precision[i] = evaluator.Precision(rankings, TOP_K)
    recall[i] = evaluator.Recall(rankings, TOP_K)
    fvalue[i] = evaluator.FValue(rankings, TOP_K)
print("NDCG: {:.4f} (95% CI {:.3f}-{:.3f})".format(ndcg.mean(), ndcg.mean() - 1.96 * ndcg.std(), ndcg.mean() + 1.96 * ndcg.std()))
print("Score: {:.4f} (95% CI {:.3f}-{:.3f})".format(score.mean(), score.mean() - 1.96 * score.std(), score.mean() + 1.96 * score.std()))
print("Precision: {:.4f} (95% CI {:.3f}-{:.3f})".format(precision.mean(), precision.mean() - 1.96 * precision.std(), precision.mean() + 1.96 * precision.std()))
print("Recall: {:.4f} (95% CI {:.3f}-{:.3f})".format(recall.mean(), recall.mean() - 1.96 * recall.std(), recall.mean() + 1.96 * recall.std()))
print("FValue: {:.4f} (95% CI {:.3f}-{:.3f})".format(fvalue.mean(), fvalue.mean() - 1.96 * fvalue.std(), fvalue.mean() + 1.96 * fvalue.std()))

with open("untuned_rankings_{}_queries.pkl".format(len(premise_products)), "wb") as file:
    pickle.dump({
        "ndcg": ndcg,
        "score": score,
        "precision": precision,
        "recall": recall,
        "fvalue": fvalue
    }, file)

NDCG: 0.2389 (95% CI 0.133-0.345)
Score: 0.1212 (95% CI 0.027-0.216)
Precision: 0.1240 (95% CI 0.016-0.232)
Recall: 0.0760 (95% CI -0.008-0.160)
FValue: 0.0886 (95% CI 0.014-0.164)
