In [1]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
!gunzip GoogleNews-vectors-negative300.bin.gz

--2021-12-01 01:18:22--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.166.176
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.166.176|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-12-01 01:18:56 (46.4 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [38]:
import cv2
import numpy as np
import pickle
import os
import itertools
import tqdm
from PIL import Image

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torchvision import transforms

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import KeyedVectors
import random
from sklearn.model_selection import train_test_split
# random.seed(517)

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

In [208]:
# Global Path Vairables
# ROOT_DIR =  "drive/MyDrive/DecorAssist/"
# DATASET_DIR = ROOT_DIR + "IKEA/text_data/"
# IMAGES_DIR = ROOT_DIR + "IKEA/images/all_items/"
ROOT_DIR =  "/content/drive/Othercomputers/My MacBook Pro/GitHub/DecorAssistant"
DATASET_DIR = ROOT_DIR + "/dataset/text_data/"
IMAGES_DIR = ROOT_DIR + "/dataset/images/all_items/"

# Global Parameter Variables
MAX_SEQUENCE_LENGTH = 1950
NUM_WORDS_TOKENIZER = 50000
EMBEDDING_DIM = 300
BATCH_SIZE = 16
POSITIVE_SIZE = 1000 # We might only use a subset of the positive pairs
TRAIN_TEST_RATIO = 0.33

# Model Hyperparameters
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
LEARNING_RATE = 2e-5 # 0.001
HIDDEN_DIM = 64 # 64
N_LAYERS = 2 # 2
EPOCHS = 10
CLIP = 5
DROPOUT = 0.1

In [138]:
def preprocess_img(path):
  img = cv2.imread(path)
  try:
    img = cv2.resize(img, (28, 28))
  except:
    print(path)
  img = img.astype(np.float32) / 255
  img = np.reshape(img, (3, 28 ,28))
  # print(img.shape)
  return img


def read_pickle(fn):
	with open(fn, "rb") as f:
		return pickle.load(f)


# Train-val split that does not share products between training and validation sets.
def generate_product_limited_samples(products, all_positive_pairs, random_state=None):
    """
    Generates positive and negative examples for the given products using shared
    occurence in rooms to indicate whether two products are compatible.

    products: A sequence of product IDs; ALL positive and negative pairs must
        contain only these product IDs.
    all_positive_pairs: A set of product ID pairs that are positive.
    
    Returns: A tuple (x, y), where x is a sequence of product ID pairs and y is
        the array of [0 or 1] labels indicating presence in all_positive_pairs.
    """
    product_set = set(products)
    within_positive_pairs = [p for p in sorted(all_positive_pairs) if p[0] in product_set and p[1] in product_set]
    negative_pairs = random_negative_sampling(products, all_positive_pairs, count=len(within_positive_pairs), random_state=random_state)
    x = within_positive_pairs + negative_pairs
    y = np.array([1] * len(within_positive_pairs) + [0] * len(negative_pairs))
    if random_state is not None: np.random.seed(random_state)
    indices = np.random.permutation(np.arange(len(x)))
    return [x[i] for i in indices], y[indices]


def random_negative_sampling(products, all_positive_pairs, count=None, random_state=None):
  selected_negative_pairs = []
  if random_state is not None: random.seed(random_state)
  while len(selected_negative_pairs) < (count or len(all_positive_pairs)):
    random_pair = tuple(random.sample(products, 2))
    if random_pair in all_positive_pairs:
      continue
    else:
      selected_negative_pairs.append(random_pair)
  return selected_negative_pairs


def get_embedding_matrix(word_index, weights_path="/content/GoogleNews-vectors-negative300.bin"):
  word2vecDict = KeyedVectors.load_word2vec_format(weights_path, binary=True)
  embed_size = 300
  embeddings_index = dict()
  for word in word2vecDict.wv.vocab:
    embeddings_index[word] = word2vecDict.word_vec(word)
  print("Loaded " + str(len(embeddings_index)) + " word vectors.")
        
  embedding_matrix = 1 * np.random.randn(len(word_index)+1, embed_size)

  embeddedCount = 0
  for word, i in word_index.items():
    i-=1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
      embedding_matrix[i] = embedding_vector
      embeddedCount+=1
  print("total embedded:", embeddedCount, "common words")
  del(embeddings_index)
  return embedding_matrix

# Build Train and Eval Set

#### Load raw data

In [139]:
# {room image url -> string of room category}; e.g.: 'ikea-town-and-country__1364308377063-s4.jpg': 'Living Room'
room_categories = read_pickle(DATASET_DIR + "categories_dict.p")
# {item image ID -> string of item category}; e.g.: '291.292.29': 'Footstool',
item_categories = read_pickle(DATASET_DIR + "categories_images_dict.p")
# {item image id -> dict of descriptions}; e.g. '202.049.06': {'color': 'Grey,black','desc': 'View more product information Concealed press studs keep the quilt in place','img': 'images/objects/202.049.06.jpg','name': 'GURLI','size': '120x180 cm','type': 'Throw'},
item_property = read_pickle(DATASET_DIR + "products_dict.p")
# {item image url -> {description, name}}; e.g: '/static/images/902.592.50.jpg': {'desc': 'The high pile dampens sound and provides a soft surface to walk on.','name': 'GSER'},
item_to_description = read_pickle(DATASET_DIR + "img_to_desc.p")
# {item image url -> list of corresponding room image url}; e.g.: 'images/001.509.85.jpg': ['images/room_scenes/ikea-wake-up-and-grow__1364335362013-s4.jpg','images/room_scenes/ikea-wake-up-and-grow-1364335370196.jpg'],
item_to_rooms_map = read_pickle(DATASET_DIR + "item_to_room.p")
# {room image url -> list of items}; e.g.: 'ikea-work-from-home-in-perfect-harmony__1364319311386-s4.jpg': ['desk','chair']
room_to_item_categories = read_pickle(DATASET_DIR + "room_to_items.p")

# Some simple preprossing
item_to_info = {key : value["type"] + " " +
                             value["desc"]
                       for key, value in item_property.items()} # remove view more info

room_to_items = {}

for item_url, room_url_list in item_to_rooms_map.items():
  item_id = item_url.split("/")[-1].split(".jpg")[0]
  if not os.path.exists(IMAGES_DIR + item_id + ".jpg"):
      print(item_url + " does not exist")
      continue

  for room_url in room_url_list:
    room_id = room_url.split("/")[-1].split(".jpg")[0]
    if room_id not in room_to_items:
      room_to_items[room_id] = [item_id]
    else:
      room_to_items[room_id].append(item_id)

images/890.333.75.jpg does not exist
images/991.333.98.jpg does not exist
images/990.612.97.jpg does not exist


#### Construct positive and negative pairs

For IR-style problem, seen and unseen can be tricky. We need to discuss whether unseen means "unseen pairs" or "unseen image or text"

In [140]:
all_positive_pairs = set()
for room, item_id_list in room_to_items.items():
  pairs_for_current_room = list(itertools.combinations(room_to_items[room], 2)) # n choose 2
  all_positive_pairs |= set(pairs_for_current_room)

all_products = sorted(set([x for pair in all_positive_pairs for x in pair]))
train_products, val_products = train_test_split(all_products, test_size=TRAIN_TEST_RATIO, random_state=72)

train_pairs, y_train = generate_product_limited_samples(train_products, all_positive_pairs, random_state=72)
val_pairs, y_val = generate_product_limited_samples(val_products, all_positive_pairs, random_state=72)
print(len(train_pairs), len(val_pairs))

21666 5810


In [141]:
# # To read the training and validation sets
# with open(ROOT_DIR + "train_data.pkl", "rb") as file:
#     train_pairs, y_train = pickle.load(file)
# with open(ROOT_DIR + "val_data.pkl", "rb") as file:
#     val_pairs, y_val = pickle.load(file)

#### Build PyTorch dataloader for train/val image/text

In [142]:
class FurnitureImagePairsDataset(Dataset):
    """Dataset containing pairs of furniture items."""

    def __init__(self, image_path, pairs, labels):
        """
        Args:
            image_path (string): Path to the directory containing images.
            pairs (list of tuples of strings): Pairs of image IDs to be used as training samples.
            labels (array of integers): Labels for the training samples.
        """
        super(FurnitureImagePairsDataset, self).__init__()
        self.image_ids = list(set(x for pair in pairs for x in pair))
        self.index_mapping = {image_id: i for i, image_id in enumerate(self.image_ids)}
        self.images = [preprocess_img(image_path + image_id + ".jpg") for image_id in tqdm.tqdm(self.image_ids)]
        self.pairs = pairs
        self.labels = labels

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if isinstance(idx, (list, tuple)):
            x1, x2, y = zip(*[self[i] for i in idx])
            return torch.stack(x1), torch.stack(x2), torch.from_numpy(np.array(y))
        pair = self.pairs[idx]
        return self.images[self.index_mapping[pair[0]]], self.images[self.index_mapping[pair[1]]], self.labels[idx]

In [143]:
X_train_image = FurnitureImagePairsDataset(IMAGES_DIR, train_pairs, y_train)
X_val_image = FurnitureImagePairsDataset(IMAGES_DIR, val_pairs, y_val)

100%|██████████| 1456/1456 [00:07<00:00, 190.23it/s]
100%|██████████| 718/718 [00:03<00:00, 189.83it/s]


In [144]:
train_premise_texts = [item_to_info[id] for id, _ in train_pairs]
train_hypothesis_texts = [item_to_info[id] for _, id in train_pairs]
tokenizer = Tokenizer(num_words=NUM_WORDS_TOKENIZER, lower=True)
tokenizer.fit_on_texts(train_premise_texts + train_hypothesis_texts)
WORD_INDEX = tokenizer.word_index
print('Found %s unique tokens.' % len(WORD_INDEX))
print('Max len:', MAX_SEQUENCE_LENGTH)
WORD2VEC_EMBEDDING_MATRIX = get_embedding_matrix(WORD_INDEX)

X_train_text_premise = tokenizer.texts_to_sequences(train_premise_texts)
X_train_text_premise = pad_sequences(X_train_text_premise, maxlen=MAX_SEQUENCE_LENGTH)

X_train_text_hypothesis = tokenizer.texts_to_sequences(train_hypothesis_texts)
X_train_text_hypothesis = pad_sequences(X_train_text_hypothesis, maxlen=MAX_SEQUENCE_LENGTH)

Found 2037 unique tokens.
Max len: 1950
Loaded 3000000 word vectors.
total embedded: 1904 common words


In [145]:
val_premise_texts = [item_to_info[id] for id, _ in val_pairs]
val_hypothesis_texts = [item_to_info[id] for _, id in val_pairs]

# Please notice that: tokenizer is ONLY used on training set to build vocab
X_val_text_premise = tokenizer.texts_to_sequences(val_premise_texts)
X_val_text_premise = pad_sequences(X_val_text_premise, maxlen=MAX_SEQUENCE_LENGTH)

X_val_text_hypothesis = tokenizer.texts_to_sequences(val_hypothesis_texts)
X_val_text_hypothesis = pad_sequences(X_val_text_hypothesis, maxlen=MAX_SEQUENCE_LENGTH)

In [147]:
img_train_data = X_train_image
text_train_data = TensorDataset(torch.from_numpy(X_train_text_premise), torch.from_numpy(X_train_text_hypothesis), torch.from_numpy(y_train))

img_val_data = X_val_image
text_val_data = TensorDataset(torch.from_numpy(X_val_text_premise), torch.from_numpy(X_val_text_hypothesis), torch.from_numpy(y_val))

text_train_loader = DataLoader(text_train_data, batch_size=BATCH_SIZE)
img_train_loader = DataLoader(img_train_data, batch_size=BATCH_SIZE)

text_val_loader = DataLoader(text_val_data, batch_size=BATCH_SIZE)
img_val_loader = DataLoader(img_val_data, batch_size=BATCH_SIZE)

print(len(text_train_loader), len(img_train_loader))
print(len(text_val_loader), len(img_val_loader))

1355 1355
364 364


In [221]:
for lstm in text_train_loader:
  print(type(lstm))
  lstm_inp1, lstm_inp2, lstm_labels = lstm
  print(type(lstm_inp1))
  print(type(lstm_inp2))
  print(type(lstm_labels))
  print(lstm_inp1)
  break

<class 'list'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
<class 'torch.Tensor'>
tensor([[   0,    0,    0,  ...,  572,  860,  636],
        [   0,    0,    0,  ..., 1347,    9,  372],
        [   0,    0,    0,  ...,  145,   45,   62],
        ...,
        [   0,    0,    0,  ...,   14,    3,  179],
        [   0,    0,    0,  ...,  309,    2, 1166],
        [   0,    0,    0,  ...,    8,   10,   51]], dtype=torch.int32)


# Define Model

In [202]:
class VAE(nn.Module):
  def __init__(self, img_channels=3, feature_dim=32*20*20, z_dim=256): # IKEA
    super(VAE, self).__init__()
    self.image_encoder_conv1 = nn.Conv2d(img_channels, 16, 5)
    self.image_encoder_conv2 = nn.Conv2d(16, 32, 5)
    self.image_encoder_fc1 = nn.Linear(feature_dim, z_dim // 2)
    self.image_encoder_fc2 = nn.Linear(feature_dim, z_dim // 2)
    self.text_encoder_fc1 = nn.Linear(MAX_SEQUENCE_LENGTH, 512)
    self.text_encoder_fc2 = nn.Linear(512, 512)
    self.text_encoder_fc3 = nn.Linear(512, z_dim // 2)
    self.text_encoder_fc4 = nn.Linear(512, z_dim // 2)
    self.decoder_fc = nn.Linear(z_dim, feature_dim)
    self.decoder_conv1 = nn.ConvTranspose2d(32, 16, 5)
    self.decoder_conv2 = nn.ConvTranspose2d(16, img_channels, 5)
      
  def image_encoder(self, x):
    x = F.relu(self.image_encoder_conv1(x))
    x = F.relu(self.image_encoder_conv2(x))
    x = x.view(-1, 32*20*20)
    mu = self.image_encoder_fc1(x)
    logVar = self.image_encoder_fc2(x)
    return mu, logVar
  
  def text_encoder(self, x):
    x = F.relu(self.text_encoder_fc1(x))
    x = F.relu(self.text_encoder_fc2(x))
    mu = F.relu(self.text_encoder_fc3(x))
    logVar = F.relu(self.text_encoder_fc4(x))
    return mu, logVar

  def reparameterize(self, mu, logVar):
    std = torch.exp(logVar / 2)
    eps = torch.randn_like(std)
    return mu + std * eps, std, eps
  
  def decoder(self, z):
    x = F.relu(self.decoder_fc(z))
    x = x.view(-1, 32, 20, 20)
    x = F.relu(self.decoder_conv1(x))
    x = torch.sigmoid(self.decoder_conv2(x))
    return x
  
  def forward(self, image, text):
    mu1, logVar1 = self.image_encoder(image)
    mu2, logVar2 = self.text_encoder(text)
    mu = torch.cat((mu1, mu2), dim = 1)
    logVar = torch.cat((logVar1, logVar2), dim = 1)
    z, std, eps = self.reparameterize(mu, logVar)
    out = self.decoder(z)
    return out, mu, std, eps, logVar

In [205]:
class VAE_Classifier(nn.Module):
  def __init__(self):
    super(VAE_Classifier, self).__init__()
    self.vae = VAE()
    self.fc1 = nn.Linear(256 * 2, 512)
    self.fc2 = nn.Linear(512, 128)
    self.output_fc = nn.Linear(128, 1)

  def forward(self, text_inp1, text_inp2, img_inp1, img_inp2):
    vae = torch.load('/content/drive/Othercomputers/My MacBook Pro/GitHub/DecorAssistant/multimodal_vae.pth').to(DEVICE)
    vae.eval()
    out1, mu1, std1, eps1, logVar1 = vae(img_inp1, text_inp1)
    out2, mu2, std2, eps2, logVar2 = vae(img_inp2, text_inp2)
    bottleneck1 = mu1 + std1 * eps1
    bottleneck2 = mu2 + std2 * eps2
    combined_bottleneck = torch.cat((bottleneck1, bottleneck2), 1)
    x_comb = F.relu(self.fc1(combined_bottleneck))
    x_comb = F.relu(self.fc2(x_comb))
    x = self.output_fc(x_comb)
    return x

In [222]:
print("Currently using device: {}\n".format(DEVICE))

model = VAE_Classifier()
model.to(DEVICE)
print("Model Architecture {}\n".format(model))

# lr = LEARNING_RATE
lr = 1e-4
criterion = nn.BCEWithLogitsLoss() # this means the sigmoid is INCORPORATED into the loss!!
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

print("Training Started...")
model.train()
best_loss = float('inf')
for i in range(EPOCHS):
  total_acc_train = 0
  total_loss_train = 0
    
  for lstm, cnn in tqdm.tqdm(zip(text_train_loader, img_train_loader), total=len(text_train_loader)):
    lstm_inp1, lstm_inp2, lstm_labels = lstm
    cnn_inp1, cnn_inp2, cnn_labels = cnn
    lstm_inp1, lstm_inp2, lstm_labels = lstm_inp1.float().to(DEVICE), lstm_inp2.float().to(DEVICE), lstm_labels.to(DEVICE)
    cnn_inp1, cnn_inp2, cnn_labels = cnn_inp1.to(DEVICE), cnn_inp2.to(DEVICE), cnn_labels.to(DEVICE)
    model.zero_grad()
    output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
    loss = criterion(torch.round(torch.sigmoid(output.squeeze())), lstm_labels.float())
    loss.backward()
    # nn.utils.clip_grad_norm_(model.parameters(), CLIP)
    optimizer.step()
    
    with torch.no_grad():
      acc = torch.abs(torch.round(torch.sigmoid(output.squeeze())) - lstm_labels.float()).view(-1)
      acc = (1. - acc.sum() / acc.size()[0])
      total_acc_train += acc
      total_loss_train += loss.item()
  
  train_acc = total_acc_train/len(text_train_loader)
  train_loss = total_loss_train/len(text_train_loader)
  model.eval()
  total_acc_val = 0
  total_loss_val = 0
  with torch.no_grad():
    for lstm, cnn in tqdm.tqdm(zip(text_val_loader, img_val_loader), total=len(text_val_loader)):
      lstm_inp1, lstm_inp2, lstm_labels = lstm
      cnn_inp1, cnn_inp2, cnn_labels = cnn
      lstm_inp1, lstm_inp2, lstm_labels = lstm_inp1.float().to(DEVICE), lstm_inp2.float().to(DEVICE), lstm_labels.to(DEVICE)
      cnn_inp1, cnn_inp2, cnn_labels = cnn_inp1.to(DEVICE), cnn_inp2.to(DEVICE), cnn_labels.to(DEVICE)
      model.zero_grad()
      output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
      val_loss = criterion(output.squeeze(), lstm_labels.float())
      if val_loss < best_loss:
       best_loss = val_loss
       torch.save(model, 'multimodal_vae_classifier.pth')
      acc = torch.abs(torch.round(torch.sigmoid(output.squeeze())) - lstm_labels.float()).view(-1)
      acc = (1. - acc.sum() / acc.size()[0])
      total_acc_val += acc
      total_loss_val += val_loss.item()
  val_acc = total_acc_val/len(text_val_loader)
  val_loss = total_loss_val/len(text_val_loader)
  print(f'Epoch {i+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
  model.train()
  torch.cuda.empty_cache()

Currently using device: cuda

Model Architecture VAE_Classifier(
  (vae): VAE(
    (image_encoder_conv1): Conv2d(3, 16, kernel_size=(5, 5), stride=(1, 1))
    (image_encoder_conv2): Conv2d(16, 32, kernel_size=(5, 5), stride=(1, 1))
    (image_encoder_fc1): Linear(in_features=12800, out_features=128, bias=True)
    (image_encoder_fc2): Linear(in_features=12800, out_features=128, bias=True)
    (text_encoder_fc1): Linear(in_features=1950, out_features=512, bias=True)
    (text_encoder_fc2): Linear(in_features=512, out_features=512, bias=True)
    (text_encoder_fc3): Linear(in_features=512, out_features=128, bias=True)
    (text_encoder_fc4): Linear(in_features=512, out_features=128, bias=True)
    (decoder_fc): Linear(in_features=256, out_features=12800, bias=True)
    (decoder_conv1): ConvTranspose2d(32, 16, kernel_size=(5, 5), stride=(1, 1))
    (decoder_conv2): ConvTranspose2d(16, 3, kernel_size=(5, 5), stride=(1, 1))
  )
  (fc1): Linear(in_features=512, out_features=512, bias=True)
 

  6%|▌         | 78/1355 [00:04<01:17, 16.43it/s]


KeyboardInterrupt: ignored

# Ranker

In [226]:
model = torch.load('/content/drive/Othercomputers/My MacBook Pro/GitHub/DecorAssistant/multimodal_vae_classifier.pth').to(DEVICE)

In [227]:
def single_pair_inference(premise_image_path, hypothesis_image_path, premise_text, hypothesis_text, model, tokenizer, threshold, do_plot=False):
  premise_sequence = tokenizer.texts_to_sequences([premise_text])
  premise_sequence = pad_sequences(premise_sequence, maxlen=MAX_SEQUENCE_LENGTH)
  hypothesis_sequence = tokenizer.texts_to_sequences([hypothesis_text])
  hypothesis_sequence = pad_sequences(hypothesis_sequence, maxlen=MAX_SEQUENCE_LENGTH)
  image_premise, image_hypothesis = preprocess_img(premise_image_path), preprocess_img(hypothesis_image_path)

  if do_plot:
    image_1 = Image.open(premise_image_path)
    image_2 = Image.open(hypothesis_image_path)
    fig = plt.figure(figsize=(15, 15))
    ax1 = fig.add_subplot(2,2,1)
    ax1.imshow(image_1)
    ax2 = fig.add_subplot(2,2,2)
    ax2.imshow(image_2)
    print("Left item description ------ {}".format(premise_text))
    print("Right item description ------  {}".format(hypothesis_text))


  image_premise = np.reshape(image_premise, (1, 3, 28, 28))
  image_hypothesis = np.reshape(image_hypothesis, (1, 3, 28, 28))

  img_data = TensorDataset(torch.from_numpy(image_premise), torch.from_numpy(image_hypothesis))
  text_data = TensorDataset(torch.from_numpy(premise_sequence), torch.from_numpy(hypothesis_sequence))
  
  text_loader = DataLoader(text_data, batch_size=1)
  img_loader = DataLoader(img_data, batch_size=1)

  for lstm, cnn in zip(text_loader, img_loader):
    lstm_inp1, lstm_inp2 = lstm
    cnn_inp1, cnn_inp2 = cnn
    lstm_inp1, lstm_inp2 = lstm_inp1.float().to(DEVICE), lstm_inp2.float().to(DEVICE)
    cnn_inp1, cnn_inp2 = cnn_inp1.to(DEVICE), cnn_inp2.to(DEVICE)
    model.zero_grad()
    output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
    print(output)

  score = output.squeeze().cpu().detach().numpy().tolist()
  if score > threshold:
    return "Positive", score
  else:
    return "Negative", 1 - score


In [228]:
for i in range(10):
  random_index = random.randint(0, len(train_pairs))
  image_id_1 = train_pairs[random_index][0]
  image_id_2 = train_pairs[random_index][1]
  text_1 = train_premise_texts[random_index]
  text_2 = train_hypothesis_texts[random_index]
  prediction, confidence = single_pair_inference(premise_image_path=IMAGES_DIR + image_id_1 + ".jpg",
            hypothesis_image_path=IMAGES_DIR + image_id_2 + ".jpg",
            premise_text=text_1,
            hypothesis_text=text_2,
            model=model,
            tokenizer=tokenizer,
            threshold=0.4,
            do_plot=False)
  print("Actual Label for this pair is", "Positive" if y_train[random_index] == 1 else "Negative")
  print("The prediction for this pair is", prediction, "with confidence", confidence)

tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Negative
The prediction for this pair is Negative with confidence 0.9990631838445552
tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Negative
The prediction for this pair is Negative with confidence 0.9990632036351599
tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Positive
The prediction for this pair is Negative with confidence 0.999063209281303
tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Positive
The prediction for this pair is Negative with confidence 0.9990631905966438
tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Negative
The prediction for this pair is Negative with confidence 0.9990631949622184
tensor([[0.0009]], device='cuda:0', grad_fn=<AddmmBackward0>)
Actual Label for this pair is Negative
The prediction 

In [229]:
def ranker(input_image_id, input_description, item_id_pool, model, tokenizer, threshold=0.3, top_n=5, do_plot=False):
  input_image_path = IMAGES_DIR + input_image_id + ".jpg"
  item_id_to_score = {}

  for item_id in item_id_pool:
    candidate_image_path = IMAGES_DIR + item_id + ".jpg"
    candidate_description = item_to_info[item_id]
    output_prediction, output_confidence = single_pair_inference(premise_image_path=input_image_path,
          hypothesis_image_path=candidate_image_path,
          premise_text=input_description,
          hypothesis_text=candidate_description,
          model=model,
          tokenizer=tokenizer,
          threshold=threshold,
          do_plot=False)
    if output_prediction == "Positive":
      item_id_to_score[item_id] = output_confidence
    else:
      continue
  return item_id_to_score

In [None]:
random_index = random.randint(0, len(train_pairs))
image_id = train_pairs[random_index][0]
input_description = item_to_info[image_id]
all_item_ids = list(set([x[0] for x in all_positive_pairs] + [x[1] for x in all_positive_pairs]))

ranker(image_id, input_description, all_item_ids, model, tokenizer)

# Ranking Evaluation

Build a dataset where a random sample of "premise" products is compared against all "hypothesis" products in the validation set.

In [231]:
# Build pairs to rank

NUM_QUERIES = 50

val_products = sorted(list(set(x for pair in val_pairs for x in pair)))

np.random.seed(1234)
premise_products = np.random.choice(val_products, size=min(NUM_QUERIES, len(val_products)), replace=False)
hypothesis_products = val_products

ranking_pairs = list(itertools.product(premise_products, hypothesis_products))
print(len(ranking_pairs), "pairs,", len(hypothesis_products), "products per query")

35900 pairs, 718 products per query


In [232]:
# Get the ground-truth

ground_truth_map = {}
for item_url, room_url_list in item_to_rooms_map.items():
    item_id = item_url.split("/")[-1].split(".jpg")[0]
    if item_id not in premise_products: continue

    for room_url in room_url_list:
        room_id = room_url.split("/")[-1].split(".jpg")[0]
        ground_truth_map[item_id] = ground_truth_map.get(item_id, set()) | set(room_to_items[room_id])
ground_truth_lists = [ground_truth_map[item_id] for item_id in premise_products]

# plt.hist([len(x) for x in ground_truth_lists], bins=np.arange(0, max(len(x) for x in ground_truth_lists), 5));

In [233]:
X_rank_image = FurnitureImagePairsDataset(IMAGES_DIR, ranking_pairs, np.zeros(len(ranking_pairs)))
X_rank_text_premise = ranking_pairs
X_rank_text_hypothesis = ranking_pairs
X_rank_text_premise.size()

def tokenize(text):
  try:
      return clip.tokenize(text)
  except:
      return clip.tokenize(' '.join(text.split()[:50]))

# X_rank_text_premise = torch.cat([tokenize(item_to_info[id]) for id, _ in ranking_pairs], 0)
# X_rank_text_hypothesis = torch.cat([tokenize(item_to_info[id]) for _, id in ranking_pairs], 0)
# X_rank_text_premise.size()

100%|██████████| 718/718 [00:03<00:00, 189.50it/s]


AttributeError: ignored

In [None]:
BATCH_SIZE = 32

img_ranking_data = X_rank_image # TensorDataset(torch.from_numpy(X_val_image_premise), torch.from_numpy(X_val_image_hypothesis), torch.from_numpy(y_val))
text_ranking_data = TensorDataset(X_rank_text_premise, X_rank_text_hypothesis, torch.zeros(len(ranking_pairs)))

text_ranking_loader = DataLoader(text_ranking_data, batch_size=BATCH_SIZE)
img_ranking_loader = DataLoader(img_ranking_data, batch_size=BATCH_SIZE)

print(len(text_ranking_loader), len(img_ranking_loader))

In [None]:
checkpoint_path = CHECKPOINT_DIR + "aws_image_embedding_one_layer_dual_lr_epoch_2.p"
state = torch.load(checkpoint_path, map_location=torch.device('cpu'))
full_model.load_state_dict(state)

In [None]:
model.eval()
ranking_results = []
with torch.no_grad():
    for lstm, cnn in tqdm.tqdm(zip(text_ranking_loader, img_ranking_loader), total=len(text_ranking_loader)):
        lstm_inp1, lstm_inp2, _ = lstm
        cnn_inp1, cnn_inp2, _ = cnn
        lstm_inp1, lstm_inp2 = lstm_inp1.to(device), lstm_inp2.to(device)
        cnn_inp1, cnn_inp2 = cnn_inp1.to(device), cnn_inp2.to(device)
        model.zero_grad()
        output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
        ranking_results.append(output.cpu().numpy())
ranking_results = np.concatenate(ranking_results).reshape(len(premise_products), len(hypothesis_products))
print(ranking_results.shape)

In [None]:
import math
class Evaluator:
    def __init__(self, GroundTruth):
      self.GroundTruth = GroundTruth

    def NDCG_Eval(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      DCGScore = 0
      result = []
      for i, item in enumerate(sortedRankResult[:topk]):
        if item[0] in self.GroundTruth:
          result.append((item, i))
      DCGScore = sum([item[0][1]/math.log(item[1]+2, 2) for item in result])
      IDCGScore = sum([1/math.log(i+2,2) for i in range(topk)])
      NDCG = DCGScore / IDCGScore

      return NDCG
    
    def Score_Eval(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      return sum(i[1] for i in sortedRankResult[:topk] if i[0] in self.GroundTruth) / topk
    
    def Precision(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      topkresult = sortedRankResult[:topk]
      return len([i for i in sortedRankResult[:topk] if i[0] in self.GroundTruth]) / len(topkresult)

    def Recall(self, rankresult, topk):
      sortedRankResult = sorted(rankresult.items(), key = lambda x:x[1], reverse=True)
      topkresult = sortedRankResult[:topk]
      return len([i for i in sortedRankResult[:topk] if i[0] in self.GroundTruth]) / len(self.GroundTruth)
    
    def FValue(self, rankresult, topk):
      precision = self.Precision(rankresult, topk)
      recall = self.Recall(rankresult, topk)
      return 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

In [None]:
# Now use the evaluator
TOP_K = 10

ndcg = np.zeros(len(premise_products))
score = np.zeros(len(premise_products))
precision = np.zeros(len(premise_products))
recall = np.zeros(len(premise_products))
fvalue = np.zeros(len(premise_products))
for i, (ground_truth, rankings) in enumerate(zip(ground_truth_lists, ranking_results)):
    evaluator = Evaluator(ground_truth)
    rankings = {product: output for product, output in zip(hypothesis_products, rankings)}
    ndcg[i] = evaluator.NDCG_Eval(rankings, TOP_K)
    score[i] = evaluator.Score_Eval(rankings, TOP_K)
    precision[i] = evaluator.Precision(rankings, TOP_K)
    recall[i] = evaluator.Recall(rankings, TOP_K)
    fvalue[i] = evaluator.FValue(rankings, TOP_K)
print("NDCG: {:.4f} (95% CI {:.3f}-{:.3f})".format(ndcg.mean(), ndcg.mean() - 1.96 * ndcg.std(), ndcg.mean() + 1.96 * ndcg.std()))
print("Score: {:.4f} (95% CI {:.3f}-{:.3f})".format(score.mean(), score.mean() - 1.96 * score.std(), score.mean() + 1.96 * score.std()))
print("Precision: {:.4f} (95% CI {:.3f}-{:.3f})".format(precision.mean(), precision.mean() - 1.96 * precision.std(), precision.mean() + 1.96 * precision.std()))
print("Recall: {:.4f} (95% CI {:.3f}-{:.3f})".format(recall.mean(), recall.mean() - 1.96 * recall.std(), recall.mean() + 1.96 * recall.std()))
print("FValue: {:.4f} (95% CI {:.3f}-{:.3f})".format(fvalue.mean(), fvalue.mean() - 1.96 * fvalue.std(), fvalue.mean() + 1.96 * fvalue.std()))

with open(os.path.splitext(checkpoint_path)[0] + "_rankings_{}_queries.pkl".format(len(premise_products)), "wb") as file:
    pickle.dump({
        "ndcg": ndcg,
        "score": score,
        "precision": precision,
        "recall": recall,
        "fvalue": fvalue
    }, file)