In [71]:
# !wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"
# !gunzip GoogleNews-vectors-negative300.bin.gz

--2021-10-20 19:28:21--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.170.88
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.170.88|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1647046227 (1.5G) [application/x-gzip]
Saving to: ‘GoogleNews-vectors-negative300.bin.gz’


2021-10-20 19:28:43 (74.1 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]



In [112]:
import cv2
import numpy as np
import pickle
import os
import itertools

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import KeyedVectors

In [113]:
# Global Path Vairables
ROOT_DIR =  "drive/MyDrive/DecorAssist/"
DATASET_DIR = ROOT_DIR + "IKEA/text_data/"
IMAGES_DIR = ROOT_DIR + "IKEA/images/all_items/"

# Global Parameter Variables
MAX_SEQUENCE_LENGTH = 100
NUM_WORDS_TOKENIZER = 50000
EMBEDDING_DIM = 300
BATCH_SIZE = 16

In [114]:
def preprocess_img(path):
  img = cv2.imread(path)
  img = cv2.resize(img, (75, 115))
  img = img.astype(np.float32) / 255
  return img

def read_pickle(fn):
	with open(fn, "rb") as f:
		return pickle.load(f)

### Data Loading

In [115]:
# {room image url -> string of room category}; e.g.: 'ikea-town-and-country__1364308377063-s4.jpg': 'Living Room'
room_categories = read_pickle(DATASET_DIR + "categories_dict.p")
# {item image ID -> string of item category}; e.g.: '291.292.29': 'Footstool',
item_categories = read_pickle(DATASET_DIR + "categories_images_dict.p")
# {item image id -> dict of descriptions}; e.g. '202.049.06': {'color': 'Grey,black','desc': 'View more product information Concealed press studs keep the quilt in place','img': 'images/objects/202.049.06.jpg','name': 'GURLI','size': '120x180 cm','type': 'Throw'},
item_property = read_pickle(DATASET_DIR + "products_dict.p")
# {item image url -> {description, name}}; e.g: '/static/images/902.592.50.jpg': {'desc': 'The high pile dampens sound and provides a soft surface to walk on.','name': 'GSER'},
item_to_description = read_pickle(DATASET_DIR + "img_to_desc.p")
# {item image url -> list of corresponding room image url}; e.g.: 'images/001.509.85.jpg': ['images/room_scenes/ikea-wake-up-and-grow__1364335362013-s4.jpg','images/room_scenes/ikea-wake-up-and-grow-1364335370196.jpg'],
item_to_rooms_map = read_pickle(DATASET_DIR + "item_to_room.p")
# {room image url -> list of items}; e.g.: 'ikea-work-from-home-in-perfect-harmony__1364319311386-s4.jpg': ['desk','chair']
room_to_item_categories = read_pickle(DATASET_DIR + "room_to_items.p")

# Some simple preprossing
item_to_info = {key : value["type"] + " " +
                             value["desc"]
                       for key, value in item_property.items()}

room_to_items = {}

for item_url, room_url_list in item_to_rooms_map.items():
  item_id = item_url.split("/")[-1].split(".jpg")[0]

  for room_url in room_url_list:
    room_id = room_url.split("/")[-1].split(".jpg")[0]
    if room_id not in room_to_items:
      room_to_items[room_id] = []
    else:
      room_to_items[room_id].append(item_id)

all_positive_pairs = []
for room, item_id_list in room_to_items.items():
  pairs_for_current_room = list(itertools.combinations(room_to_items[room], 2))
  all_positive_pairs += pairs_for_current_room


train_pairs = all_positive_pairs[500:650]
val_pairs = train_pairs

In [133]:
image_premise_id_list = [x[0] for x in train_pairs]
image_hypothesis_id_list = [x[1] for x in train_pairs]
X_image_premise = np.array(list(map(lambda image_id: preprocess_img(IMAGES_DIR + image_id + ".jpg"), image_premise_id_list)))
X_image_hypothesis = np.array(list(map(lambda image_id: preprocess_img(IMAGES_DIR + image_id + ".jpg"), image_hypothesis_id_list)))
X_image_premise = np.reshape(X_image_premise, (X_image_premise.shape[0], 3, 75, 115))
X_image_hypothesis = np.reshape(X_image_hypothesis, (X_image_hypothesis.shape[0], 3, 75, 115))

In [117]:
y = np.array([np.array([0, 1]) for _ in range(len(train_pairs))])

In [118]:
premise_texts = [item_to_info[id] for id in image_premise_id_list]
hypothesis_texts = [item_to_info[id] for id in image_hypothesis_id_list]
tokenizer = Tokenizer(num_words=NUM_WORDS_TOKENIZER, lower=True)
tokenizer.fit_on_texts(premise_texts + hypothesis_texts)
WORD_INDEX = tokenizer.word_index
print('Found %s unique tokens.' % len(WORD_INDEX))
print('Max len:', MAX_SEQUENCE_LENGTH)

X_text_premise = tokenizer.texts_to_sequences(premise_texts)
X_text_premise = pad_sequences(X_text_premise, maxlen=MAX_SEQUENCE_LENGTH)

X_text_hypothesis = tokenizer.texts_to_sequences(hypothesis_texts)
X_text_hypothesis = pad_sequences(X_text_hypothesis, maxlen=MAX_SEQUENCE_LENGTH)

Found 217 unique tokens.
Max len: 100


In [134]:
img_train_data = TensorDataset(torch.from_numpy(X_image_premise), torch.from_numpy(X_image_hypothesis), torch.from_numpy(y))
text_train_data = TensorDataset(torch.from_numpy(X_text_premise), torch.from_numpy(X_text_hypothesis), torch.from_numpy(y))

img_val_data = img_train_data
text_val_data = text_train_data

text_train_loader = DataLoader(text_train_data, batch_size=BATCH_SIZE)
img_train_loader = DataLoader(img_train_data, batch_size=BATCH_SIZE)

text_val_loader = DataLoader(text_val_data, batch_size=BATCH_SIZE)
img_val_loader = DataLoader(img_val_data, batch_size=BATCH_SIZE)

print(len(text_train_loader), len(img_train_loader))
print(len(text_val_loader), len(img_val_loader))



10 10
10 10


In [120]:
def get_embedding_matrix(word_index):
  word2vecDict = KeyedVectors.load_word2vec_format("/content/GoogleNews-vectors-negative300.bin", binary=True)
  embed_size = 300
  embeddings_index = dict()
  for word in word2vecDict.wv.vocab:
    embeddings_index[word] = word2vecDict.word_vec(word)
  print("Loaded " + str(len(embeddings_index)) + " word vectors.")
        
  embedding_matrix = 1 * np.random.randn(len(word_index)+1, embed_size)

  embeddedCount = 0
  for word, i in word_index.items():
    i-=1
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: 
      embedding_matrix[i] = embedding_vector
      embeddedCount+=1
  print("total embedded:", embeddedCount, "common words")
  del(embeddings_index)
  return embedding_matrix

In [121]:
word2vec_embedding_matrix = get_embedding_matrix(WORD_INDEX)

  """


Loaded 3000000 word vectors.
total embedded: 204 common words


# Define Model

In [135]:
class CNN_LSTM(nn.Module):
  def __init__(self, vocab_size, weights_matrix, n_hidden, n_layers, n_out):
    super(CNN_LSTM, self).__init__()

    # LSTM for the text overview
    self.vocab_size, self.n_hidden, self.n_out, self.n_layers = vocab_size, n_hidden, n_out, n_layers
    num_embeddings, embedding_dim = weights_matrix.shape[0], weights_matrix.shape[1]
    self.emb = nn.Embedding(num_embeddings, embedding_dim)
    self.emb.weight.data.copy_(torch.from_numpy(weights_matrix))
    self.emb.weight.requires_grad = True
    self.lstm = nn.LSTM(embedding_dim, self.n_hidden, self.n_layers, dropout=0.2, batch_first=True)
    self.dropout = nn.Dropout(0.1)
    self.lstm_fc = nn.Linear(self.n_hidden, 128)
    # self.sigmoid = nn.Sigmoid()

    # CNN for the posters
    self.conv1 = nn.Conv2d(3, 32, 3)
    self.max_pool1 = nn.MaxPool2d(2)
    self.conv2 = nn.Conv2d(32, 64, 3)
    self.max_pool2 = nn.MaxPool2d(2)
    self.conv3 = nn.Conv2d(64, 128, 3)
    self.max_pool3 = nn.MaxPool2d(2)
    self.conv4 = nn.Conv2d(128, 128, 3)
    self.max_pool4 = nn.MaxPool2d(2)
    self.cnn_dropout = nn.Dropout(0.1)
    self.cnn_fc = nn.Linear(5*2*128, 512)

    # Concat layer for the combined feature space
    # self.combined_fc1 = nn.Linear(640, 256)
    self.combined_fc1 = nn.Linear(640*2, 256)
    self.combined_fc2 = nn.Linear(256, 128)
    self.output_fc = nn.Linear(128, n_out)

  def forward(self, lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2):
      # LSTM Forward
      batch_size = lstm_inp1.size(0)
      hidden = self.init_hidden(batch_size)

      lstm_inp1 = lstm_inp1.long()
      lstm_inp2 = lstm_inp2.long()
      embeds1 = self.emb(lstm_inp1)
      embeds2 = self.emb(lstm_inp2)
      lstm_out1, hidden = self.lstm(embeds1, hidden)
      lstm_out1 = self.dropout(lstm_out1[:, -1])
      lstm_out1 = F.relu(self.lstm_fc(lstm_out1))

      lstm_out2, hidden = self.lstm(embeds2, hidden)
      lstm_out2 = self.dropout(lstm_out2[:, -1])
      lstm_out2 = F.relu(self.lstm_fc(lstm_out2))

      # CNN Forward
      x1 = F.relu(self.conv1(cnn_inp1))
      x1 = self.max_pool1(x1)
      x1 = F.relu(self.conv2(x1))
      x1 = self.max_pool2(x1)
      x1 = F.relu(self.conv3(x1))
      x1 = self.max_pool3(x1)
      x1 = F.relu(self.conv4(x1))
      x1 = self.max_pool4(x1)
      x1 = x1.view(-1, 5*2*128)
      x1 = self.cnn_dropout(x1)
      cnn_out1 = F.relu(self.cnn_fc(x1))

      x2 = F.relu(self.conv1(cnn_inp2))
      x2 = self.max_pool1(x2)
      x2 = F.relu(self.conv2(x2))
      x2 = self.max_pool2(x2)
      x2 = F.relu(self.conv3(x2))
      x2 = self.max_pool3(x2)
      x2 = F.relu(self.conv4(x2))
      x2 = self.max_pool4(x2)
      x2 = x2.view(-1, 5*2*128)
      x2 = self.cnn_dropout(x2)
      cnn_out2 = F.relu(self.cnn_fc(x2))

      combined_inp = torch.cat((cnn_out1, cnn_out2, lstm_out1, lstm_out2), 1)
      x_comb = F.relu(self.combined_fc1(combined_inp))
      x_comb = F.relu(self.combined_fc2(x_comb))
      out = torch.sigmoid(self.output_fc(x_comb))

      return out

  def init_hidden(self, batch_size):
    weight = next(self.parameters()).data
    hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
    return hidden

In [136]:
vocab_size = len(WORD_INDEX)+1
output_size = y.shape[1]
embedding_dim = 300
hidden_dim = 64
n_layers = 2
print(output_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

model = CNN_LSTM(vocab_size, word2vec_embedding_matrix, hidden_dim, n_layers, output_size)
model.to(device)
print(model)

lr=0.001
# criterion = nn.MultiLabelSoftMarginLoss()
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)

2
cpu
CNN_LSTM(
  (emb): Embedding(218, 300)
  (lstm): LSTM(300, 64, num_layers=2, batch_first=True, dropout=0.2)
  (dropout): Dropout(p=0.1, inplace=False)
  (lstm_fc): Linear(in_features=64, out_features=128, bias=True)
  (conv1): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1))
  (max_pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1))
  (max_pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1))
  (max_pool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1))
  (max_pool4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (cnn_dropout): Dropout(p=0.1, inplace=False)
  (cnn_fc): Linear(in_features=1280, out_features=512, bias=True)
  (combined_fc1): Linear(in_features=1280, out_features=256

In [138]:
epochs = 1
clip = 5

model.train()
for i in range(epochs):
  total_acc_train = 0
  total_loss_train = 0
    
  for lstm, cnn in zip(text_train_loader, img_train_loader):
    lstm_inp1, lstm_inp2, lstm_labels = lstm
    cnn_inp1, cnn_inp2, cnn_labels = cnn
    lstm_inp1, lstm_inp2, lstm_labels = lstm_inp1.to(device), lstm_inp2.to(device), lstm_labels.to(device)
    cnn_inp1, cnn_inp2, cnn_labels = cnn_inp1.to(device), cnn_inp2.to(device), cnn_labels.to(device)
    model.zero_grad()
    output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
    loss = criterion(output.squeeze(), lstm_labels.float())
    loss.backward()
    nn.utils.clip_grad_norm_(model.parameters(), clip)
    optimizer.step()
    
    with torch.no_grad():
      acc = torch.abs(output.squeeze() - lstm_labels.float()).view(-1)
      acc = (1. - acc.sum() / acc.size()[0])
      total_acc_train += acc
      total_loss_train += loss.item()
  
  train_acc = total_acc_train/len(text_train_loader)
  train_loss = total_loss_train/len(text_train_loader)
  model.eval()
  total_acc_val = 0
  total_loss_val = 0
  with torch.no_grad():
    for lstm, cnn in zip(text_val_loader, img_val_loader):
      lstm_inp1, lstm_inp2, lstm_labels = lstm
      cnn_inp1, cnn_inp2, cnn_labels = cnn
      lstm_inp1, lstm_inp2, lstm_labels = lstm_inp1.to(device), lstm_inp2.to(device), lstm_labels.to(device)
      cnn_inp1, cnn_inp2, cnn_labels = cnn_inp1.to(device), cnn_inp2.to(device), cnn_labels.to(device)
      model.zero_grad()
      output = model(lstm_inp1, lstm_inp2, cnn_inp1, cnn_inp2)
      val_loss = criterion(output.squeeze(), lstm_labels.float())
      acc = torch.abs(output.squeeze() - lstm_labels.float()).view(-1)
      acc = (1. - acc.sum() / acc.size()[0])
      total_acc_val += acc
      total_loss_val += val_loss.item()
  val_acc = total_acc_val/len(text_val_loader)
  val_loss = total_loss_val/len(text_val_loader)
  print(f'Epoch {i+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
  model.train()
  torch.cuda.empty_cache()

Epoch 1: train_loss: 0.0000 train_acc: 1.0000 | val_loss: 0.0000 val_acc: 1.0000
