In [3]:
import json
import re
from transformers import BertModel
from transformers import BertTokenizer
import torch
from torchvision import transforms
from PIL import Image
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
from collections import OrderedDict, Counter
import math
from numpy import asarray
from numpy import save
from numpy import load

In [4]:
with open('/001/usuarios/isaac.bribiesca/mmimdb/split.json') as json_data:
    data_dict_raw = json.load(json_data)

In [5]:
def resize_and_crop_image(input_file, output_box=[224, 224], fit=True):
        # https://github.com/BVLC/caffe/blob/master/tools/extra/resize_and_crop_images.py
        '''Downsample the image.
        '''
        img = Image.open(input_file)
        #img.save("orig_"+input_file.split('/')[-1])
        box = output_box
        # preresize image with factor 2, 4, 8 and fast algorithm
        factor = 1
        while img.size[0] / factor > 2 * box[0] and img.size[1] * 2 / factor > 2 * box[1]:
            factor *= 2
        if factor > 1:
            img.thumbnail(
                (img.size[0] / factor, img.size[1] / factor), Image.NEAREST)

        # calculate the cropping box and get the cropped part
        if fit:
            x1 = y1 = 0
            x2, y2 = img.size
            wRatio = 1.0 * x2 / box[0]
            hRatio = 1.0 * y2 / box[1]
            if hRatio > wRatio:
                y1 = int(y2 / 2 - box[1] * wRatio / 2)
                y2 = int(y2 / 2 + box[1] * wRatio / 2)
            else:
                x1 = int(x2 / 2 - box[0] * hRatio / 2)
                x2 = int(x2 / 2 + box[0] * hRatio / 2)
            img = img.crop((x1, y1, x2, y2))

        # Resize the image with best quality algorithm ANTI-ALIAS
        img = img.resize(box, Image.ANTIALIAS).convert('RGB')
        #img = numpy.asarray(img, dtype='float32')
        return img

In [6]:
def get_image_feature(feature_extractor, image):
    with torch.no_grad():
        feature_images = feature_extractor.features(image)
        feature_images = feature_extractor.avgpool(feature_images)
        feature_images = torch.flatten(feature_images, 1)
        feature_images = feature_extractor.classifier[0](feature_images)
    
    return feature_images

In [7]:
preprocess = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [8]:
def normalizeText(text):
    #text = text.lower()
    text = re.sub(r'<br />', r' ', text).strip()
    text = re.sub(r'^https?:\/\/.*[\r\n]*', ' L ', text, flags=re.MULTILINE)
    text = re.sub(r'[\~\*\+\^`_#\[\]|]', r' ', text).strip()
    text = re.sub(r'[0-9]+', r' N ', text).strip()
    text = re.sub(r'([/\'\-\.?!\(\)",:;])', r' \1 ', text).strip()
    return text.split()

In [9]:
with open('../mmimdb/dataset/0918557.json') as json_data:
    movie = json.load(json_data)

In [11]:
movie['plot']

['When Sarah Cain, a self-involved big-city newspaper columnist, travels to Pennsylvania for the funeral of her Amish sister, she soon discovers that she is the legal guardian of her five Amish nieces and nephews. Rather than choose to move to Lancaster County to finish raising them there herself, or let them be separated by the foster care system, Sarah decides to take them with her back to Portland where she believes she can make a new life for them. However, she soon realizes that the modern world has forced them to compromise who they are, and that she has moved them there for all the wrong reasons - a motive which is soon exposed - because secrets can really never be kept secret. In order to find her own redemption, she knows she must make a choice to give them back their lives in Amish Country. And whether she remains part of their lives will have a lot to do with how much she has grown to love them.']

In [12]:
text = " ".join(normalizeText(movie['plot'][0]))
text

'When Sarah Cain , a self - involved big - city newspaper columnist , travels to Pennsylvania for the funeral of her Amish sister , she soon discovers that she is the legal guardian of her five Amish nieces and nephews . Rather than choose to move to Lancaster County to finish raising them there herself , or let them be separated by the foster care system , Sarah decides to take them with her back to Portland where she believes she can make a new life for them . However , she soon realizes that the modern world has forced them to compromise who they are , and that she has moved them there for all the wrong reasons - a motive which is soon exposed - because secrets can really never be kept secret . In order to find her own redemption , she knows she must make a choice to give them back their lives in Amish Country . And whether she remains part of their lives will have a lot to do with how much she has grown to love them .'

In [60]:
len(text.split())

186

In [14]:
bert = BertModel.from_pretrained('bert-base-cased')
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [65]:
text_encoded = tokenizer.encode_plus(
                [text, "Hello world"],
                add_special_tokens=True,
                max_length = 50,
                return_token_type_ids=False,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
text_encoded['input_ids'].shape

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([1, 50])

In [66]:
text_encoded

{'input_ids': tensor([[101, 100, 100, 102,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0]])}

In [67]:
bert_vects, _ = bert(
                input_ids=text_encoded['input_ids'],
                attention_mask=text_encoded['attention_mask']
            )

In [71]:
bert_vects[0][4:6]

tensor([[-0.2520, -0.1188, -0.1053,  ...,  0.7385,  0.7015,  0.6301],
        [-0.2502, -0.1201, -0.0856,  ...,  0.7117,  0.6825,  0.6160]],
       grad_fn=<SliceBackward>)

In [17]:
txt_feature = bert_vects.squeeze(0).mean(0)

In [18]:
txt_feature.size()

torch.Size([768])

In [19]:
feature_extractor = torch.hub.load('pytorch/vision:v0.6.0', 'vgg16', pretrained=True)

Using cache found in /001/usuarios/isaac.bribiesca/.cache/torch/hub/pytorch_vision_v0.6.0


In [20]:
img = resize_and_crop_image("/001/usuarios/isaac.bribiesca/mmimdb/dataset/0918557.jpeg", (256,256))
img = preprocess(img)
img = img.unsqueeze(0)
img.shape

torch.Size([1, 3, 256, 256])

In [21]:
feature = get_image_feature(feature_extractor, img)
img_feature = feature.squeeze(0)

In [22]:
img_feature.size()

torch.Size([4096])

In [66]:
combined_features = {'txt': txt_feature, 'img': img_feature}

In [67]:
torch.save(combined_features, 'my_tensor')

In [68]:
loaded = torch.load('my_tensor')

In [70]:
loaded['txt'].size()

torch.Size([768])

In [None]:
loaded['img'].size()

In [81]:
def extract_embedding(txt):
    text_encoded = tokenizer.encode_plus(
                txt,
                add_special_tokens=True,
                max_length=200,
                return_token_type_ids=False,
                pad_to_max_length=True,
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
    
    bert_vects, _ = bert(
                input_ids=text_encoded['input_ids'],
                attention_mask=text_encoded['attention_mask']
            )
    
    return bert_vects.squeeze(0).detach()

In [82]:
extract_embedding(text).requires_grad

False

In [45]:
def extract_visual(img_name):
    img = resize_and_crop_image(f"/001/usuarios/isaac.bribiesca/mmimdb/dataset/{img_name}.jpeg", (256,256))
    img = preprocess(img)
    img = img.unsqueeze(0)
    feature = get_image_feature(feature_extractor, img)
    return feature

In [83]:
extract_visual("0918557").requires_grad

False

Load Genres

In [30]:
with open('list.txt', 'r') as f:
    files = f.read().splitlines()

In [47]:
genres = []
ids = []
txt_f = []

for i, file in enumerate(files):

    with open(file) as f:
        data = json.load(f)
        index = file.split('/')[-1].split('.')[0]
        ids.append(index)
        genres.append(data['genres'])
        txt_f.append(data['plot'])

    if i%500==0:
        print("reading {0} ...".format(i))

reading 0 ...
reading 500 ...
reading 1000 ...
reading 1500 ...
reading 2000 ...
reading 2500 ...
reading 3000 ...
reading 3500 ...
reading 4000 ...
reading 4500 ...
reading 5000 ...
reading 5500 ...
reading 6000 ...
reading 6500 ...
reading 7000 ...
reading 7500 ...
reading 8000 ...
reading 8500 ...
reading 9000 ...
reading 9500 ...
reading 10000 ...
reading 10500 ...
reading 11000 ...
reading 11500 ...
reading 12000 ...
reading 12500 ...
reading 13000 ...
reading 13500 ...
reading 14000 ...
reading 14500 ...
reading 15000 ...
reading 15500 ...
reading 16000 ...
reading 16500 ...
reading 17000 ...
reading 17500 ...
reading 18000 ...
reading 18500 ...
reading 19000 ...
reading 19500 ...
reading 20000 ...
reading 20500 ...
reading 21000 ...
reading 21500 ...
reading 22000 ...
reading 22500 ...
reading 23000 ...
reading 23500 ...
reading 24000 ...
reading 24500 ...
reading 25000 ...
reading 25500 ...


In [55]:
for idx in ids:
    if 't' in idx:
        print(idx)

In [33]:
n_classes = 23
counts = OrderedDict(
    Counter([g for m in genres for g in m]).most_common())
target_names = list(counts.keys())[:n_classes]

In [34]:
le = MultiLabelBinarizer()
Y = le.fit_transform([m for m in genres])
labels = np.nonzero(le.transform([[t] for t in target_names]))[1]

In [35]:
B = np.copy(Y)
rng = np.random.RandomState([2014, 8, 6])
train_idx, dev_idx, test_idx = [], [], []
test_size = 0.3
dev_size = 0.1
for l in labels[::-1]:
    t = B[:, l].nonzero()[0]
    t = rng.permutation(t)
    n_test = int(math.ceil(len(t) * test_size))
    n_dev = int(math.ceil(len(t) * dev_size))
    n_train = len(t) - n_test - n_dev
    test_idx.extend(t[:n_test])
    dev_idx.extend(t[n_test:n_test + n_dev])
    train_idx.extend(t[n_test + n_dev:])
    B[t, :] = 0

In [36]:
indices = np.concatenate([train_idx, dev_idx, test_idx])
nsamples = len(indices)
nsamples_train, nsamples_dev, nsamples_test = len(
    train_idx), len(dev_idx), len(test_idx)

In [37]:
train_idx = [int(i) for i in train_idx]
dev_idx = [int(i) for i in dev_idx]
test_idx = [int(i) for i in test_idx]

In [79]:
list(np.array(ids)[train_idx])

['0028219',
 '0044863',
 '0037795',
 '0042046',
 '0047542',
 '0044314',
 '0046791',
 '0037884',
 '0049949',
 '0049006',
 '0048724',
 '0040202',
 '0037932',
 '0047976',
 '0043090',
 '0038458',
 '0045566',
 '0042669',
 '0043465',
 '0040506',
 '0037055',
 '0028096',
 '0048488',
 '0022660',
 '0044202',
 '0033717',
 '0040798',
 '0030996',
 '0042994',
 '0023268',
 '0029870',
 '0036711',
 '0041368',
 '0043660',
 '0047424',
 '0040607',
 '0044357',
 '0049552',
 '0041958',
 '0045191',
 '0045932',
 '0045205',
 '0034732',
 '0038453',
 '0029808',
 '0042788',
 '0038991',
 '0039545',
 '0040444',
 '0047878',
 '0043292',
 '0040525',
 '0040257',
 '0048182',
 '0034922',
 '0038774',
 '0029217',
 '0037415',
 '0038369',
 '0041268',
 '0042206',
 '0051207',
 '0038965',
 '0037075',
 '0043132',
 '0029511',
 '0038360',
 '0042379',
 '0038429',
 '0032636',
 '0052311',
 '0041187',
 '0042397',
 '0041373',
 '0039661',
 '0038559',
 '0041207',
 '0047679',
 '0037638',
 '0041178',
 '0039211',
 '0038455',
 '0042832',
 '00

In [51]:
dict_idxs = {'train': list(np.array(ids)[train_idx]), 'dev': list(np.array(ids)[dev_idx]), 'test': list(np.array(ids)[test_idx])}

In [80]:
with open('../mmimdb/data_bert_200/partition.json', 'w') as fp:
    json.dump(dict_idxs, fp)

In [None]:
train_data = {'txt': torch.tensor(txt_f[train_idx]), 'img': torch.tensor(img_f_f[train_idx]), 'labels': torch.tensor(Y[train_idx][:,n_classes])}

In [None]:
torch.save({'txt': torch.tensor(txt_f[train_idx]), 'img': torch.tensor(img_f_f[train_idx]), 'labels': torch.tensor(Y[train_idx][:,n_classes])}, 'train_data')

In [84]:
for i, idx in enumerate(ids):
    txt_tensor = extract_embedding(txt_f[i])
    img_tensor = extract_visual(idx)
    
    example = {'txt': txt_tensor, 'img': img_tensor, 'labels': torch.tensor(Y[i,:n_classes])}
    torch.save(example, f'../mmimdb/data_bert_200/features/{idx}.pt')
    
    if i%300==0:
        print(f"Saving {i} ...")

Saving 0 ...
Saving 300 ...
Saving 600 ...
Saving 900 ...
Saving 1200 ...
Saving 1500 ...
Saving 1800 ...
Saving 2100 ...
Saving 2400 ...
Saving 2700 ...
Saving 3000 ...
Saving 3300 ...
Saving 3600 ...
Saving 3900 ...
Saving 4200 ...
Saving 4500 ...
Saving 4800 ...
Saving 5100 ...
Saving 5400 ...
Saving 5700 ...
Saving 6000 ...
Saving 6300 ...
Saving 6600 ...
Saving 6900 ...
Saving 7200 ...
Saving 7500 ...
Saving 7800 ...
Saving 8100 ...
Saving 8400 ...
Saving 8700 ...
Saving 9000 ...
Saving 9300 ...
Saving 9600 ...
Saving 9900 ...
Saving 10200 ...
Saving 10500 ...
Saving 10800 ...
Saving 11100 ...
Saving 11400 ...
Saving 11700 ...
Saving 12000 ...
Saving 12300 ...
Saving 12600 ...
Saving 12900 ...
Saving 13200 ...
Saving 13500 ...
Saving 13800 ...
Saving 14100 ...
Saving 14400 ...
Saving 14700 ...
Saving 15000 ...
Saving 15300 ...
Saving 15600 ...
Saving 15900 ...
Saving 16200 ...
Saving 16500 ...
Saving 16800 ...
Saving 17100 ...
Saving 17400 ...
Saving 17700 ...
Saving 18000 ...
Sav