In [None]:
!pip install madgrad
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

In [None]:
import os
import json
import clip
import copy
import torch
import random
import numpy as np
import pandas as pd
import transformers
import torch.nn as nn

from PIL import Image
from madgrad import MADGRAD
from ast import literal_eval
from tqdm.notebook import tqdm
from collections import Counter
from sklearn import preprocessing
from matplotlib import pyplot as plt
from torch.utils.data import Dataset
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import (
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    MMBTConfig,
    MMBTModel,
    MMBTForClassification,
    get_linear_schedule_with_warmup,
)

In [2]:
# Walk <path>/<group>/<movie id>/<movie id>.json and collect the fields used
# downstream. The four lists must stay aligned index-for-index because they
# are later zipped into DataFrame columns.
path = '../input/48k-imdb-movies-data/Data'
image_ids, names, genres, descriptions = [], [], [], []
all_directories = os.listdir(path)
for directory in all_directories:
    directories = os.listdir(os.path.join(path, directory))
    for dir_ in directories:
        file_path = os.path.join(path, directory, dir_, dir_ + '.json')
        with open(file_path, encoding='utf-8') as file:
            movie = json.load(file)
        try:
            # Look up every required key BEFORE appending anything: if a later
            # key is missing, a partial append would leave the lists with
            # different lengths and misalign every following record.
            description = movie['description']
            genre = movie['genre']
            name = movie['name']
        except KeyError:
            # Records missing any of the three fields are skipped entirely.
            continue
        descriptions.append(description)
        image_ids.append(dir_)
        names.append(name)
        genres.append(genre)

In [3]:
# Build the expected poster location for every movie directory:
# <path>/<group>/<movie id>/<movie id>.jpg (existence is not checked here).
path = '../input/48k-imdb-movies-with-posters/Poster'
all_directories = os.listdir(path)
image_paths = []
for directory in all_directories:
    group_dir = os.path.join(path, directory)
    image_paths.extend(
        os.path.join(group_dir, dir_, dir_ + '.jpg')
        for dir_ in os.listdir(group_dir)
    )

In [4]:
# Exclude one specific poster file and wrap the remaining paths in a Series.
# NOTE(review): the reason this file is excluded is not recorded in the
# notebook — presumably it cannot be read; confirm.
excluded_poster = '../input/48k-imdb-movies-with-posters/Poster/2015/tt3317562/tt3317562.jpg'
image_paths = pd.Series([
    poster_path for poster_path in image_paths if poster_path != excluded_poster
])

In [5]:
# Map movie id -> poster path. The id is the name of the directory holding the
# poster, so take it from the path itself instead of a fixed '/'-split index,
# which silently breaks if the depth of the input prefix ever changes.
dict_ = {
    os.path.basename(os.path.dirname(poster_path)): poster_path
    for poster_path in image_paths
}

In [13]:
# Assemble one row per movie, attach the poster path, and normalise `genres`
# so every row holds a list of genre names.
data = pd.DataFrame({
    'title': names,
    'genres': genres,
    'description': descriptions,
    'image': image_ids,
})
# Replace the movie id with its poster path; ids without a poster become NaN
# and are removed by the dropna() below.
data['image'] = data['image'].map(dict_)
# explode + groupby(list) turns both scalar and list-valued `genres` cells into
# lists, one row per (title, description, image).
# NOTE(review): groupby sorts by the group keys by default, so the shuffled
# order from sample(frac=1) presumably does not survive this step — confirm.
data = (
    data.dropna()
        .sample(frac=1)
        .reset_index(drop=True)
        .explode('genres')
        .groupby(['title', 'description', 'image'])
        .agg({'genres': lambda x: x.tolist()})
        .reset_index()
)

In [14]:
# One-hot encode the per-movie genre lists into sparse indicator columns.
mlb = MultiLabelBinarizer(sparse_output=True)
genre_indicators = pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(data['genres']),
    index=data.index,
    columns=mlb.classes_)
# Join every indicator column instead of pre-selecting `[targets]`: `targets`
# is only defined in a later cell, so selecting here raised NameError on a
# fresh "Restart & Run All". Downstream code picks the label columns with
# `data[targets]`, so the extra genre columns are simply ignored.
data = data.join(genre_indicators)

In [12]:
# The 24 genre names used as multi-label targets; these are the indicator
# columns selected from `data` when the train/test split is built.
targets = ['Action', 'Crime', 'Adventure', 'Thriller', 'Drama', 'Family',
           'Sport', 'Mystery', 'Western', 'History', 'Sci-Fi', 'Animation',
           'Documentary', 'Music', 'War', 'Biography', 'Musical', 'Superhero',
           'Horror', 'Short', 'Comedy', 'Fantasy', 'Romance', 'Film-Noir']

In [None]:
# Text fed to the model: "<title> | <description>".
title_and_description = data['title'] + ' | ' + data['description']
data['concat'] = title_and_description
# Materialise the row index as an 'index' column; it becomes the record id
# when the JSONL files are written.
data = data.reset_index()

In [20]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Load the CLIP "RN50x4" checkpoint (non-JIT) together with its preprocessing
# transform, placing the model on `device`.
clip_model, preprocess = clip.load("RN50x4", device=device, jit=False)

# Freeze every CLIP weight so no gradients are computed for them.
for clip_param in clip_model.parameters():
    clip_param.requires_grad = False

In [21]:
# Image embeddings per example: the dataset stacks one padded full image plus
# three crops, and the CLIP wrapper reshapes its output to this count.
num_image_embeds = 4
# Number of output labels (matches the 24 entries of `targets`).
num_labels = 24
# Micro-batches whose gradients are accumulated before each optimizer step.
gradient_accumulation_steps = 20
# Overall sequence budget; load_examples() subtracts num_image_embeds + 2 from
# it to get the text-token limit passed to the dataset.
max_seq_length = 80 
# Threshold passed to clip_grad_norm_ in the training loop.
max_grad_norm = 0.5
# Batch sizes for the train and eval DataLoaders.
train_batch_size = 16
eval_batch_size = 16
# Side length (pixels) the full image is resized/padded to before preprocessing.
image_encoder_size = 288
# Width of one image feature vector; used as MMBT's modal_hidden_size and when
# reshaping the CLIP encoder output.
image_features_size = 640
# Number of passes over the training DataLoader.
num_train_epochs = 5

100%|███████████████████████████████████████| 402M/402M [00:06<00:00, 65.9MiB/s]


In [23]:
def slice_image(im, desired_size):
    """Resize `im` so its short side equals `desired_size`, then cut three crops.

    The crops are `desired_size` long along the longer axis and are taken at
    the start, centre and end of that axis (left/centre/right for landscape
    images, top/centre/bottom otherwise).

    Args:
        im: PIL image.
        desired_size: target length in pixels of the short side and of each
            crop along the long side.

    Returns:
        List of three PIL images, in start / centre / end order.
    """
    width, height = im.size
    ratio = float(desired_size) / min(width, height)
    # round() rather than int(): truncation could leave the short side at
    # desired_size - 1 because of float error, which for near-square images
    # made the crop start negative and produced 1-pixel-wide slices. The max()
    # guarantees neither side ends up shorter than one crop.
    new_size = (max(desired_size, round(width * ratio)),
                max(desired_size, round(height * ratio)))
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    im = im.resize(new_size, Image.LANCZOS)
    ar = np.array(im)

    # Slide along columns for landscape images, along rows otherwise.
    axis = 1 if ar.shape[0] < ar.shape[1] else 0
    length = ar.shape[axis]
    starts = (0, (length - desired_size) // 2, length - desired_size)

    images = []
    for start in starts:
        window = slice(start, start + desired_size)
        crop = ar[:, window] if axis == 1 else ar[window, :]
        images.append(Image.fromarray(crop))
    return images
  
def resize_pad_image(im, desired_size):
    """Fit `im` inside a `desired_size` x `desired_size` RGB canvas.

    The image is scaled so its longer side equals `desired_size` (aspect ratio
    kept) and pasted centred on a new RGB image; the uncovered area keeps the
    canvas' default fill.

    Args:
        im: PIL image.
        desired_size: side length in pixels of the square output.

    Returns:
        A new square PIL image in mode "RGB".
    """
    ratio = float(desired_size) / max(im.size)
    # round() avoids losing a pixel to float truncation; the clamp keeps both
    # sides within [1, desired_size] so extreme aspect ratios cannot produce a
    # zero-length side.
    new_size = tuple(
        min(desired_size, max(1, round(side * ratio))) for side in im.size
    )
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
    im = im.resize(new_size, Image.LANCZOS)
    new_im = Image.new("RGB", (desired_size, desired_size))
    new_im.paste(im, ((desired_size - new_size[0]) // 2,
                      (desired_size - new_size[1]) // 2))
    return new_im

In [24]:
class ClipEncoderMulti(nn.Module):
    """Modal encoder that turns a stack of images into CLIP image features.

    Wraps the module-level `clip_model` and reshapes its image features to
    (batch, num_embeds, num_features).

    Args:
        num_embeds: number of images (and therefore embeddings) per example.
        num_features: width of one CLIP image feature vector.
    """

    def __init__(self, num_embeds, num_features=image_features_size):
        super().__init__()
        # NOTE(review): assigning the CLIP model here registers it as a
        # submodule, so model.train() presumably switches it to train mode as
        # well even though its weights are frozen — confirm whether its
        # normalisation layers should stay in eval mode.
        self.model = clip_model
        self.num_embeds = num_embeds
        self.num_features = num_features

    def forward(self, x):
        """Encode images shaped (..., C, H, W) into (-1, num_embeds, num_features)."""
        # Flatten all leading dims into one image batch, taking C/H/W from the
        # input rather than hard-coding 3x288x288,
        # e.g. (B, 4, 3, 288, 288) -> (B*4, 3, 288, 288).
        out = self.model.encode_image(x.view(-1, *x.shape[-3:]))
        # Regroup the per-image features by example and cast to float32.
        out = out.view(-1, self.num_embeds, self.num_features).float()
        return out

In [25]:
class JsonlDataset(Dataset):
    """Dataset over a JSON-lines file whose records hold "text", "img", "label".

    Args:
        data_path: path to the .jsonl file; image paths in the records are
            resolved relative to its directory.
        tokenizer: object with an `encode(text, add_special_tokens=True)` method.
        transforms: callable applied to every PIL image.
        max_seq_length: maximum number of text tokens kept per example
            (excluding the leading and trailing special tokens).
    """

    def __init__(self, data_path, tokenizer, transforms, max_seq_length):
        # Close the file deterministically and ignore blank lines (e.g. a
        # trailing newline), which json.loads would reject.
        with open(data_path) as jsonl_file:
            self.data = [json.loads(line) for line in jsonl_file if line.strip()]
        self.data_dir = os.path.dirname(data_path)
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length
        self.transforms = transforms

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        """Return the token, image and label tensors for record `index`."""
        record = self.data[index]
        sentence = torch.LongTensor(self.tokenizer.encode(record["text"], add_special_tokens=True))
        # The first and last ids are the special tokens added by the tokenizer;
        # they are returned separately as the image start/end tokens.
        start_token, sentence, end_token = sentence[0], sentence[1:-1], sentence[-1]
        sentence = sentence[:self.max_seq_length]

        # Wrapping in a list gives the label a leading dimension of 1.
        label = torch.FloatTensor([record["label"]])

        image = Image.open(os.path.join(self.data_dir, record["img"])).convert("RGB")
        # Use the shared size constant for the crops too, so they always match
        # the padded full image below.
        sliced_images = slice_image(image, image_encoder_size)
        sliced_images = [np.array(self.transforms(im)) for im in sliced_images]
        image = resize_pad_image(image, image_encoder_size)
        image = np.array(self.transforms(image))

        # Order: padded full image first, then the three crops.
        sliced_images = [image] + sliced_images
        # NOTE(review): moving tensors to `device` inside __getitem__ presumably
        # prevents using DataLoader worker processes with a CUDA device — confirm.
        sliced_images = torch.from_numpy(np.array(sliced_images)).to(device)

        return {"image_start_token": start_token,
                "image_end_token": end_token,
                "sentence": sentence,
                "image": sliced_images,
                "label": label}

    def get_label_frequencies(self):
        """Count label occurrences over the whole dataset.

        Scalar labels are counted by value. List/tuple labels (multi-hot
        vectors) are unhashable, so they are counted per position instead:
        the result maps class index -> number of records where it is non-zero.
        """
        label_freqs = Counter()
        for row in self.data:
            label = row["label"]
            if isinstance(label, (list, tuple)):
                label_freqs.update(i for i, flag in enumerate(label) if flag)
            else:
                label_freqs.update([label])
        return label_freqs

    def get_labels(self):
        """Return the raw "label" value of every record, in file order."""
        return [row["label"] for row in self.data]
   

def collate_fn(batch):
    """Merge dataset items into padded batch tensors.

    Token sequences are right-padded with zeros to the longest sequence in the
    batch, and an attention mask marks the real tokens with 1. Images, labels
    and the image start/end tokens are stacked along a new leading dimension.

    Returns:
        (text, mask, images, image_start_tokens, image_end_tokens, labels)
    """
    sentences = [row["sentence"] for row in batch]
    padded_shape = (len(batch), max(len(tokens) for tokens in sentences))

    mask_tensor = torch.zeros(*padded_shape, dtype=torch.long)
    text_tensor = torch.zeros(*padded_shape, dtype=torch.long)
    for row_idx, tokens in enumerate(sentences):
        n_tokens = len(tokens)
        text_tensor[row_idx, :n_tokens] = tokens
        mask_tensor[row_idx, :n_tokens] = 1

    def stack_field(key):
        # Stack one per-item tensor field across the batch.
        return torch.stack([row[key] for row in batch])

    return (text_tensor,
            mask_tensor,
            stack_field("image"),
            stack_field("image_start_token"),
            stack_field("image_end_token"),
            stack_field("label"))

In [26]:
def load_examples(tokenizer, evaluate=False):
    """Build the JsonlDataset for the evaluation or the training split.

    Args:
        tokenizer: tokenizer handed to the dataset.
        evaluate: when true read "dev_seen_clean.jsonl", otherwise
            "train_augmented.jsonl".

    Returns:
        A JsonlDataset using CLIP's `preprocess` as the image transform.
    """
    if evaluate:
        jsonl_path = "dev_seen_clean.jsonl"
    else:
        jsonl_path = "train_augmented.jsonl"
    # Text-token budget: the overall limit minus the image embeddings and two
    # further positions (presumably the image start/end tokens — confirm).
    text_budget = max_seq_length - num_image_embeds - 2
    return JsonlDataset(jsonl_path, tokenizer, preprocess, text_budget)

In [None]:
# Text backbone and tokenizer come from the same pretrained checkpoint; the
# image side is the frozen CLIP wrapper.
model_name = 'bert-base-multilingual-cased'
transformer_config = AutoConfig.from_pretrained(model_name) 
transformer = AutoModel.from_pretrained(model_name, config=transformer_config)
img_encoder = ClipEncoderMulti(num_image_embeds)
# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint —
# confirm this is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)

# `num_classes` was never defined anywhere in the notebook (NameError on a
# fresh run); the label-count constant from the config cell is `num_labels`.
config = MMBTConfig(transformer_config, num_labels=num_labels, modal_hidden_size=image_features_size)
model = MMBTForClassification(config, transformer, img_encoder)
model.to(device) 

In [33]:
# Split features and multi-hot genre targets with the same row partition.
# A fixed random_state keeps the partition — and therefore the JSONL files
# written from it — identical across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    data[['index', 'genres', 'concat', 'image']],
    data[targets],
    random_state=42,
)

In [34]:
def _to_records(features, labels):
    """Pair each feature row with its label row and build one dict per example.

    Args:
        features: DataFrame with 'index', 'image' and 'concat' columns.
        labels: DataFrame of indicator columns, row-aligned with `features`.

    Returns:
        List of dicts with keys 'id', 'img', 'label' and 'text'.
    """
    records = []
    for (_, row_x), (_, row_y) in zip(features.iterrows(), labels.iterrows()):
        records.append({'id': row_x['index'],
                        'img': row_x['image'],
                        'label': row_y.to_list(),
                        'text': row_x['concat']})
    return records


# Same conversion for both splits; the previously copy-pasted loops also read
# a `genre` value that was never used.
test = _to_records(x_test, y_test)
train = _to_records(x_train, y_train)

In [35]:
# Serialise each split as JSON lines: one record per line, training file first.
for jsonl_path, records in (('./train_augmented.jsonl', train),
                            ('./dev_seen_clean.jsonl', test)):
    with open(jsonl_path, 'w') as outfile:
        for entry in records:
            outfile.write(json.dumps(entry))
            outfile.write('\n')

In [36]:
# Datasets read back the JSONL files written for each split.
train_dataset = load_examples(tokenizer)
eval_dataset = load_examples(tokenizer, evaluate=True)

# Training batches are drawn in random order, evaluation batches in file order.
train_sampler, eval_sampler = RandomSampler(train_dataset), SequentialSampler(eval_dataset)

train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    sampler=train_sampler,
    collate_fn=collate_fn,
)
eval_dataloader = DataLoader(
    eval_dataset,
    batch_size=eval_batch_size,
    sampler=eval_sampler,
    collate_fn=collate_fn,
)

In [39]:
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]
weight_decay = 0.0005

# Partition the parameters in one pass, preserving named_parameters() order
# inside each group.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]

# Optimizer steps over the whole run: full accumulation windows per epoch times
# the number of epochs; the first tenth of them is linear warm-up.
t_total = (len(train_dataloader) // gradient_accumulation_steps) * num_train_epochs
warmup_steps = t_total // 10
optimizer = MADGRAD(optimizer_grouped_parameters, lr=2e-4)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=t_total,
)
criterion = nn.BCEWithLogitsLoss()

In [40]:
def evaluate(model, tokenizer, criterion, dataloader, tres = 0.5): 
    """Run `model` over `dataloader` and report loss, micro-F1 and raw outputs.

    Args:
        model: called with the MMBT keyword inputs; its first output is taken
            as the logits.
        tokenizer: unused; kept so existing callers keep working.
        criterion: loss applied to (logits, labels).
        dataloader: yields batches in the order produced by `collate_fn`
            (text, mask, images, start tokens, end tokens, labels).
        tres: probability threshold above which a label counts as predicted.

    Returns:
        Dict with "loss" (mean of the per-batch losses), "micro_f1",
        "prediction" (boolean array), "labels" (as delivered by the
        dataloader) and "proba" (sigmoid probabilities).
    """
    print('eval')
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0
    pred_chunks, proba_chunks, label_chunks = [], [], []
    for batch in dataloader:
        batch = tuple(t.to(device) for t in batch)
        labels = batch[5]
        inputs = {"input_ids": batch[0],
                  "input_modal": batch[2],
                  "attention_mask": batch[1],
                  "modal_start_tokens": batch[3],
                  "modal_end_tokens": batch[4],
                  "return_dict": False}
        with torch.no_grad():
            logits = model(**inputs)[0]
            # Take the label width from the logits instead of a literal: the
            # previous hard-coded 23 disagreed with the 24 labels used in
            # training and could not reshape the label tensor.
            tmp_eval_loss = criterion(logits, labels.reshape(-1, logits.shape[-1]))
        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1

        # Compute the probabilities once and derive the predictions from them.
        batch_proba = torch.sigmoid(logits).detach().cpu().numpy()
        proba_chunks.append(batch_proba)
        pred_chunks.append(batch_proba > tres)
        label_chunks.append(labels.detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps
    # Concatenate once at the end rather than re-copying the arrays per batch.
    preds = np.concatenate(pred_chunks, axis=0)
    proba = np.concatenate(proba_chunks, axis=0)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    result = {"loss": eval_loss,
              "micro_f1": f1_score(out_label_ids.reshape(-1, preds.shape[-1]), preds, average='micro'),
              "prediction": preds,
              "labels": out_label_ids,
              "proba": proba}

    return result

In [41]:
# Counters and histories shared with the training loop.
# Number of optimizer updates performed so far.
optimizer_step = 0
# Number of micro-batches processed so far (across epochs).
global_step = 0
# Not referenced by the other cells visible in this notebook.
train_step = 0
# tr_loss accumulates the (scaled) training loss; logging_loss is not
# referenced by the other visible cells.
tr_loss, logging_loss = 0.0, 0.0
# Histories appended to at every evaluation point.
global_steps_list = []
train_loss_list = []
val_loss_list = []
val_f1_list = []
# Evaluate whenever (step + 1) within an epoch is a multiple of this value.
# NOTE(review): 2217 is presumably the number of training batches per epoch,
# i.e. one evaluation per epoch — confirm against len(train_dataloader).
eval_every = 2217
# Not referenced by the other cells visible in this notebook.
file_path = ""

In [None]:
model.zero_grad()

# Sum of the unscaled micro-batch losses since the last evaluation. It was
# previously used with `+=` without ever being initialised, which raised
# NameError on the first training step.
running_loss = 0.0

for i in range(num_train_epochs):
    print("Epoch", i + 1, f"from {num_train_epochs}")
    for step, batch in enumerate(tqdm(train_dataloader)):
        model.train()
        batch = tuple(t.to(device) for t in batch)
        labels = batch[5]
        inputs = {"input_ids": batch[0],
                  "input_modal": batch[2],
                  "attention_mask": batch[1],
                  "modal_start_tokens": batch[3],
                  "modal_end_tokens": batch[4],
                  "return_dict": False}
        outputs = model(**inputs)
        logits = outputs[0]
        # Match the label width to the logits instead of hard-coding 24.
        loss = criterion(logits, labels.reshape(-1, logits.shape[-1]))
        # Track the loss before it is scaled for accumulation, so the reported
        # train loss is on the same scale as the validation loss.
        running_loss += loss.item()
        if gradient_accumulation_steps > 1:
            loss = loss / gradient_accumulation_steps
        loss.backward()

        tr_loss += loss.item()
        global_step += 1

        # Apply one optimizer/scheduler update per full accumulation window.
        # NOTE(review): gradients from a trailing partial window at the end of
        # an epoch are neither applied nor cleared here, so they carry over
        # into the next epoch's first window — confirm this is acceptable.
        if (step + 1) % gradient_accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

            optimizer_step += 1
            optimizer.zero_grad()

        if (step + 1) % eval_every == 0:
            average_train_loss = running_loss / eval_every
            train_loss_list.append(average_train_loss)
            global_steps_list.append(global_step)
            running_loss = 0.0

            val_result = evaluate(model, tokenizer, criterion, eval_dataloader)

            val_loss_list.append(val_result['loss'])
            val_f1_list.append(val_result['micro_f1'])

            print("Train loss:", f"{average_train_loss:.4f}", 
                  "Val loss:", f"{val_result['loss']:.4f}",
                  "Val f1:", f"{val_result['micro_f1']:.4f}")   
    print('\n')     