In [2]:
import re
import os
import nltk
import time
import json
import torch
import random
import numpy as np
import pandas as pd
import transformers
import torch.nn as nn
import torch.nn.functional as F

from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import DistilBertTokenizer, DistilBertModel
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting code is visible in this part of the notebook —
# confirm this magic is still needed.
%matplotlib inline

In [None]:
# NLTK helpers: a WordNet lemmatizer and the English stop-word list.
# NOTE(review): neither name is referenced by the cells visible here —
# confirm they are still needed before keeping the NLTK data dependency.
lemma = nltk.stem.WordNetLemmatizer()
words = stopwords.words("english")

In [3]:
# Walk the dataset tree (<path>/<directory>/<movie_dir>/<movie_dir>.json) and
# collect the fields needed for training into four parallel lists.
path = '../input/48k-imdb-movies-data/Data'
images, names, genres, descriptions = [], [], [], []
all_directories = os.listdir(path)
for directory in all_directories:
    directories = os.listdir(os.path.join(path, directory))
    for dir_ in directories:
        file_path = os.path.join(path, directory, dir_, dir_ + '.json')
        with open(file_path, encoding='utf-8') as file:
            movie = json.load(file)
        try:
            # Read every required field BEFORE appending anything: if a later
            # key is missing, a partially appended record would leave the four
            # lists with different lengths and break the DataFrame constructor.
            description = movie['description']
            genre = movie['genre']
            name = movie['name']
        except KeyError:
            # Skip movies that lack any of the required fields.
            continue
        descriptions.append(description)
        images.append(dir_)
        names.append(name)
        genres.append(genre)

In [4]:
# Assemble the movie table, shuffle it, and rebuild `genres` as one list per
# (title, description) pair. Only `title`, `description` and `genres` survive
# the aggregation; the `image` column is not carried through.
data = (
    pd.DataFrame({'title': names,
                  'genres': genres,
                  'description': descriptions,
                  'image': images})
    .sample(frac=1)
    .reset_index(drop=True)
    .explode('genres')
    .groupby(['title', 'description'])
    .agg({'genres': lambda genre_series: genre_series.tolist()})
    .reset_index()
)

# Single text field fed to the model: title and description joined by [SEP].
data['concat'] = data['title'] + ' [SEP] ' + data['description']

In [5]:
# One-hot encode the genre lists: one sparse indicator column per genre class,
# joined back onto `data` row by row.
mlb = MultiLabelBinarizer(sparse_output=True)
genre_matrix = mlb.fit_transform(data['genres'])
genre_frame = pd.DataFrame.sparse.from_spmatrix(genre_matrix,
                                                index=data.index,
                                                columns=mlb.classes_)
data = data.join(genre_frame)

In [7]:
# The 24 genre columns used as classification targets, in label-column order.
targets = [
    'Action', 'Crime', 'Adventure', 'Thriller',
    'Drama', 'Family', 'Sport', 'Mystery',
    'Western', 'History', 'Sci-Fi', 'Animation',
    'Documentary', 'Music', 'War', 'Biography',
    'Musical', 'Superhero', 'Horror', 'Short',
    'Comedy', 'Fantasy', 'Romance', 'Film-Noir',
]

In [8]:
# Inputs are the "title [SEP] description" strings; labels are the indicator
# columns named in `targets`.
X = data['concat'].values
y = data[targets].values

# Hold out 10% of the rows for validation, with a fixed seed for the split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.1,
    random_state=2020,
)

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Ordered (pattern, replacement) pairs applied to lower-cased text.
# Order matters: "'scuse" must be rewritten before the generic "'s" rule
# (otherwise it can never match), and "can't" before the generic "n't" rule.
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"won't", "will not "),
    (r"\'scuse", " excuse "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r"\'\n", " "),
    (r"-", " "),
    (r"\'\xa0", " "),
)


def _clean_segment(text):
    """Lower-case one text segment, expand contractions, drop digits, tidy spaces."""
    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    text = ''.join(c for c in text if not c.isnumeric())
    text = re.sub(r'&amp;', '&', text)
    # A single whitespace pass at the end also absorbs the gaps left by the
    # substitutions and by digit removal.
    return re.sub(r'\s+', ' ', text).strip()


def text_preprocessing(text):
    """Normalize raw text before tokenization.

    The text is lower-cased, common English contractions are expanded,
    hyphens become spaces, numeric characters are removed, the HTML entity
    ``&amp;`` is decoded and runs of whitespace are collapsed.

    A literal ``[SEP]`` marker is preserved in upper case: each side of the
    marker is cleaned independently and the pieces are re-joined with
    ``" [SEP] "``. Lower-casing the marker would stop the tokenizer from
    recognising it as its separator token.

    Args:
        text (str): Raw input text.

    Returns:
        str: The cleaned text, without leading or trailing whitespace.
    """
    segments = (_clean_segment(part) for part in text.split("[SEP]"))
    return " [SEP] ".join(segments).strip()

In [None]:
# Load the WordPiece vocabulary from the DistilBERT checkpoint so that the
# tokenizer class and the checkpoint it is loaded from agree.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased", do_lower_case=True)

In [None]:
def preprocessing_for_bert(data):
    """Clean and encode an iterable of texts into model-ready tensors.

    Every text is passed through ``text_preprocessing`` and then encoded with
    the notebook-level ``tokenizer``, adding special tokens and padding to
    ``MAX_LEN`` (read from the notebook namespace at call time).

    Args:
        data: Iterable of raw text strings.

    Returns:
        tuple: ``(input_ids, attention_masks)`` as two ``torch.Tensor`` objects
        built from the per-text token ids and attention masks.
    """
    encodings = [
        tokenizer.encode_plus(text=text_preprocessing(sent),
                              add_special_tokens=True,
                              max_length=MAX_LEN,
                              pad_to_max_length=True,
                              return_attention_mask=True)
        for sent in data
    ]

    input_ids = torch.tensor([enc.get('input_ids') for enc in encodings])
    attention_masks = torch.tensor([enc.get('attention_mask') for enc in encodings])
    return input_ids, attention_masks

In [15]:
# Length (in tokens) that every encoded example is padded to.
MAX_LEN = 256

# Encode the training split first, then the validation split.
(train_inputs, train_masks), (val_inputs, val_masks) = (
    preprocessing_for_bert(split) for split in (X_train, X_val)
)

In [17]:
# Mini-batch size shared by the training and validation loaders.
batch_size = 16

# Label matrices as tensors, aligned row-for-row with the encoded inputs.
train_labels, val_labels = torch.tensor(y_train), torch.tensor(y_val)

# Each dataset item is an (input_ids, attention_mask, labels) triple.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
val_data = TensorDataset(val_inputs, val_masks, val_labels)

# Training batches are drawn in random order; validation keeps dataset order.
train_sampler = RandomSampler(train_data)
val_sampler = SequentialSampler(val_data)

train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
val_dataloader = DataLoader(val_data, batch_size=batch_size, sampler=val_sampler)

In [None]:
class BertClassifier(nn.Module):
    """DistilBERT encoder followed by a small feed-forward multi-label head.

    The head maps the hidden state of the first ([CLS]) token to one raw logit
    per label. No sigmoid is applied in ``forward``; pair the output with a
    logits-based loss such as ``nn.BCEWithLogitsLoss``.

    Args:
        freeze_bert (bool): If True, the encoder weights are frozen and only
            the classification head is trained.
        hidden_size (int): Width of the hidden layer in the head.
        num_labels (int): Number of output logits (one per target label).
    """

    def __init__(self, freeze_bert=False, hidden_size=30, num_labels=24):
        super().__init__()
        # Use a DistilBERT checkpoint: the weights stored in a BERT checkpoint
        # do not map onto DistilBertModel's parameters, so the encoder would
        # not start from the pretrained weights.
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")

        # Take the encoder width from its config instead of hard-coding it.
        D_in, H, D_out = self.bert.config.dim, hidden_size, num_labels
        self.classifier = nn.Sequential(
                            nn.Linear(D_in, H),
                            nn.ReLU(),
                            nn.Linear(H, D_out))
        # Not used by forward(); kept so that code applying `model.sigmoid`
        # to the returned logits keeps working.
        self.sigmoid = nn.Sigmoid()

        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return raw (pre-sigmoid) logits for a batch.

        Args:
            input_ids: Token-id tensor for the batch.
            attention_mask: Attention-mask tensor matching ``input_ids``.

        Returns:
            torch.Tensor: Output of the classification head applied to the
            first-token hidden state of each sequence.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # outputs[0] is the last hidden state; position 0 is the first token.
        last_hidden_state_cls = outputs[0][:, 0, :]
        logit = self.classifier(last_hidden_state_cls)
        return logit

In [19]:
def initialize_model(epochs=4):
    """Create the classifier together with its optimizer and LR schedule.

    The model is moved to the notebook-level ``device``. The schedule length
    is ``epochs`` times the number of batches in the notebook-level
    ``train_dataloader``, with no warm-up steps.

    Args:
        epochs (int): Number of training epochs the schedule should span.

    Returns:
        tuple: ``(model, optimizer, scheduler)``.
    """
    model = BertClassifier(freeze_bert=False)
    model.to(device)

    optim = AdamW(model.parameters(), lr=5e-5, eps=1e-8)

    # One scheduler step is taken per training batch.
    num_training_steps = len(train_dataloader) * epochs
    lr_schedule = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )
    return model, optim, lr_schedule

In [22]:
def set_seed(seed_value=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators.

    Args:
        seed_value (int): Seed applied to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Train ``model`` for ``epochs`` passes over ``train_dataloader``.

    Uses the notebook-level ``optimizer``, ``scheduler``, ``loss_fn`` and
    ``device``. Progress rows are printed as a text table; when ``evaluation``
    is True, ``evaluate`` is run on ``val_dataloader`` after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Validation batches; only used when ``evaluation`` is True.
        epochs (int): Number of passes over the training data.
        evaluation (bool): Whether to evaluate after each epoch.
    """
    divider = "-" * 70

    print("Start training...\n")
    for epoch_i in range(epochs):
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'F1-score':^9} | {'Elapsed':^9}")
        print(divider)

        epoch_start = time.time()
        window_start = time.time()
        epoch_loss = 0
        window_loss = 0
        window_batches = 0

        model.train()
        for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            window_batches += 1
            b_input_ids, b_attn_mask, b_labels = (t.to(device) for t in batch)

            # Forward pass on fresh gradients.
            model.zero_grad()
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels.float())

            loss_value = loss.item()
            window_loss += loss_value
            epoch_loss += loss_value

            # Backward pass with gradient-norm clipping, then step both the
            # optimizer and the learning-rate schedule.
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            # Report every 50000 steps (never at step 0) and on the last batch.
            at_interval = step != 0 and step % 50000 == 0
            is_last_batch = step == len(train_dataloader) - 1
            if at_interval or is_last_batch:
                time_elapsed = time.time() - window_start
                print(f"{epoch_i + 1:^7} | {step:^7} | {window_loss / window_batches:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")
                window_loss = 0
                window_batches = 0
                window_start = time.time()

        avg_train_loss = epoch_loss / len(train_dataloader)

        print(divider)
        if evaluation == True:
            val_loss, val_f1_score = evaluate(model, val_dataloader)
            time_elapsed = time.time() - epoch_start
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_f1_score:^9.2f} | {time_elapsed:^9.2f}")
            print(divider)
        print("\n")
    print("Training complete!")


def evaluate(model, val_dataloader):
    """Compute the mean loss and micro-averaged F1 over a validation loader.

    Uses the notebook-level ``loss_fn`` and ``device``. Logits are converted
    to probabilities with a sigmoid and a label is predicted when its
    probability exceeds 0.5. F1 is computed once over all validation examples,
    because F1 does not decompose into a mean of per-batch scores.

    Args:
        model: Module called as ``model(input_ids, attention_mask)`` that
            returns one raw logit per label.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        tuple: ``(val_loss, val_f1_score)`` — the mean batch loss and the
        micro-averaged F1. Both are NaN when the loader yields no batches.
    """
    model.eval()
    threshold = 0.5
    batch_losses = []
    all_preds, all_labels = [], []

    for batch in val_dataloader:
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels.float())
        batch_losses.append(loss.item())

        # Threshold probabilities, not raw logits. The tensors keep their own
        # (batch, num_labels) layout, so no label count is hard-coded here.
        probs = torch.sigmoid(logits)
        all_preds.append((probs > threshold).cpu().numpy().astype(int))
        all_labels.append(b_labels.cpu().numpy().astype(int))

    if not batch_losses:
        return float('nan'), float('nan')

    val_loss = np.mean(batch_losses)
    # sklearn's signature is f1_score(y_true, y_pred).
    val_f1_score = f1_score(np.concatenate(all_labels),
                            np.concatenate(all_preds),
                            average='micro')
    return val_loss, val_f1_score

In [23]:
# Ask PyTorch to release its cached, currently unused GPU memory before the
# training run starts.
torch.cuda.empty_cache() 

In [None]:
# Number of epochs, shared by the LR schedule and the training loop so the
# two cannot drift apart.
EPOCHS = 4

set_seed(42)

# Multi-label objective computed directly on the model's raw logits.
loss_fn = nn.BCEWithLogitsLoss()

bert_classifier, optimizer, scheduler = initialize_model(epochs=EPOCHS)
train(bert_classifier,
      train_dataloader,
      val_dataloader,
      epochs=EPOCHS,
      evaluation=True)