In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk.data
from transformers import BertTokenizer, BertModel, BertForMaskedLM, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
# Choose the device all tensors and the model are moved to:
# the first CUDA GPU when one is visible, the CPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU: ', torch.cuda.get_device_name(0))
    device = torch.device("cuda:0")

In [None]:
# Load the movie-plot CSV from the notebook's working directory into a DataFrame.
data = pd.read_csv('./wiki_movie_plots_deduped.csv')

In [None]:
# Preview the first rows to check the columns that were loaded.
data.head()

In [None]:
# Keep only the 'Plot' column, as a plain Python list (one entry per row).
plots = data['Plot'].tolist()

In [None]:
# Number of plot entries available before any subsetting.
print(len(plots))

In [None]:
# Name of the pretrained checkpoint the word tokenizer is loaded from.
model_version = 'bert-base-uncased'

# Word-level tokenizer; lower-casing is switched on only for "uncased" checkpoints.
wordTokenizer = BertTokenizer.from_pretrained(
    model_version,
    do_lower_case=model_version.endswith("uncased"),
)

# Sentence splitter (NLTK Punkt, English), used to cut each plot into sentences.
sentTokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# Characters that clean() keeps: everything in string.printable.
printable = set(string.printable)
# max_length handed to the word tokenizer in preprocess().
MAX_LEN = 128

def clean(plot):
  """Remove non-printable characters and bracketed numeric markers from a plot.

  Args:
    plot: Raw plot text (str).

  Returns:
    The text with every character outside ``printable`` dropped and every
    ``[<digits>]`` marker (e.g. ``[12]``, also the empty ``[]``) removed.
  """
  plot1 = ''.join(ch for ch in plot if ch in printable)
  # Raw string literal: in a normal literal '\[' and '\d' are invalid escape
  # sequences and trigger a DeprecationWarning/SyntaxWarning on newer Pythons.
  plot1 = re.sub(r'\[\d*\]', '', plot1)
  return plot1

def addMask(ids, wordTokenizer, mask_prob=0.15):
  """Apply BERT-style random masking to one encoded sentence.

  Every token between [CLS] and the first [SEP] is selected with probability
  ``mask_prob``. A selected token is replaced by [MASK] 80% of the time, by a
  random vocabulary id 10% of the time, and left unchanged 10% of the time.
  In all three cases the label at that position is the ORIGINAL token id, so
  the model is trained to recover it; all other positions get -100.

  Args:
    ids: List of token ids for one sentence. It is modified in place.
    wordTokenizer: Object providing ``convert_tokens_to_ids`` and ``vocab``.
    mask_prob: Probability of selecting a token for prediction (default 0.15).

  Returns:
    (ids, label): the same ``ids`` list after masking, and a list of the same
    length holding the prediction targets (-100 where nothing is predicted).
  """
  label = []
  # Resolve the special-token ids once instead of converting every id to a token.
  maskid = wordTokenizer.convert_tokens_to_ids('[MASK]')
  clsid = wordTokenizer.convert_tokens_to_ids('[CLS]')
  sepid = wordTokenizer.convert_tokens_to_ids('[SEP]')
  vocab_size = len(wordTokenizer.vocab)

  for i, tokenid in enumerate(ids):
    if tokenid == clsid:
      label.append(-100)
      continue
    if tokenid == sepid:
      # Everything from [SEP] onwards is never a prediction target.
      break
    if mask_prob >= np.random.random():
      # Selected token: the target is always the original id, whichever of
      # the three corruption branches is taken below.
      label.append(tokenid)
      p = np.random.random()
      if 0.8 >= p:
        ids[i] = maskid
      elif 0.9 >= p:
        # randint's upper bound is exclusive, so this covers the full vocabulary.
        ids[i] = np.random.randint(0, vocab_size)
      # else: keep the original token in the input.
    else:
      label.append(-100)

  # Fill the remaining positions ([SEP] and anything after it) with -100.
  label.extend([-100] * (len(ids) - len(label)))
  assert len(label) == len(ids)
  return ids, label
  

def preprocess(plots):
  """Build masked-LM examples, one per sentence, from a list of plot strings.

  Each plot is passed through clean(), split into sentences with
  sentTokenizer, and every sentence is encoded by wordTokenizer with
  max_length=MAX_LEN and pad_to_max_length=True before addMask() is applied.

  Args:
    plots: Iterable of raw plot strings.

  Returns:
    (ids, attentionMasks, labels): three parallel lists with one entry per
    sentence - the masked input ids, the tokenizer's attention mask, and the
    label list produced by addMask().
  """
  ids, attentionMasks, labels = [], [], []

  for rawPlot in plots:
    for sentence in sentTokenizer.tokenize(clean(rawPlot)):
      encoded = wordTokenizer.encode_plus(
          sentence,
          add_special_tokens=True,
          max_length=MAX_LEN,
          pad_to_max_length=True)
      # addMask edits encoded['input_ids'] in place and returns it with the labels.
      maskedIds, sentenceLabels = addMask(encoded['input_ids'], wordTokenizer)
      ids.append(maskedIds)
      attentionMasks.append(encoded['attention_mask'])
      labels.append(sentenceLabels)

  return ids, attentionMasks, labels

In [None]:
# Seed NumPy's global RNG first: the masking inside preprocess() draws from
# np.random, so without a seed every run would build a different dataset.
np.random.seed(42)

# Build the examples from the first 30,000 plots only.
ids, masks, labels = preprocess(plots[:30000])

In [None]:
from sklearn.model_selection import train_test_split

# Split the three parallel lists in ONE call so inputs, masks and labels are
# shuffled with the same permutation by construction (rather than relying on
# two separate calls happening to share the same random_state).
# 10% of the sentences are held out for validation.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    ids, masks, labels, random_state=42, test_size=0.1)

In [None]:
# Turn each split from nested Python lists into tensors so they can be
# wrapped in a TensorDataset. The names are rebound in place.
train_inputs, train_masks, train_labels = (
    torch.tensor(split)
    for split in (train_inputs, train_masks, train_labels))
validation_inputs, validation_masks, validation_labels = (
    torch.tensor(split)
    for split in (validation_inputs, validation_masks, validation_labels))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of sentences per optimisation step.
batch_size = 16

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation only averages a loss over the whole set, so shuffling buys
# nothing; iterate it in a fixed order to keep evaluation deterministic.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
# Load the pretrained masked-LM model for the same checkpoint name used for
# the tokenizer, then move its parameters onto the selected device.
model_version = 'bert-base-uncased'
model = BertForMaskedLM.from_pretrained(model_version).to(device)

In [None]:
# Optimiser hyper-parameters
lr = 2e-5
adam_epsilon = 1e-8

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Schedule length: one scheduler step per training batch, no warm-up steps
num_warmup_steps = 0
num_training_steps = epochs * len(train_dataloader)

# In Transformers, the optimizer and the schedule are split and instantiated separately.
# correct_bias=False reproduces the BertAdam-specific behaviour.
optimizer = AdamW(
    model.parameters(),
    lr=lr,
    eps=adam_epsilon,
    correct_bias=False,
)
# Learning-rate scheduler driven by scheduler.step() in the training loop
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Per-epoch history: average training loss and the learning rate at epoch end
train_loss_set = []
learning_rate = []

# Print a progress line every `log_every` training steps (instead of every step,
# which floods the cell output)
log_every = 100

for epoch_i in range(1, epochs + 1):
  print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))

  # ---------------- Training ----------------
  # Running sum of the batch losses for this epoch
  train_loss = 0
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  for step, batch in enumerate(train_dataloader):
    if step % log_every == 0:
      print('  step {:} / {:}'.format(step, len(train_dataloader)))

    # Move the batch to the selected device
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    # Gradients accumulate by default, so clear them once before each step
    model.zero_grad()

    # Forward pass; the first element of the model output is the masked-LM loss
    outputs = model(b_input_ids, attention_mask=b_input_mask, masked_lm_labels=b_labels)
    loss = outputs[0]

    # Backward pass
    loss.backward()

    # Clip the norm of the gradients to 1.0
    # (gradient clipping is not built into AdamW anymore)
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

    # Update parameters, then advance the learning-rate schedule
    optimizer.step()
    scheduler.step()

    # Update tracking variables
    train_loss += loss.item()

  # Average loss over all training batches of this epoch
  avg_train_loss = train_loss / len(train_dataloader)

  # Store the current learning rate of every parameter group
  for param_group in optimizer.param_groups:
    print("\n\tCurrent Learning rate: ",param_group['lr'])
    learning_rate.append(param_group['lr'])

  train_loss_set.append(avg_train_loss)
  print(F'\n\tAverage Training loss: {avg_train_loss}')

  # ---------------- Validation ----------------
  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Running sum of the validation batch losses
  validation_loss = 0.0

  for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    # Do not compute or store gradients: saves memory and speeds up validation
    with torch.no_grad():
      outputs = model(b_input_ids, attention_mask=b_input_mask, masked_lm_labels=b_labels)
      loss = outputs[0]

    validation_loss += loss.item()
  avg_validation_loss = validation_loss / len(validation_dataloader)

  print(F'\n\tAverage Validation loss: {avg_validation_loss}')

  # Checkpoint after every epoch.
  # NOTE(review): this pickles the whole model object, which ties the file to
  # the exact class/module layout at load time; consider saving
  # model.state_dict() instead if the checkpoints need to be portable.
  torch.save(model, './model'+str(epoch_i)+'.pt')