In [None]:
!wget -q  https://www.dropbox.com/s/hnnhfr0ekpr0bqo/DREAM.zip

In [None]:
!unzip -o /content/DREAM.zip

Archive:  /content/DREAM.zip
  inflating: __MACOSX/._DREAM        
  inflating: DREAM/test.json         
  inflating: __MACOSX/DREAM/._test.json  
  inflating: DREAM/dev.json          
  inflating: __MACOSX/DREAM/._dev.json  
  inflating: DREAM/train.json        
  inflating: __MACOSX/DREAM/._train.json  


In [None]:
! git clone https://github.com/NVIDIA/apex.git
% cd apex
! pip install -v --no-cache-dir ./
%cd ..

from apex import amp

import pandas as pd
import numpy as np
import logging
import argparse
import random
import numpy as np
import os
import time
import glob
import json

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

!pip install transformers
!pip install sentencepiece
from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForMultipleChoice
from transformers import AdamW, get_linear_schedule_with_warmup, AlbertConfig, AlbertTokenizer, AlbertForMultipleChoice

import torch
from torch.optim.adam import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.nn.utils.rnn import pad_sequence

fatal: destination path 'apex' already exists and is not an empty directory.
/content/apex
Created temporary directory: /tmp/pip-ephem-wheel-cache-c3gatvoh
Created temporary directory: /tmp/pip-req-tracker-y9e9amt8
Created requirements tracker '/tmp/pip-req-tracker-y9e9amt8'
Created temporary directory: /tmp/pip-install-antkaqfv
Processing /content/apex
  Created temporary directory: /tmp/pip-req-build-1e2ngtf1
  Added file:///content/apex to build tracker '/tmp/pip-req-tracker-y9e9amt8'
    Running setup.py (path:/tmp/pip-req-build-1e2ngtf1/setup.py) egg_info for package from file:///content/apex
    Running command python setup.py egg_info


    torch.__version__  = 1.8.1+cu101


    running egg_info
    creating /tmp/pip-req-build-1e2ngtf1/pip-egg-info/apex.egg-info
    writing /tmp/pip-req-build-1e2ngtf1/pip-egg-info/apex.egg-info/PKG-INFO
    writing dependency_links to /tmp/pip-req-build-1e2ngtf1/pip-egg-info/apex.egg-info/dependency_links.txt
    writing top-level names to /tmp/

# ALBERT Transformer

## ALBERT Pre-Trained Tokenizers

In [None]:
def tokenization():
    """Load and return the pretrained ALBERT tokenizer for the 'albert-base-v2' checkpoint."""
    return AlbertTokenizer.from_pretrained('albert-base-v2')

## ALBERT Pre-Trained Model

In [None]:
def dream_model():
    """Build an ALBERT multiple-choice model from the 'albert-base-v2' checkpoint.

    Returns:
        An ``AlbertForMultipleChoice`` instance loaded from the pretrained
        weights, using a config that requests ``num_labels=4``.
    """
    # NOTE(review): the examples built in this notebook carry three answer
    # options while the config asks for four labels - confirm whether
    # num_labels has any effect on the multiple-choice head.
    checkpoint = 'albert-base-v2'
    return AlbertForMultipleChoice.from_pretrained(
        checkpoint,
        config=AlbertConfig.from_pretrained(checkpoint, num_labels=4),
    )

# DREAM Dataset

## DREAM Feature Class

In [None]:
class dreamFeatures(object):
    """Encoded model inputs for one example, with one entry per answer choice."""

    def __init__(self, example_id, choices_features, label):
        """Store the example id, the per-choice encodings and the label.

        Args:
            example_id: Identifier of the example.
            choices_features: Iterable of 4-tuples
                ``(tokens, input_ids, segment_ids, input_mask)``; the first
                element of each tuple is ignored.
            label: Label stored unchanged on the instance.
        """
        self.example_id = example_id

        # Keep only the three encodings, keyed by name, for every choice.
        encoded_choices = []
        for _unused_tokens, ids, segments, mask in choices_features:
            encoded_choices.append({
                'input_ids': ids,
                'segment_ids': segments,
                'input_mask': mask,
            })
        self.choices_features = encoded_choices

        self.label = label

## DREAM Object Class

In [None]:
class dream(object):
    """One multiple-choice example: an article, a question and three options."""

    def __init__(self, test_id, article, start_ending,
                 ending_0, ending_1, ending_2, label):
        """Store the example fields; the three endings are kept as a list.

        Args:
            test_id: Identifier of the example.
            article: Article text the question refers to.
            start_ending: The question text.
            ending_0: First answer option.
            ending_1: Second answer option.
            ending_2: Third answer option.
            label: Label of the example, or ``None`` when there is none.
        """
        self.test_id = test_id
        self.article = article
        self.start_ending = start_ending
        self.endings = [ending_0, ending_1, ending_2]
        self.label = label

    def __str__(self):
        """Return the same text as ``__repr__``."""
        return self.__repr__()

    def __repr__(self):
        """Render all fields as one comma-separated line; label only if set."""
        templated = (
            ("id: {}", self.test_id),
            ("article: {}", self.article),
            ("question: {}", self.start_ending),
            ("option_0: {}", self.endings[0]),
            ("option_1: {}", self.endings[1]),
            ("option_2: {}", self.endings[2]),
        )
        rendered = [template.format(value) for template, value in templated]

        # A label of 0 is valid, so test against None rather than truthiness.
        if self.label is not None:
            rendered.append("label: {}".format(self.label))

        return ", ".join(rendered)

# Read DREAM 

In [None]:
def readDREAM(task):
    """Read one DREAM split and return its questions as ``dream`` examples.

    Each entry of the JSON file is indexed as ``[turns, questions, ...]``:
    ``turns`` is a list of dialogue-turn strings and ``questions`` is a list
    of dicts with the keys ``"question"``, ``"choice"`` and ``"answer"``.
    Every question of every dialogue produces one example (not only the
    first question of each dialogue), so questions that share a dialogue
    share the same article text.

    Args:
        task: Split name, one of ``"train"``, ``"dev"`` or ``"test"``.

    Returns:
        A list of ``dream`` objects. ``test_id`` is ``"dream-<n>"`` with ``n``
        the position in the returned list, and ``label`` is the 0-based index
        of the answer string inside the question's choice list.

    Raises:
        ValueError: If ``task`` is not a known split, if a question does not
            have exactly three choices, or if its answer is not one of them.
    """
    if task not in _DREAM_PATHS:
        # Fail with a clear message instead of trying to open an empty path.
        raise ValueError(
            "unknown DREAM split {!r}; expected one of {}".format(task, sorted(_DREAM_PATHS))
        )

    with open(_DREAM_PATHS[task], encoding="utf-8") as f:
        data = json.load(f)

    examples = []
    for story in data:
        article = _dream_article(story[0])
        for item in story[1]:
            choices = item["choice"]
            answer = item["answer"]
            if len(choices) != 3:
                raise ValueError(
                    "expected 3 choices, got {} for question {!r}".format(len(choices), item["question"])
                )
            if answer not in choices:
                raise ValueError(
                    "answer {!r} is not among the choices of question {!r}".format(answer, item["question"])
                )
            examples.append(dream(
                test_id="dream-" + str(len(examples)),
                article=article,
                start_ending=item["question"],
                ending_0=choices[0],
                ending_1=choices[1],
                ending_2=choices[2],
                label=choices.index(answer),
            ))

    return examples


# Location of each split on disk.
_DREAM_PATHS = {
    "train": "/content/DREAM/train.json",
    "dev": "/content/DREAM/dev.json",
    "test": "/content/DREAM/test.json",
}

# Speaker tag at the start of a turn -> prefix written before each sentence.
_DREAM_SPEAKERS = (("M:", "Man: "), ("W:", "Woman: "))


def _dream_article(turns):
    """Flatten the dialogue turns into one string with spelled-out speakers."""
    pieces = []
    for turn in turns:
        for tag, speaker in _DREAM_SPEAKERS:
            # Only a tag at the very start marks the speaker; text such as
            # "PM:" in the middle of a turn must not be mistaken for one.
            if turn.startswith(tag):
                utterance = turn[len(tag):].strip()
                pieces.extend(speaker + sent + " " for sent in sent_tokenize(utterance))
                break
        else:
            # Turn without a known speaker tag: keep it verbatim, but still
            # separate it from the following turn.
            pieces.append(turn + " ")
    return "".join(pieces)

## DREAM Dataset

In [None]:
def dreamDataSet(tokenizer, task_type, max_seq_length):
    """Encode one DREAM split into fixed-length tensors for the model.

    For every example and every answer option, the article tokens form the
    first segment and ``question tokens + "[SEP]" + option tokens`` form the
    second segment; the pair is padded/truncated to ``max_seq_length``.

    Args:
        tokenizer: Tokenizer providing ``tokenize`` and ``encode_plus``.
        task_type: Split name passed to ``readDREAM``.
        max_seq_length: Length every encoded sequence is padded/truncated to.

    Returns:
        For ``task_type == "test"``: a tuple with one
        ``[input_ids, segment_ids, input_mask]`` list of tensors per example
        (no labels). Otherwise a ``TensorDataset`` of
        ``(input_ids, segment_ids, input_mask, labels)``.
    """
    examples = readDREAM(task_type)

    features = []
    for example in examples:
        # Article and question are shared by all options of the example.
        article_tokens = tokenizer.tokenize(example.article)
        question_tokens = tokenizer.tokenize(example.start_ending)

        choices_features = []
        for ending in example.endings:
            ending_tokens = question_tokens + ["[SEP]"] + tokenizer.tokenize(ending)

            encode = tokenizer.encode_plus(list(article_tokens),
                                           ending_tokens,
                                           add_special_tokens=True,
                                           padding='max_length',
                                           truncation=True,
                                           max_length=max_seq_length)

            input_ids = encode["input_ids"]
            segments_ids = encode["token_type_ids"]
            input_mask = encode["attention_mask"]

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segments_ids) == max_seq_length

            # The first slot (decoded text) is ignored by dreamFeatures, so the
            # ids are not decoded back to a string for every choice.
            choices_features.append((None, input_ids, segments_ids, input_mask))

        features.append(dreamFeatures(
            example_id=example.test_id,
            choices_features=choices_features,
            label=example.label
        ))

    def _stack(key):
        # One row per example, one sub-row per answer option.
        return torch.tensor(
            [[choice[key] for choice in feature.choices_features] for feature in features],
            dtype=torch.long,
        )

    tensor_input_ids = _stack("input_ids")
    tensor_segment_ids = _stack("segment_ids")
    tensor_input_mask = _stack("input_mask")

    if task_type == "test":
        # Test items carry no label tensor.
        return tuple(
            [ids, segments, mask]
            for ids, segments, mask in zip(tensor_input_ids, tensor_segment_ids, tensor_input_mask)
        )

    tensor_labels = torch.tensor([f.label for f in features], dtype=torch.long)
    return TensorDataset(tensor_input_ids, tensor_segment_ids, tensor_input_mask, tensor_labels)

# GPU Configuration

In [None]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Training & Prediction

## Prediction

In [None]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run the model over ``dataloader`` and return the predicted choice indices.

    The model is switched to evaluation mode for the duration of the call
    (so dropout cannot perturb the predictions) and restored to its previous
    mode afterwards. Batches are moved to the device the model's parameters
    live on.

    Args:
        model: Module called as ``model(input_ids=..., token_type_ids=...,
            attention_mask=...)`` whose first output holds the logits.
        dataloader: Iterable of batches. The first three items of a batch are
            input ids, segment ids and attention mask; when ``compute_acc`` is
            true the fourth item holds the labels.
        compute_acc: Also compute the accuracy against the batch labels.

    Returns:
        A 1-D tensor with the argmax over dimension 1 of the logits for every
        example (empty when the dataloader yields nothing). With
        ``compute_acc=True`` a tuple ``(predictions, accuracy)``; the accuracy
        is ``0.0`` when no example was seen.
    """
    model_device = next(model.parameters()).device
    was_training = model.training
    model.eval()

    batch_predictions = []
    correct = 0
    total = 0

    try:
        with torch.no_grad():
            for data in dataloader:
                data = [t.to(model_device) for t in data if t is not None]

                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                # Without labels the first output is the logits tensor.
                logits = outputs[0]
                pred = torch.argmax(logits, dim=1)

                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    if batch_predictions:
        predictions = torch.cat(batch_predictions)
    else:
        predictions = torch.empty(0, dtype=torch.long, device=model_device)

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions

## Run Dream

In [None]:
# Training hyper-parameters: batch size, padded sequence length and learning rate.
BATCH = 32
MAX_SEQ_LEN = 128
LR = 2e-5

# Load the tokenizer and the model, then move the model to the selected device.
tokenizer = tokenization()
model = dream_model()
model = model.to(device)

# Encode the training split and draw its batches in random order.
train_data = dreamDataSet(tokenizer, "train", MAX_SEQ_LEN)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH)
  
# Parameters whose name contains one of these substrings get no weight decay;
# every other parameter is decayed with 0.01.
# NOTE(review): the match is by substring of the parameter name - confirm that
# every layer-norm weight of this model is actually named '...LayerNorm.weight'.
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# NOTE(review): AdamW is taken from transformers here; confirm the installed
# transformers version still provides it.
adam_optimizer = AdamW(optimizer_grouped_parameters, lr=LR)
# Wrap model and optimizer with apex mixed precision at optimization level O1.
model, adam_optimizer = amp.initialize(model, adam_optimizer, opt_level="O1")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1312669.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=47376696.0, style=ProgressStyle(descrip…




Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMultipleChoice: ['predictions.bias', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.decoder.bias']
- This IS expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [None]:
EPOCHS = 4

# The dev split is the same for every epoch, so read and encode it only once.
dev_data = dreamDataSet(tokenizer=tokenizer, task_type="dev", max_seq_length=MAX_SEQ_LEN)
dev_dataloader = DataLoader(dev_data, batch_size=BATCH)

print("Start Training...")
for epoch in range(EPOCHS):
    epoch_start = time.time()

    # ---- Training pass ----
    model.train()
    train_losses = []
    for data in train_dataloader:
        tokens_tensors, segments_tensors, masks_tensors, labels_tensor = [t.to(device) for t in data]

        # Set the gradient to 0
        adam_optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels_tensor)

        # With labels supplied, the first output is the loss.
        loss = outputs[0]
        train_losses.append(loss.item())

        # Backpropagate through apex's loss scaler (mixed precision).
        with amp.scale_loss(loss, adam_optimizer) as scaled_loss:
            scaled_loss.backward()
        adam_optimizer.step()

    # Mean batch loss, so it is on the same scale as the dev loss below.
    train_loss = float(np.mean(train_losses)) if train_losses else float("nan")

    # ---- Evaluation ----
    # Switch to eval mode before measuring anything, including train accuracy,
    # so dropout does not affect the reported numbers.
    model.eval()
    _, train_accuracy = get_predictions(model, train_dataloader, compute_acc=True)

    # One pass over the dev set yields both the loss and the accuracy.
    dev_losses = []
    dev_correct = 0
    dev_total = 0
    for data in dev_dataloader:
        tokens_tensors, segments_tensors, masks_tensors, labels_tensor = [t.to(device) for t in data]
        with torch.no_grad():
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors,
                            labels=labels_tensor)
        loss = outputs[0]
        logits = outputs[1]
        dev_losses.append(loss.item())

        preds = torch.argmax(logits, dim=1).flatten()
        dev_correct += (preds == labels_tensor).sum().item()
        dev_total += labels_tensor.size(0)

    dev_loss = float(np.mean(dev_losses)) if dev_losses else float("nan")
    dev_accuracy = dev_correct / dev_total if dev_total else 0.0

    epoch_end = time.time()
    print("Training Epoch: {}, TRAIN-Loss: {}, TRAIN-Accuracy: {}, DEV-Loss: {}, DEV-Accuracy: {},Epcoh Time: {}".
          format(epoch+1, train_loss, train_accuracy, dev_loss, dev_accuracy, epoch_end-epoch_start))

Start Training...
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 16384.0


KeyboardInterrupt: ignored

## Test Prediction

In [None]:
# Sequence length and batch size used for the test split.
TEST_MAX_SEQ_LEN = 128
TEST_BATCH = 8

# Put the model in evaluation mode and reload the tokenizer.
model.eval()
tokenizer = tokenization()

# For the "test" task dreamDataSet returns items without labels.
test_data = dreamDataSet(tokenizer=tokenizer, task_type="test", max_seq_length=TEST_MAX_SEQ_LEN)

# No sampler is given, so batches are built in dataset order.
test_dataloader = DataLoader(test_data, batch_size=TEST_BATCH)

# compute_acc is left at its default, so only the predictions are returned.
predictions = get_predictions(model, test_dataloader)

In [None]:
# Convert the predicted choice indices to a plain Python list and show them.
pred_labels = predictions.tolist()
print(pred_labels)

In [None]:
# Gold labels of the test split, in the order readDREAM returns the examples.
testExample = readDREAM("test")
real_lables = [example.label for example in testExample]

In [None]:
# Testing Result Evaluation: compare the gold test labels with the predicted
# choice indices and print the resulting accuracy score.
from sklearn.metrics import accuracy_score

print("Testing Accuracy: {}".format(accuracy_score(real_lables, pred_labels)))