# Modeling

Here we quickly mock up some models, which are then transferred into a Python file for a long-running job.

In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

import os
import subprocess
import json
import pickle
import argparse
import logging
from multiprocessing import Pool

import numpy as np
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split

from torch.nn.init import xavier_uniform_
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.optim.lr_scheduler as sched
import torch.utils.data as data

from tensorboardX import SummaryWriter
from tqdm import tqdm

from rouge import Rouge 

In [2]:
# All paths are relative to the current working directory.

# Story directories for the CNN and DailyMail corpora.
CNN_STORY_DIR = os.path.join('data', 'cnn', 'stories')
DM_STORY_DIR = os.path.join('data', 'dailymail', 'stories')

# 'stories-tokenized' counterparts of the two story directories above.
CNN_STORY_TOKENIZED = os.path.join('data', 'cnn', 'stories-tokenized')
DM_STORY_TOKENIZED = os.path.join('data', 'dailymail', 'stories-tokenized')

# NOTE(review): despite the *_JSON names both paths end in '.pk' -- presumably pickle files; confirm.
SRC_JSON = os.path.join('data', 'src.pk')
TGT_JSON = os.path.join('data', 'tgt.pk')

# PREDICTED_SUMS / GOLD_SUMS are not used in the visible cells -- presumably output
# locations for predicted and reference summaries; confirm against the evaluation code.
PREDICTED_SUMS = os.path.join('out', 'predicted')
GOLD_SUMS = os.path.join('out', 'gold')
# Pickle file that is loaded into `all_data`.
PROCESSED_DATA = os.path.join('data', 'data.pk')

In [4]:
# Load the preprocessed dataset; the visible code reads it as all_data['tiny']['X'] and
# all_data['tiny']['y'].
# SECURITY: pickle.load can execute arbitrary code -- only load files you created or trust.
with open(PROCESSED_DATA, 'rb') as f:
    all_data = pickle.load(f)

## Models

Here we define some PyTorch models:

1. Linear model - After BERT, a single linear layer transforms the last hidden layer's output at each timestep (token position) into a single logit.
2. Linear model with multi-head attention - After BERT, we calculate the attention distribution and reweight each hidden state before feeding it into a linear layer, as in the previous model.
3. Linear model with multi-layer attention - Same as the previous model, but attending to ALL hidden layers.
4. RNN decoder with attention - After BERT, we add another stacked bidirectional RNN layer with attention.

In [5]:
def get_available_devices():
    """Pick the main torch device and enumerate the CUDA devices torch can see.

    Returns:
        device (torch.device): The first CUDA device when CUDA is available,
            otherwise the CPU.
        gpu_ids (list): Indices of every CUDA device; empty when running on CPU.
    """
    # CPU-only machines: nothing to enumerate and no CUDA state to touch.
    if not torch.cuda.is_available():
        return torch.device('cpu'), []

    gpu_ids = list(range(torch.cuda.device_count()))
    device = torch.device('cuda:{}'.format(gpu_ids[0]))
    # Make the chosen GPU the current CUDA device for this process.
    torch.cuda.set_device(device)
    return device, gpu_ids

In [6]:
# Load the pretrained 'bert-base-uncased' model and its tokenizer at notebook level.
# NOTE(review): neither `bert` nor `tokenizer` is referenced in the visible cells
# (SummarizerLinear loads its own BertModel) -- confirm they are still needed.
bert = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [55]:
# Hyperparameters are declared through argparse so this cell can be moved into a
# standalone script. Every option carries an explicit `type`: without it argparse
# returns command-line values as strings (e.g. '-lr 0.01' -> '0.01'), which would
# break the optimizer, the DataLoader and the seeding calls. Defaults are unchanged.
parser = argparse.ArgumentParser()
parser.add_argument("-seed", type=int, default=1234,
                    help="Random seed.")
parser.add_argument("-load_path", type=str, default=None,
                    help="Checkpoint to resume from; omit to start from scratch.")
parser.add_argument("-split", type=str, default='tiny',
                    help="Name of the data split to use.")
parser.add_argument("-batch_size", type=int, default=1,
                    help="Examples per batch.")
# nargs='*' keeps the value a list of ints ('-gpu_ids 0 1' -> [0, 1]; bare '-gpu_ids' -> []).
parser.add_argument("-gpu_ids", type=int, nargs='*', default=[0],
                    help="GPU ids to use.")
parser.add_argument("-num_workers", type=int, default=1,
                    help="DataLoader worker processes.")
parser.add_argument("-lr", type=float, default=0.001,
                    help="Learning rate.")
parser.add_argument("-l2_wd", type=float, default=0,
                    help="L2 weight decay.")
parser.add_argument("-eval_steps", type=int, default=50000,
                    help="Number of examples between evaluations.")
parser.add_argument("-num_epochs", type=int, default=10,
                    help="Number of training epochs.")
parser.add_argument("-max_grad_norm", type=float, default=2,
                    help="Gradient-norm clipping threshold.")
# A notebook has no command line of its own, so parse an empty list to get the defaults.
args = parser.parse_args([])

In [56]:
class SummarizationDataset(data.Dataset):
    """Index-aligned pairing of inputs ``X`` and targets ``y`` for a torch DataLoader."""

    def __init__(self, X, y):
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        # The dataset size is taken from X alone.
        return len(self.X)

    def __getitem__(self, i):
        # One example: the i-th input together with the i-th target.
        example = self.X[i], self.y[i]
        return example

def collate_fn(examples):
    """Collate ``(X, y)`` examples into two right-padded 2-D tensors.

    collate function requires all examples to be non-padded

    Args:
        examples: Non-empty sequence of ``(X, y)`` pairs; each element is a 1-D
            sequence (list or tensor) of numbers.

    Returns:
        tuple: ``(X_batch, y_batch)``, two int64 tensors with one row per example.
            Each is as wide as the longest sequence in its own field; shorter
            rows are filled with 0 on the right.

    Raises:
        ValueError: If ``examples`` is empty.
    """
    def merge_1d(arrays, dtype=torch.int64, pad_value=0):
        lengths = [len(a) for a in arrays]
        # Start from a tensor filled with pad_value so the argument is actually honoured
        # (a zeros tensor would silently ignore any non-zero pad_value).
        padded = torch.full((len(arrays), max(lengths)), pad_value, dtype=dtype)
        for i, seq in enumerate(arrays):
            # as_tensor converts lists exactly like torch.tensor but passes existing
            # tensors through without the copy-construct warning; the assignment
            # casts to `dtype`.
            padded[i, :lengths[i]] = torch.as_tensor(seq)
        return padded

    if len(examples) == 0:
        raise ValueError('collate_fn received an empty batch')
    X, y = zip(*examples)
    return merge_1d(X), merge_1d(y)

In [77]:
# Input width of the linear head; must match the hidden size of the BERT checkpoint loaded below.
BERT_HIDDEN_SIZE = 768


class SummarizerLinear(nn.Module):
    """BERT encoder followed by one linear layer that emits a single logit per token."""

    def __init__(self):
        super(SummarizerLinear, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.linear = nn.Linear(BERT_HIDDEN_SIZE, 1)
        xavier_uniform_(self.linear.weight)

    def forward(self, X, attention_mask=None):
        """Compute one logit per input token.

        Args:
            X (torch.LongTensor): Token ids of shape (batch, seq_len).
            attention_mask (torch.LongTensor, optional): (batch, seq_len) mask with 1
                for tokens to attend to and 0 for padding. When omitted it is derived
                from ``X != 0``, i.e. it assumes id 0 is padding (the value
                ``collate_fn`` pads with).

        Returns:
            torch.Tensor: Logits of shape (batch, seq_len).
        """
        if attention_mask is None:
            attention_mask = (X != 0).long()
        # With output_all_encoded_layers=False the first output is the last layer's
        # hidden states for the whole batch: (batch, seq_len, hidden).
        encoded_layers, _ = self.bert(X,
                                      attention_mask=attention_mask,
                                      output_all_encoded_layers=False)
        # Apply the head to every example. Indexing `encoded_layers[0]` would keep only
        # the first example and produce (1, seq_len) logits regardless of batch size.
        logits = self.linear(encoded_layers).squeeze(-1)
        return logits

In [82]:
log = logging.getLogger('main')
log.setLevel(logging.DEBUG)
# With no handler anywhere in the hierarchy, INFO records fall through to logging's
# last-resort handler, which only emits WARNING and above -- so none of the progress
# messages below would be shown. hasHandlers() also keeps re-running this cell (or a
# configured root logger) from producing duplicate output.
if not log.hasHandlers():
    log.addHandler(logging.StreamHandler())


def train(args):
    """Train a `SummarizerLinear` model with per-token binary cross-entropy.

    Reads the examples from the module-level `all_data` dict and runs on the CPU.

    Args:
        args: Namespace providing `seed`, `load_path`, `batch_size`, `num_workers`,
            `lr`, `l2_wd`, `num_epochs` and `max_grad_norm`. `split` is optional and
            defaults to 'tiny'. `args.gpu_ids` is overwritten with `[]`.

    Returns:
        None. The trained model is not returned or saved.
    """
    log.info("")
    # Training is pinned to the CPU for now. To use a GPU instead:
    #     device, args.gpu_ids = get_available_devices()
    device, args.gpu_ids = torch.device('cpu'), []

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    log.info('Building model...')
    model = SummarizerLinear()
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        # NOTE(review): `util` is not imported in any visible cell, so this branch raises
        # NameError unless the module is imported elsewhere -- confirm before resuming.
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)

    model.train()
    optimizer = optim.Adam(model.parameters(), args.lr,
                           weight_decay=args.l2_wd)

    log.info('Building dataset...')
    # Honour the configured split instead of a hard-coded name; the fallback keeps
    # namespaces without a `split` attribute working as before.
    split = all_data[getattr(args, 'split', 'tiny')]
    train_dataset = SummarizationDataset(split['X'], split['y'])
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   num_workers=args.num_workers,
                                   shuffle=True,
                                   collate_fn=collate_fn)

    ## Train!
    log.info('Training...')
    # `step` counts examples seen, so this recovers the epoch of a resumed run.
    epoch = step // len(train_dataset)
    # `<` rather than `!=`: a resumed step that is already past num_epochs must not
    # turn this into an endless loop.
    while epoch < args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for X, y in train_loader:
                # The final batch of an epoch can be smaller than args.batch_size.
                batch_size = X.size(0)
                X = X.to(device)
                y = y.float().to(device)
                optimizer.zero_grad()

                logits = model(X)
                # Score real tokens only. collate_fn pads X and y with 0, so positions
                # where X == 0 are treated as padding and excluded from the loss. For an
                # unpadded batch this equals the mean over all positions.
                keep = X != 0
                loss = F.binary_cross_entropy_with_logits(logits[keep], y[keep])
                loss_val = loss.item()

                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)

                # TODO: periodic evaluation / checkpointing (every args.eval_steps
                # examples), LR scheduling and TensorBoard logging are not implemented.
                # The earlier commented-out draft relied on names that are not defined
                # in the visible cells (evaluate, saver, ema, tbx, scheduler,
                # args.dev_eval_file, ...), and evaluation needs a held-out dev split
                # rather than the training data.


In [83]:
# Run training with the notebook defaults. The recorded progress bars below show about
# 1h45m for the first 5000-example epoch; the run was interrupted manually during epoch 2.
train(args)

100%|██████████| 5000/5000 [1:45:39<00:00,  1.29s/it, NLL=0.283, epoch=1]
 75%|███████▍  | 3733/5000 [1:26:28<30:39,  1.45s/it, NLL=0.303, epoch=2]  


KeyboardInterrupt: 

In [16]:
# NOTE(review): scratch cell. `X` is not defined at notebook level in any visible cell,
# so this fails on a fresh kernel (Restart & Run All) -- define X explicitly or remove.
len(X[3])

511

In [18]:
# Scratch check: view(-1) on a 1-D tensor yields the same values (see the output below).
torch.tensor([1, 2, 3]).view(-1)

tensor([1, 2, 3])