In [1]:
!pip install transformers==4.2.0

Collecting transformers==4.2.0
[?25l  Downloading https://files.pythonhosted.org/packages/84/ea/634945faff8ad6984b98f7f3d98f6d83083a18af44e349744d90bde81f80/transformers-4.2.0-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 6.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/08/cd/342e584ee544d044fb573ae697404ce22ede086c9e87ce5960772084cad0/sacremoses-0.0.44.tar.gz (862kB)
[K     |████████████████████████████████| 870kB 22.7MB/s 
[?25hCollecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/fb/36/59e4a62254c5fcb43894c6b0e9403ec6f4238cc2422a003ed2e6279a1784/tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 27.6MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.44-cp37-none-any.whl size=886084 sha256=c5c471

In [2]:
import sys
sys.path.append('/content/drive/MyDrive/MAIS')

import os
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup 
from utils import get_tokenizer, set_seed
from adataset import GPT2Dataset
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F


import json
import argparse
import time
from tqdm import tqdm_notebook, tnrange


In [10]:
parser = argparse.ArgumentParser()
parser.add_argument("--lr",default=5e-5, type=float, help="learning rate")
parser.add_argument("--seed",default=42, type=int,  help="seed to replicate results")
parser.add_argument("--n_gpu",default=1, type=int,  help="no of gpu available")
parser.add_argument("--gradient_accumulation_steps",default=2, type=int, help="gradient_accumulation_steps")
parser.add_argument("--batch_size",default=1, type=int,  help="batch_size")
parser.add_argument("--num_workers",default=4, type=int,  help="num of cpus available")
parser.add_argument("--device",default=torch.device('cpu'), type=torch.device, help="torch.device object")
parser.add_argument("--num_train_epochs",default=1, type=int,  help="no of epochs of training")
parser.add_argument("--output_dir",default='./output', type=str,  help="path to save evaluation results")
parser.add_argument("--model_dir",default='./weights', type=str,  help="path to save trained model")
parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
parser.add_argument("--data_dir",default='./data', type=str, help="location of json dataset.")
# parser.add_argument("--ids_file",default='./data', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args(["--device", "cpu", "--data_dir", "/content/drive/MyDrive/MAIS/train-balanced-sarcasm.csv", "--model_dir", "/content/drive/MyDrive/MAIS/model"])
print(args)

Namespace(batch_size=1, data_dir='/content/drive/MyDrive/MAIS/train-balanced-sarcasm.csv', device=device(type='cpu'), gradient_accumulation_steps=2, lr=5e-05, max_grad_norm=1.0, model_dir='/content/drive/MyDrive/MAIS/model', n_gpu=1, num_train_epochs=1, num_workers=4, output_dir='./output', seed=42)


In [5]:
def train(args, model, tokenizer, train_dataset, ignore_index):
    writer = SummaryWriter('./logs')
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset,
                          sampler=train_sampler,
                          batch_size=args.batch_size,
                          num_workers=args.num_workers)
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation
    optimizer = AdamW(model.parameters(),lr=args.lr)
    scheduler = get_linear_schedule_with_warmup(optimizer,100,80000)

    global_step = 0 # for accumulated gradients
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = tnrange(int(args.num_train_epochs), desc='epochs')
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm_notebook(train_dl, desc='training') # just uses the DataLoader and shows a progress bar
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = batch['context'], batch['context'] # it's the same but we'll only calculate loss over the stuff after <|sep|> token
            # use GPU (!!)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            logits = model(inputs)[0] # get logits of the shape (batch_size, sequence_length, config.vocab_size)
            
            # now shift the logits and labels accordingly so as to calculate the loss as required
            loc_sep = batch['loc_sep'] # location of the <|sep|> token
            shifted_logits = logits[:, loc_sep:-1, :].contiguous()
            shifted_labels = labels[:, loc_sep+1:].contiguous() # make labels one ahead for inference

            # accumulate gradients
            loss = loss_fct(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
            loss /= args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()

            # update global loss and run accumulated gradient descent when number of steps reaches the gradient_accumulation_steps
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                writer.add_scalar('loss', (tr_loss - logging_loss)/args.gradient_accumulation_steps, global_step)
                logging_loss = tr_loss
                print("loss:", loss.item(), end='\n\n')

In [6]:
train_data = GPT2Dataset(args.data_dir)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




100%|██████████| 14918/14918 [00:11<00:00, 1328.67it/s]


In [8]:
len(train_data)

14918

In [None]:
tokenizer = get_tokenizer()
ignore_idx = tokenizer.pad_token_id
# model = GPT2LMHeadModel.from_pretrained('gpt2')
# model.resize_token_embeddings(len(tokenizer))

model = torch.load("/content/drive/MyDrive/MAIS/model_1.pt", map_location=args.device)
model.to(args.device)

In [13]:
# training time (!!)
set_seed(args.seed)
start = time.time()
print("start time: ", start)
train(args, model, tokenizer, train_data, ignore_idx)
print('total time: ', (time.time()-start)/60, " minutes", end='\n\n')

# print('Saving trained model...')
# model_file = os.path.join(args.model_dir, 'model_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(len(train_data),args.num_train_epochs))
# config_file = os.path.join(args.model_dir, 'config_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(len(train_data),args.num_train_epochs))
# torch.save(model.state_dict(), model_file)
# model.config.to_json_file(config_file)

start time:  1616834604.0711982


  cpuset_checked))
  from ipykernel import kernelapp as app


HBox(children=(FloatProgress(value=0.0, description='epochs', max=1.0, style=ProgressStyle(description_width='…

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, description='training', max=14918.0, style=ProgressStyle(description_w…



loss: 1.678768277168274



KeyboardInterrupt: ignored

In [None]:
torch.save(model, '/content/drive/MyDrive/MAIS/model_2.pt')

In [25]:
tokenizer.save_vocabulary('/content/drive/MyDrive/MAIS/')

('/content/drive/MyDrive/MAIS/vocab.json',
 '/content/drive/MyDrive/MAIS/merges.txt')