Let's investigate if it's effective to augment training data in the task of [quora question pairs](https://www.kaggle.com/c/quora-question-pairs).

In [35]:
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data
import torch.optim as optim
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import random
import copy

Let's check what the training data looks like.

In [3]:
train_data = "quora/train.tsv"
dev_data = "quora/dev.tsv"
print("\n".join(open(train_data, 'r').read().splitlines()[:1]))

1	What is your review of Hidden Figures -LRB- 2016 movie -RRB- ?	What are your impressions of Hidden Figures -LRB- 2017 movie -RRB- ?	11877


## Prepare datasets

### train

We take only 120k samples from training data. Among them, samples for label 0 and 1 account for 40k and 80k, respectively.

In [4]:
train0, train1 = [], []
for line in open(train_data, 'r'):
    label, sent1, sent2, _ = line.strip().split("\t")
    if label=="0":
        train0.append((sent1, sent2))
    else:
        train1.append((sent1, sent2))

random.shuffle(train0)
random.shuffle(train1)               

In [5]:
# 0:40K, 1: 80K
train0 = train0[:40000]
train1 = train1[:80000]

In [6]:
# all sents
train01 = []
for t in (train0, train1):
    for sent1, sent2 in t:
        train01.append(sent1)
        train01.append(sent2)

### dev

In [7]:
dev0, dev1 = [], []
for line in open(dev_data, 'r'):
    label, sent1, sent2, _ = line.strip().split("\t")
    if label=="0":
        dev0.append((sent1, sent2))
    else:
        dev1.append((sent1, sent2))

## Settings

In [8]:
VOCAB_SIZE=8000
MAXLEN=100 # maximum sequence length
BATCH_SIZE=256

## BPE tokenization with sentencepiece

In [9]:
import os
import sentencepiece as spm

In [10]:
dir = 'quora/bpe'
os.makedirs(dir, exist_ok=True)
with open(f"{dir}/bpe.train", 'w') as fout:
    for sent1, sent2 in train0:
        fout.write(sent1 + "\n" + sent2 + "\n")
    for sent1, sent2 in train1:
        fout.write(sent1 + "\n" + sent2 + "\n")

In [11]:
# train bpe model
def train_bpe(fpath, vocab_size, dir):
    train = f'--input={fpath} \
              --normalization_rule_name=identity \
              --model_prefix={dir}/bpe \
              --character_coverage=1.0 \
              --vocab_size={vocab_size} \
              --model_type=bpe'
    spm.SentencePieceTrainer.Train(train)

In [12]:
train_bpe('quora/bpe/bpe.train', VOCAB_SIZE, 'quora/bpe')

## Set Vocabulary

In [13]:
vocab = ["<pad>", "<unk>", "<bos>", "<eos>", "<sep>"]
for line in open('quora/bpe/bpe.vocab', 'r').read().splitlines()[3:]:
    token = line.split("\t")[0]
    vocab.append(token)
vocab[:10]

['<pad>', '<unk>', '<bos>', '<eos>', '<sep>', '▁t', '▁?', '▁a', 'in', 'at']

In [14]:
token2idx = {token:idx for idx, token in enumerate(vocab)}
idx2token = {idx:token for idx, token in enumerate(vocab)}

In [15]:
VOCAB_SIZE = len(token2idx)
VOCAB_SIZE

8002

## Encode

In [16]:
sp = spm.SentencePieceProcessor()
sp.Load('quora/bpe/bpe.model')

True

In [18]:
def encode_sents(sent1, sent2):
    tokens1 = [token2idx.get(token, 1) for token in sp.EncodeAsPieces(sent1)]
    tokens2 = [token2idx.get(token, 1) for token in sp.EncodeAsPieces(sent2)]
    tokens = [2] + tokens1 + [4] + tokens2 + [3] + [0]*MAXLEN # <bos> ... <sep> ... <eos> <pad> <pad> ...
    tokens = tokens[:MAXLEN]
    return tokens    

### \#1. baseline

In [36]:
_X_train0, _X_train1 = [], [] # list of lists
for sent1, sent2 in train0:
    tokens = encode_sents(sent1, sent2)
    _X_train0.append(tokens)
for sent1, sent2 in train1:
    tokens = encode_sents(sent1, sent2)
    _X_train1.append(tokens)
_X_train = _X_train0 + _X_train1
_Y_train = [0]*len(_X_train0) + [1]*len(_X_train1)

### \#2. oversample (w/out aug.)

In [40]:
_X_train0_over = _X_train0*2 # list of lists
_X_train_over = _X_train0_over + _X_train1
_Y_train_over = [0]*len(_X_train0_over) + [1]*len(_X_train1)

### \#3. oversample (w/ aug.)

In [37]:
_X_train0_aug = copy.copy(_X_train0) # list of lists
for sent1, sent2 in train0*10:
    tokens = encode_sents(sent1, random.choice(train01))
    _X_train0_aug.append(tokens)
    tokens = encode_sents(random.choice(train01), sent2)
    _X_train0_aug.append(tokens)
    if len(_X_train0_aug) > len(_X_train1): break

_X_train_aug = _X_train0_aug + _X_train1
_Y_train_aug = [0]*len(_X_train0_aug) + [1]*len(_X_train1)

### dev

In [22]:
_X_dev0, _X_dev1 = [], [] # list of lists
for sent1, sent2 in dev0:
    tokens = encode_sents(sent1, sent2)
    _X_dev0.append(tokens)
for sent1, sent2 in dev1:
    tokens = encode_sents(sent1, sent2)
    _X_dev1.append(tokens)

_X_dev = _X_dev0 + _X_dev1
_Y_dev = [0]*len(_X_dev0) + [1]*len(_X_dev1)

### Convert to tensors

In [41]:
X_train = torch.LongTensor(_X_train)
Y_train = torch.LongTensor(_Y_train)

X_train_over = torch.LongTensor(_X_train_over)
Y_train_over = torch.LongTensor(_Y_train_over)

X_train_aug = torch.LongTensor(_X_train_aug)
Y_train_aug = torch.LongTensor(_Y_train_aug)

X_dev = torch.LongTensor(_X_dev)
Y_dev = torch.LongTensor(_Y_dev)

## Data Loader

### \#1. baseline

In [42]:
train_dataset = Data.TensorDataset(X_train, Y_train)
train_loader = Data.DataLoader(dataset=train_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=4)
print(len(train_loader))

469


### \#2. oversample

In [43]:
train_over_dataset = Data.TensorDataset(X_train_over, Y_train_over)
train_over_loader = Data.DataLoader(dataset=train_over_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=4)
print(len(train_over_loader))

625


### \#3. aug.

In [44]:
train_aug_dataset = Data.TensorDataset(X_train_aug, Y_train_aug)
train_aug_loader = Data.DataLoader(dataset=train_aug_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=4)
print(len(train_aug_loader))

626


### dev

In [45]:
dev_dataset = Data.TensorDataset(X_dev, Y_dev)
dev_loader = Data.DataLoader(dataset=dev_dataset,
                              batch_size=BATCH_SIZE,
                              shuffle=False,
                              num_workers=4)
print(len(dev_loader))

40


## Model

In [46]:
class Net(nn.Module):
    def __init__(self, embedding_dim=256, hidden_dim=256, vocab_size=VOCAB_SIZE):
        '''
        Fix the model architecture and its parameters for this purpose
        '''
        super(Net, self).__init__()
        
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dense = nn.Linear(hidden_dim, 2)

    def forward(self, x):
        x = x.to('cuda')
        x = self.embed(x)  
        
        outputs, (h, c) = self.lstm(x) # (maxlen, batch, hidden_dim)
        last_hidden_states = h[-1]
        
        logits = self.dense(last_hidden_states)
        return logits


## Train & test functions

In [47]:
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        logits = model(inputs)
        targets = targets.to('cuda')
        loss = criterion(logits, targets)
        
        loss.backward()
        optimizer.step()
    return loss

In [51]:
def eval(model, dev_loader):
    model.eval()
    y_pred, y_true = [], []
    with torch.no_grad():
        for inputs, targets in dev_loader:
            logits = model(inputs)
            logits, preds = logits.max(1, keepdim=False)  # get the index of the max log-probability. (batch,)
            y_pred.extend(preds.tolist())
            y_true.extend(targets.tolist())           
    
    f1score = f1_score(y_true, y_pred)

    print('eval set: F1_score: %0.3f\n' %(
        f1score))

In [52]:
def train_eval(type):
    model = Net().cuda()
    optimizer = optim.Adam(model.parameters(), lr=.001)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(1, 10+1):
        if type=="base":
            loader = train_loader
        elif type=="over":
            loader = train_over_loader
        else:
            loader = train_aug_loader
        loss = train(model, loader, optimizer, criterion, epoch)
        print("epoch=%d, loss=%.3f" % (epoch, loss))
        eval(model, dev_loader)

        torch.save(model.state_dict(), f"quora/ckpt{epoch}.pt")


## Experiments

### \#1. baseline

In [53]:
train_eval("base")

epoch=1, loss=0.540
eval set: F1_score: 0.739

epoch=2, loss=0.453
eval set: F1_score: 0.754

epoch=3, loss=0.435
eval set: F1_score: 0.760

epoch=4, loss=0.393
eval set: F1_score: 0.767

epoch=5, loss=0.290
eval set: F1_score: 0.764

epoch=6, loss=0.312
eval set: F1_score: 0.769

epoch=7, loss=0.121
eval set: F1_score: 0.766

epoch=8, loss=0.110
eval set: F1_score: 0.767

epoch=9, loss=0.108
eval set: F1_score: 0.761

epoch=10, loss=0.098
eval set: F1_score: 0.756



### \#2. oversample

In [54]:
train_eval("over")

epoch=1, loss=0.520
eval set: F1_score: 0.732

epoch=2, loss=0.450
eval set: F1_score: 0.755

epoch=3, loss=0.422
eval set: F1_score: 0.750

epoch=4, loss=0.261
eval set: F1_score: 0.764

epoch=5, loss=0.164
eval set: F1_score: 0.768

epoch=6, loss=0.088
eval set: F1_score: 0.762

epoch=7, loss=0.074
eval set: F1_score: 0.760

epoch=8, loss=0.060
eval set: F1_score: 0.764

epoch=9, loss=0.041
eval set: F1_score: 0.765

epoch=10, loss=0.042
eval set: F1_score: 0.761



### \#3. aug.

In [55]:
train_eval("aug")

epoch=1, loss=0.662
eval set: F1_score: 0.739

epoch=2, loss=0.603
eval set: F1_score: 0.761

epoch=3, loss=0.034
eval set: F1_score: 0.780

epoch=4, loss=0.289
eval set: F1_score: 0.780

epoch=5, loss=0.001
eval set: F1_score: 0.784

epoch=6, loss=0.265
eval set: F1_score: 0.782

epoch=7, loss=0.004
eval set: F1_score: 0.787

epoch=8, loss=0.000
eval set: F1_score: 0.775

epoch=9, loss=0.000
eval set: F1_score: 0.777

epoch=10, loss=0.000
eval set: F1_score: 0.781

