Let's see whether augmenting the training data is effective on the [Quora Question Pairs](https://www.kaggle.com/c/quora-question-pairs) task.

### Download and extract QQP dataset.

In [17]:
import os

In [25]:
# Download and extract the QQP archive.
# The URL has to be quoted: it contains '&', which an unquoted shell command
# interprets as "run in background", cutting the URL off before the token.
# -O pins the output file name to the one unzip is given below.
QQP_URL = ('https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/'
           'data%2FQQP.zip?alt=media&token=700c6acf-160d-4d89-81d1-de4191d02cb5')
if os.system("wget -O QQP.zip '" + QQP_URL + "'") != 0:
    raise RuntimeError("Downloading QQP.zip failed")
# -o overwrites existing files without prompting, so the cell can be re-run.
if os.system('unzip -o QQP.zip') != 0:
    raise RuntimeError("Extracting QQP.zip failed")

In [26]:
!ls -ltr QQP

total 60949
-rw-r--r-- 1 root root  5815716 May  2  2018 dev.tsv
-rw-r--r-- 1 root root 52360463 May  2  2018 train.tsv
drwxr-xr-x 1 root root        0 Aug  5 10:31 original
-rw-r--r-- 1 root root  4259840 Aug  5 10:32 test.tsv


Let's check what the training data looks like.

In [3]:
train_data = "QQP/train.tsv"
dev_data = "QQP/dev.tsv"

# Show the header and the first record. Only two lines are read (instead of
# loading the whole file), and the handle is closed when the block exits.
with open(train_data, 'r') as fin:
    print("\n".join(fin.readline().rstrip("\n") for _ in range(2)))

id	qid1	qid2	question1	question2	is_duplicate
133273	213221	213222	How is the life of a math student? Could you describe your own experiences?	Which level of prepration is enough for the exam jlpt5?	0


In [4]:
import numpy as np
import torch
from torch import nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.utils.data as Data
import torch.optim as optim
from torch.nn.utils.rnn import pack_padded_sequence
from sklearn.metrics import f1_score, accuracy_score
import random
import copy
from collections import Counter
import re

## Prepare datasets

### train

In [5]:
def normalize(sent):
    '''Lowercase `sent` and delete every character other than a-z, 0-9, apostrophe and space.'''
    return re.sub("[^a-z0-9' ]", "", sent.lower())

In [6]:
def split_data(fin, maxlen):
    '''Read a QQP-style TSV file and split its sentence pairs by label.

    Args:
      fin: Path to a tab-separated file. The first line is treated as a header
        and skipped; rows that do not have exactly six columns are ignored.
        The last three columns are read as sentence 1, sentence 2 and label.
      maxlen: Length budget counted in whitespace-separated tokens. A pair is
        kept only if each normalized sentence has fewer than maxlen/2 tokens.

    Returns:
      (data0, data1): two lists of (sent1, sent2) tuples of normalized
      sentences. data0 holds the kept pairs whose label is "0"; data1 holds
      every other kept pair.
    '''
    # Read inside a context manager so the file handle is closed right away.
    with open(fin, 'r') as f:
        lines = f.read().strip().splitlines()[1:]  # [1:] drops the header row

    data0, data1 = [], []
    for line in lines:
        cols = line.split("\t")
        if len(cols) != 6:
            continue
        _, _, _, sent1, sent2, label = cols
        sent1 = normalize(sent1)
        sent2 = normalize(sent2)
        if len(sent1.split()) < maxlen/2 and len(sent2.split()) < maxlen/2:
            pair = (sent1, sent2)
            if label == "0":
                data0.append(pair)
            else:
                data1.append(pair)
    return data0, data1

In [7]:
MAXLEN = 200 # Length budget in tokens (not characters): split_data keeps a pair only if each sentence has fewer than MAXLEN/2 words, and encode_sents pads/truncates every pair to MAXLEN ids.

In [8]:
# Split the training pairs by label (train0: label "0", train1: all other kept pairs) and show both sizes.
train0, train1 = split_data(train_data, MAXLEN)         
print(len(train0), len(train1))

229442 134378


In [9]:
# Flat list of every training sentence: both sides of each pair, label-0 pairs first.
train01 = [sent for pairs in (train0, train1) for pair in pairs for sent in pair]

### dev

In [10]:
# Same label split for the dev set, using the same length filter as training.
dev0, dev1 = split_data(dev_data, MAXLEN)    
print(len(dev0), len(dev1))

25544 14885


## Vocabulary

In [11]:
# Count how often each whitespace-separated token occurs in the training
# sentences; the cell displays the number of distinct tokens.
words = []
for sent in train01:
    words.extend(sent.split())
word2cnt = Counter(words)
len(word2cnt)

107030

In [12]:
MIN_CNT = 5 # Minimum number of occurrences for a word to enter the vocabulary.
# Ids 0, 1 and 2 are reserved for the special tokens.
vocab = ["<pad>", "<unk>", "<sep>"]
# most_common yields words by descending count, so filtering on the threshold
# keeps exactly the leading run that an early-exit loop would keep.
vocab += [word for word, cnt in word2cnt.most_common(len(word2cnt)) if cnt >= MIN_CNT]

In [13]:
# Vocabulary size including the three special tokens; used as the default
# embedding-table size of the model.
VOCAB_SIZE = len(vocab)
VOCAB_SIZE

30429

In [14]:
# Two-way lookup between vocabulary entries and their ids (positions in `vocab`).
idx2token = dict(enumerate(vocab))
token2idx = {token: idx for idx, token in idx2token.items()}

## Encode

In [15]:
def encode_sents(sent1, sent2):
    '''Encode a sentence pair as a single list of exactly MAXLEN token ids.

    Layout: ids of sent1, <sep> (2), ids of sent2, then <pad> (0) up to MAXLEN.
    Words missing from token2idx map to <unk> (1); ids beyond MAXLEN are cut off.
    '''
    UNK, SEP, PAD = 1, 2, 0
    ids = [token2idx.get(word, UNK) for word in sent1.split()]
    ids.append(SEP)
    ids.extend(token2idx.get(word, UNK) for word in sent2.split())
    ids = ids[:MAXLEN]
    ids.extend([PAD] * (MAXLEN - len(ids)))
    return ids

### \#1. baseline

In [16]:
# Encode every training pair, one list of id lists per label.
_X_train0 = [encode_sents(sent1, sent2) for sent1, sent2 in train0] # list of lists
_X_train1 = [encode_sents(sent1, sent2) for sent1, sent2 in train1]

# Label-0 examples first, then label-1, with targets in the matching order.
_X_train = _X_train0 + _X_train1
_Y_train = [0]*len(_X_train0) + [1]*len(_X_train1)

In [17]:
# Sanity check: number of label-0 examples, label-1 examples, and their total.
print(len(_X_train0), len(_X_train1), len(_X_train0)+len(_X_train1))

229442 134378 363820


In [18]:
NUM_EPOCHS = 10
# Emulate NUM_EPOCHS passes over the data by repeating it. The lists are
# rebuilt from the per-label lists rather than multiplied in place with `*=`,
# so re-running this cell does not multiply the data a second time.
_X_train = (_X_train0 + _X_train1) * NUM_EPOCHS
_Y_train = ([0]*len(_X_train0) + [1]*len(_X_train1)) * NUM_EPOCHS

In [19]:
# Inputs and targets must have the same length after replication.
print(len(_X_train), len(_Y_train))

3638200 3638200


### \#2. label0 aug.

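The non-duplicate sentence pairs (`train0`) are augmented: for each extra pass over the data, one sentence of every original pair is matched with a sentence drawn at random from the training set, and the new pair is labeled as non-duplicate.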

In [20]:
# Augment label 0: keep the original non-duplicate pairs once and, for each of
# the remaining NUM_EPOCHS - 1 passes, pair one side of every original pair
# with a random training sentence.
# NOTE: the random partner is not checked against the original sentence, so a
# generated "non-duplicate" pair can occasionally be a true duplicate.
random.seed(0)  # make the augmented set reproducible across runs
_X_train0_aug = copy.copy(_X_train0)

for sent1, sent2 in train0*(NUM_EPOCHS - 1):
    sent = sent1 if random.random() < 0.5 else sent2
    tokens = encode_sents(sent, random.choice(train01))
    _X_train0_aug.append(tokens)

# Repeat the label-1 examples into a NEW list. Multiplying _X_train1 in place
# would change the shared list, and re-running the cell would multiply it again.
_X_train1_rep = _X_train1 * NUM_EPOCHS
_X_train_aug = _X_train0_aug + _X_train1_rep
_Y_train_aug = [0]*len(_X_train0_aug) + [1]*len(_X_train1_rep)

In [21]:
# The augmented set should have as many inputs as targets.
print(len(_X_train_aug), len(_Y_train_aug))

3638200 3638200


### dev

In [22]:
# Encode the dev pairs per label (list of id lists each). The unused
# `maxlen = 0` assignment was dropped; nothing read it.
_X_dev0 = [encode_sents(sent1, sent2) for sent1, sent2 in dev0]
_X_dev1 = [encode_sents(sent1, sent2) for sent1, sent2 in dev1]

# Label-0 examples first, then label-1, with targets in the matching order.
_X_dev = _X_dev0 + _X_dev1
_Y_dev = [0]*len(_X_dev0) + [1]*len(_X_dev1)

### Convert to tensors

In [23]:
# Turn the Python lists into int64 tensors: baseline train set, augmented
# train set, and dev set.
X_train = torch.tensor(_X_train, dtype=torch.long)
Y_train = torch.tensor(_Y_train, dtype=torch.long)

X_train_aug = torch.tensor(_X_train_aug, dtype=torch.long)
Y_train_aug = torch.tensor(_Y_train_aug, dtype=torch.long)

X_dev = torch.tensor(_X_dev, dtype=torch.long)
Y_dev = torch.tensor(_Y_dev, dtype=torch.long)


## Data Loader

In [24]:
# Mini-batch size shared by the train, augmented-train and dev loaders.
BATCH_SIZE=256

### \#1. baseline

In [25]:
# Baseline training set, reshuffled on every iteration over the loader.
train_dataset = Data.TensorDataset(X_train, Y_train)
train_loader = Data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
print(len(train_loader))

14212


### \#2. label0 aug.

In [26]:
# Augmented training set, reshuffled on every iteration over the loader.
train_aug_dataset = Data.TensorDataset(X_train_aug, Y_train_aug)
train_aug_loader = Data.DataLoader(train_aug_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
print(len(train_aug_loader))

14212


### dev

In [27]:
# Dev set, kept in its original order (no shuffling) for evaluation.
dev_dataset = Data.TensorDataset(X_dev, Y_dev)
dev_loader = Data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
print(len(dev_loader))

158


## Model

In [65]:
class Net(nn.Module):
    '''Bidirectional-LSTM classifier over an encoded sentence pair.

    Token ids are embedded, fed through a bidirectional LSTM, and the final
    hidden states of both directions are passed to a linear layer that
    produces two logits (one per class).
    '''
    def __init__(self, embedding_dim=256, hidden_dim=256, vocab_size=VOCAB_SIZE):
        '''
        Fix the model architecture and its parameters for this purpose

        Args:
          embedding_dim: Size of each token embedding.
          hidden_dim: Hidden size of the LSTM, per direction.
          vocab_size: Number of rows in the embedding table.
        '''
        super(Net, self).__init__()
        
        # padding_idx=0: id 0 is the <pad> token.
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dense = nn.Linear(hidden_dim*2, 2)

    def forward(self, x):
        '''Compute class logits for a batch of encoded pairs.

        Args:
          x: Integer tensor of token ids, shape (N, T), padded with 0. Every
            row needs at least one non-zero id, because the sequence length is
            taken to be the number of non-zero ids in the row.

        Returns:
          Tensor of shape (N, 2) with unnormalized class scores.
        '''
        # Follow the device the model lives on instead of hard-coding 'cuda',
        # so the same code also runs on CPU.
        x = x.to(self.embed.weight.device)
        seqlens = (x!=0).long().sum(1) # (N,)
        
        x = self.embed(x) 
        
        # pack_padded_sequence expects the lengths on the CPU.
        packed_input = pack_padded_sequence(x, seqlens.cpu(), batch_first=True, enforce_sorted=False)
        
        _, (last_hidden, c) = self.lstm(packed_input) # last_hidden: (num_layers * num_directions, batch, hidden_size)
        last_hidden = last_hidden.permute(1, 2, 0) # to (batch, hidden, num_directions)
        # Flattening this layout interleaves the two directions' features.
        last_hidden = last_hidden.contiguous().view(last_hidden.size()[0], -1) # to (batch, hidden*num_directions)
        
        logits = self.dense(last_hidden)
        return logits


## Train & test functions

In [66]:
def eval(model, dev_loader):
    '''Evaluate `model` on `dev_loader`, print F1 and accuracy, and return them.

    NOTE: this name shadows Python's built-in eval(); it is kept because the
    training loop calls it by this name.

    Args:
      model: Callable mapping a batch of inputs to (N, num_classes) logits.
        It is switched to eval mode and left in eval mode.
      dev_loader: Iterable of (inputs, targets) batches.

    Returns:
      (f1score, acc) as computed by sklearn's f1_score and accuracy_score.
      Previously the scores were only printed; returning them lets callers
      track the best checkpoint programmatically.
    '''
    model.eval()

    y_pred, y_true = [], []
    with torch.no_grad():  # no gradients are needed for evaluation
        for inputs, targets in dev_loader:
            logits = model(inputs)
            _, preds = logits.max(1, keepdim=False)  # predicted class = index of the largest logit
            y_pred.extend(preds.tolist())
            y_true.extend(targets.tolist())

    f1score = f1_score(y_true, y_pred)
    acc = accuracy_score(y_true, y_pred)

    print('F1_score: %0.3f, acc.: %0.3f\n' %(f1score, acc))
    return f1score, acc

In [67]:
def train(model, train_loader, optimizer, criterion, eval_interval, dev_loader):
    '''Run one pass over `train_loader`, evaluating periodically.

    Args:
      model: Module mapping a batch of inputs to logits.
      train_loader: Iterable of (inputs, targets) batches.
      optimizer: Optimizer over the model's parameters.
      criterion: Loss function called as criterion(logits, targets).
      eval_interval: Evaluate on `dev_loader` every `eval_interval` steps
        (step 0 is skipped).
      dev_loader: Iterable of (inputs, targets) batches passed to eval().
    '''
    model.train()
    for gs, (inputs, targets) in enumerate(train_loader):
        optimizer.zero_grad()
        logits = model(inputs)
        # Put the targets wherever the logits are instead of hard-coding 'cuda'.
        targets = targets.to(logits.device)
        loss = criterion(logits, targets)

        loss.backward()
        optimizer.step()

        if gs > 0 and gs % eval_interval == 0:
            print("global step =", gs)
            print("loss =%.3f" % loss.item())
            eval(model, dev_loader)
            # eval() leaves the model in eval mode; switch back only when needed
            # rather than on every step.
            model.train()

## Experiments

In [74]:
# Fresh model on the GPU, Adam optimizer and cross-entropy loss.
model = Net().cuda()
optimizer = optim.Adam(model.parameters(), lr=.001)
criterion = nn.CrossEntropyLoss()
# The data is repeated NUM_EPOCHS times, so this evaluates about once per emulated epoch.
eval_interval = len(train_loader)//NUM_EPOCHS

### \#1. baseline

In [69]:
# Train on the baseline (non-augmented) data, reporting dev F1/accuracy every eval_interval steps.
train(model, train_loader, optimizer, criterion, eval_interval, dev_loader)

global step = 1421
loss =0.418
F1_score: 0.675, acc.: 0.783

global step = 2842
loss =0.316
F1_score: 0.729, acc.: 0.802

global step = 4263
loss =0.197
F1_score: 0.740, acc.: 0.808

global step = 5684
loss =0.163
F1_score: 0.746, acc.: 0.816

global step = 7105
loss =0.088
F1_score: 0.744, acc.: 0.814

global step = 8526
loss =0.068
F1_score: 0.745, acc.: 0.811

global step = 9947
loss =0.110
F1_score: 0.746, acc.: 0.814

global step = 11368
loss =0.068
F1_score: 0.748, acc.: 0.818

global step = 12789
loss =0.016
F1_score: 0.746, acc.: 0.816

global step = 14210
loss =0.043
F1_score: 0.745, acc.: 0.813



△ The best F1 score is .748, and accuracy is .818.

### \#2. aug.

In [75]:
# Start from a freshly initialized model and optimizer. Without this, running
# the notebook top to bottom would continue training the model already fitted
# on the baseline data, and the two experiments would not be comparable.
model = Net().cuda()
optimizer = optim.Adam(model.parameters(), lr=.001)
train(model, train_aug_loader, optimizer, criterion, eval_interval, dev_loader)

global step = 1421
loss =0.298
F1_score: 0.699, acc.: 0.727

global step = 2842
loss =0.193
F1_score: 0.730, acc.: 0.766

global step = 4263
loss =0.134
F1_score: 0.735, acc.: 0.765

global step = 5684
loss =0.122
F1_score: 0.745, acc.: 0.776

global step = 7105
loss =0.110
F1_score: 0.760, acc.: 0.799

global step = 8526
loss =0.120
F1_score: 0.764, acc.: 0.801

global step = 9947
loss =0.081
F1_score: 0.766, acc.: 0.804

global step = 11368
loss =0.086
F1_score: 0.770, acc.: 0.809

global step = 12789
loss =0.053
F1_score: 0.776, acc.: 0.820

global step = 14210
loss =0.080
F1_score: 0.775, acc.: 0.821



△ The best F1 score is .776, with accuracy .820 at that step (accuracy reaches .821 at the final step).