In [None]:
import pandas as pd
import spacy
import torch

from collections import defaultdict
from itertools import chain

from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext import data

from quoraquestionpairs.data import get_dataset
from quoraquestionpairs.neuralnets import RNNGRUSequential

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

In [None]:
# Load spaCy's English pipeline; in the cells shown here it is only referenced for `nlp.vocab`.
# NOTE(review): the 'en' shortcut name was removed in spaCy v3, where the packaged model
# name (e.g. 'en_core_web_sm') is required — confirm the pinned spaCy version.
nlp = spacy.load('en')

In [None]:
# Single torchtext field used for the question text: spaCy tokenisation,
# lower-casing, and padding added at the start of each sequence (pad_first=True).
question = data.Field(tokenize='spacy',
                      lower=True,
                      pad_first=True)

In [None]:
# Build the dataset from the training CSV, passing in the `question` field.
# NOTE(review): despite the `_df` suffix this is presumably a torchtext dataset rather than
# a DataFrame (it is split with `stratified=` / `strata_field=`) — confirm in quoraquestionpairs.data.
data_df = get_dataset('data/train.csv', question)

In [None]:
# 95% / 5% train/validation split, stratified on the `is_duplicate` label.
# NOTE(review): no random_state is passed, so the split presumably differs between runs — confirm / seed it.
train, val = data_df.split(0.95, stratified=True, strata_field='is_duplicate')

In [None]:
# Build the vocabulary from the training split only; min_freq=50 sets the minimum
# number of occurrences a token needs to get its own vocabulary entry.
question.build_vocab(train,
                     min_freq=50)

### Neural net structure
Input: two token-index sequences of lengths ?1 and ?2 (question 1 and question 2)
[?1 x 1], [?2 x 1]

Embedding layer:
-> [n_vocab, embedding_dim] -> [?1 x embedding_dim], [?2 x embedding_dim]

Concatenation:
-> [?1 + ?2 x embedding_dim]

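GRU:
-> [embedding_dim, hidden_dim] -> [1 x hidden_dim] (final hidden state)

Linear + sigmoid:
-> [hidden_dim, 1] -> [1 x 1]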

In [None]:
# Disabled scratch check of the tensor shapes at each stage
# (embedding -> concatenation -> GRU -> linear + sigmoid), using toy sizes.
# x1 = torch.tensor([1, 2, 0])
# x2 = torch.tensor([2, 0])

# emb = nn.Embedding(3, 10)

# x1_emb = emb(x1)
# x2_emb = emb(x2)

# assert x1_emb.size() == torch.Size([3, 10]) and x2_emb.size() == torch.Size([2, 10])

# concatenated = torch.cat([x1_emb, x2_emb])

# assert concatenated.size() == torch.Size([5, 10])

# gru = nn.GRU(input_size=10, hidden_size=8)
# _, x = gru(concatenated.view(-1, 1, 10))

# assert x.size() == torch.Size([1, 1, 8])

# linear = nn.Linear(8, 1)

# out = torch.sigmoid(linear(x))

In [None]:
# Model, loss and optimiser for the duplicate-question classifier.
vocab_size = len(question.vocab)
# NOTE(review): 300 and 128 are presumably the embedding and GRU hidden sizes —
# confirm against RNNGRUSequential's signature.
rnn = RNNGRUSequential(vocab_size, 300, 128)
rnn = rnn.to(device)

# Binary cross-entropy computed on the model's output.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.01)

In [None]:
# Train for a fixed number of epochs, reporting the mean training loss every
# LOG_EVERY batches and the validation accuracy at the end of each epoch.
N_EPOCHS = 10
TRAIN_BATCH_SIZE = 256
VAL_BATCH_SIZE = 64
LOG_EVERY = 50  # number of batches averaged into each running-loss report

for epoch in range(N_EPOCHS):
    train_iter = data.Iterator(train,
                               batch_size=TRAIN_BATCH_SIZE,
                               repeat=False,
                               shuffle=True)

    # repeat=False is passed explicitly, as for the training iterator: on
    # torchtext versions where `repeat` defaults to the `train` flag the
    # validation loop would otherwise never terminate.
    val_iter = data.Iterator(val,
                             batch_size=VAL_BATCH_SIZE,
                             repeat=False)

    # Put the model in training mode (matters if it contains dropout or
    # normalisation layers; a no-op otherwise).
    rnn.train()
    running_loss = 0.0
    for i, sample in enumerate(train_iter):
        x1 = sample.question1.to(device)
        x2 = sample.question2.to(device)
        target = sample.is_duplicate.to(device).float()

        optimizer.zero_grad()

        output = rnn(x1, x2)
        # The target is reshaped to (1, batch, 1) to line up with the model output.
        loss = criterion(output, target.view(1, -1, 1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Report only once LOG_EVERY batches have been accumulated, so the
        # divisor always matches the number of summed losses (the previous
        # `i % 50 == 0` test also fired on the very first batch).
        if (i + 1) % LOG_EVERY == 0:
            print('Batch: {} Loss: {}'.format(i + 1, running_loss / LOG_EVERY))
            running_loss = 0.0

    # Evaluation mode for validation; gradients are not needed.
    rnn.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sample in val_iter:
            x1 = sample.question1.to(device)
            x2 = sample.question2.to(device)
            # Compare labels and predictions as floats (exactly 0.0 / 1.0) so the
            # equality test does not depend on byte-vs-bool dtype rules.
            target = sample.is_duplicate.to(device).float().view(-1)
            output = rnn(x1, x2).view(-1)
            pred = (output >= 0.5).float()
            correct += (target == pred).sum().item()
            # Count the real batch size: the last batch may hold fewer than
            # VAL_BATCH_SIZE examples.
            total += target.numel()

    # Guard against an empty validation split instead of dividing by zero.
    accuracy = correct / total * 100 if total else float('nan')
    print('Val accuracy: {}'.format(accuracy))

In [None]:
class QuoraDataset(Dataset):
    """Map-style dataset over a Quora question-pairs CSV file.

    Each item is a dict holding the raw question strings under ``'q1'`` and
    ``'q2'``; when ``train`` is true the label is added under ``'target'``.

    Args:
        csv_file: Path of a CSV file with ``question1`` and ``question2``
            columns (plus ``is_duplicate`` when ``train`` is true).
        train: Whether each sample should carry the ``is_duplicate`` label.
        transform: Optional callable applied to the sample dict; whatever it
            returns is what ``__getitem__`` returns.
    """

    def __init__(self, csv_file, train=False, transform=None):
        # keep_default_na=False: empty or "NA"-like question text stays a
        # plain string instead of being parsed as NaN.
        self.questions = pd.read_csv(csv_file, keep_default_na=False)
        self.train = train
        self.transform = transform

    def __len__(self):
        """Return the number of question pairs (rows of the CSV)."""
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the sample for row label ``idx`` (after ``transform``, if any)."""
        sample = {'q1': self.questions.loc[idx, 'question1'],
                  'q2': self.questions.loc[idx, 'question2']}

        if self.train:
            sample['target'] = self.questions.loc[idx, 'is_duplicate']

        if self.transform is not None:
            sample = self.transform(sample)

        return sample

    def get_tokens(self):
        """Placeholder: this method was declared without a body.

        The empty body made the whole class definition a syntax error. Its
        intended behaviour is not specified anywhere in the notebook, so it
        raises until it is implemented.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError('QuoraDataset.get_tokens is not implemented')

In [None]:
# Load the raw CSV as a map-style dataset with labels included (train=True).
# NOTE(review): this reads 'train.csv' from the working directory, whereas the
# torchtext pipeline in this notebook reads 'data/train.csv' — confirm which path is right.
dataset = QuoraDataset('train.csv', train=True)

In [None]:
# Shuffled batches of 4 samples, loaded with 4 worker processes.
# NOTE(review): not consumed by any of the cells shown here.
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

In [None]:
# `Tokenizer` was used without ever being imported, which raises NameError on a
# fresh kernel. Imported here so the cell is self-contained; hoist it into the
# notebook's import cell when tidying up.
from spacy.tokenizer import Tokenizer

# Standalone tokenizer built from the loaded pipeline's vocabulary only.
# NOTE(review): constructed without prefix/suffix/infix rules, so it presumably
# splits on whitespace only (punctuation stays attached) — confirm this is intended.
tokenizer = Tokenizer(nlp.vocab)

In [None]:
# Corpus-wide frequency of every lower-cased token that is not a stop word,
# counted over both questions of every pair.
word_dict = defaultdict(int)

for i in range(len(dataset)):
    sample = dataset[i]
    first_tokens = tokenizer(sample['q1'])
    second_tokens = tokenizer(sample['q2'])
    for token in chain(first_tokens, second_tokens):
        if not token.is_stop:
            word_dict[token.text.lower()] += 1

In [None]:
# (word, count) pairs ordered from most to least frequent; ties keep the
# insertion order of `word_dict` because the sort is stable.
sorted_words = sorted(word_dict.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
class WordTokenizer:
    """Frequency-ranked word-to-integer encoder.

    ``tokenize`` counts the lower-cased, non-stop-word tokens of a corpus and
    assigns the ``num_words`` most frequent ones the indices ``1, 2, ...`` in
    order of decreasing frequency. Calling the instance on a text then encodes
    it as a list of those indices.

    Args:
        num_words: Maximum vocabulary size; ``None`` keeps every word.
        tokenizer: Optional callable turning a string into tokens that expose
            ``.text`` and ``.is_stop``. When omitted, the notebook-level
            ``tokenizer`` object is looked up at call time.

    Attributes:
        words_dict: Mapping ``word -> index`` (empty until ``tokenize`` runs).
    """

    # Index returned for words outside the fitted vocabulary; real words start at 1.
    # NOTE(review): the original __call__ was left unfinished, so the handling of
    # unknown words is a choice made here — confirm it suits the downstream model.
    UNKNOWN_INDEX = 0

    def __init__(self, num_words, tokenizer=None):
        self.num_words = num_words
        self._tokenizer = tokenizer
        self.words_dict = {}

    def _words(self, text):
        """Yield the lower-cased tokens of ``text`` that are not stop words."""
        # Falls back to the module-level `tokenizer` when none was injected.
        split = self._tokenizer if self._tokenizer is not None else tokenizer
        for token in split(text):
            if not token.is_stop:
                yield token.text.lower()

    def tokenize(self, *args):
        """Fit the vocabulary on one or more iterables of text strings.

        Args:
            *args: Iterables of strings; they are processed as one corpus.
        """
        words_counts = defaultdict(int)

        for text in chain(*args):
            for word in self._words(text):
                words_counts[word] += 1

        # Rank by this call's own counts (previously the unrelated global
        # `sorted_words` was used) and keep only the top `num_words` entries.
        ranked = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)
        self.words_dict = {word: i
                           for i, (word, _) in enumerate(ranked[:self.num_words], 1)}

    def __call__(self, seq):
        """Encode the text ``seq`` as a list of vocabulary indices.

        Stop words are dropped; words outside the fitted vocabulary map to
        ``UNKNOWN_INDEX``.
        """
        return [self.words_dict.get(word, self.UNKNOWN_INDEX)
                for word in self._words(seq)]

In [None]:
#{word: i for i, (word, counts) in enumerate(sorted_words, 1)}

In [None]:
def test_fun(*args):
    """Print every element of each given iterable, one per line, in argument order."""
    for iterable in args:
        for item in iterable:
            print(item)

In [None]:
# Peek at the first `question1` entry only (prints nothing if there are no rows).
first_questions = dataset.questions['question1'].values[:1]
for i in first_questions:
    print(i)

In [None]:
# Instantiate the word encoder with num_words=100.
word_tokenizer = WordTokenizer(100)

In [None]:
#test_fun(dataset.questions['question1'].values, dataset.questions['question2'].values)

In [None]:
# Run the encoder's `tokenize` over every question1 and question2 string of the dataset.
word_tokenizer.tokenize(dataset.questions['question1'].values, dataset.questions['question2'].values)