In [17]:
import pandas as pd
import spacy
import torch

from collections import defaultdict
from itertools import chain

from sklearn.model_selection import train_test_split
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader
from torchtext import data

from quoraquestionpairs.data import get_simple_dataset

In [2]:
nlp = spacy.load('en')

In [10]:
question = data.Field(tokenize='spacy',
                      lower=True,
                      pad_first=True)

In [19]:
data_df = data.TabularDataset(path='train.csv',
                              format='csv',
                              fields=[('id', None),
                                      ('qid1', None),
                                      ('qid2', None),
                                      ('question1', question),
                                      ('question2', question),
                                      ('is_duplicate', data.Field(sequential=False,
                                                                  use_vocab=False))], skip_header=True)

In [20]:
question.build_vocab(data_df,
                     #max_size=20000,
                     min_freq=50)

In [21]:
train_iter = data.Iterator(data_df,
                           batch_size=1,
                           repeat=True)

### Neural net structure
Input: two sequences
[?1 x 1], [?2 x 1]

Embedding layer:
-> [n_vocab, embedding_dim] -> [?1 x embedding_dim], [?2 x embedding_dim]

Concatenation:
-> [?1 + ?2 x embedding_dim]

GRU:
-> [embedding_dim

In [58]:
x1 = torch.tensor([1, 2, 0])
x2 = torch.tensor([2, 0])

emb = nn.Embedding(3, 10)

x1_emb = emb(x1)
x2_emb = emb(x2)

assert x1_emb.size() == torch.Size([3, 10]) and x2_emb.size() == torch.Size([2, 10])

concatenated = torch.cat([x1_emb, x2_emb])

assert concatenated.size() == torch.Size([5, 10])

gru = nn.GRU(input_size=10, hidden_size=8)
_, x = gru(concatenated.view(-1, 1, 10))

assert x.size() == torch.Size([1, 1, 8])

linear = nn.Linear(8, 1)

out = torch.sigmoid(linear(x))

In [63]:
from quoraquestionpairs.neuralnets import RNNGRU

In [64]:
len(question.vocab)

8607

In [104]:
rnn = RNNGRU(len(question.vocab), 180, 64)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.1)

In [107]:
for epoch in range(10):
    
    running_loss = 0.0
    for i, sample in enumerate(train_iter):
        x1 = sample.question1
        x2 = sample.question2
        target = sample.is_duplicate
        
        optimizer.zero_grad()
        
        output = rnn(x1, x2)
        loss = criterion(output, target.type(torch.FloatTensor).view(1, 1, 1))
        optimizer.step()
        
        running_loss += loss.item()
        
        if i % 1000 == 0:
            print('Sample {}'.format(i + 1))
    print('[{}] loss: {}'.format(epoch, running_loss))

Sample 1
Sample 1001
Sample 2001
Sample 3001
Sample 4001
Sample 5001


KeyboardInterrupt: 

In [6]:
class QuoraDataset(Dataset):
    
    def __init__(self, csv_file, train=False, transform=None):
        self.questions = pd.read_csv(csv_file, keep_default_na=False)
        self.train = train
        self.transform = transform
    
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        
        sample = {'q1': self.questions.loc[idx, 'question1'],
                  'q2': self.questions.loc[idx, 'question2']}
        
        if self.train:
            sample['target'] = self.questions.loc[idx, 'is_duplicate']
            
        if self.transform:
            sample = self.transform(sample)
        
        return sample
    
    def get_tokens(self):
        

In [7]:
dataset = QuoraDataset('train.csv', train=True)

In [8]:
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=4)

In [9]:
tokenizer = Tokenizer(nlp.vocab)

In [11]:
word_dict = defaultdict(int)

for i in range(len(dataset)):
    sample = dataset[i]
    questions = [tokenizer(sample['q1']), tokenizer(sample['q2'])]
    for token in chain(*questions):
        word = token.text.lower()
        if token.is_stop:
            continue
        else:
            word_dict[word] += 1

In [18]:
sorted_words = sorted([(word, counts) for word, counts in word_dict.items()], key=lambda x: x[1], reverse=True)

In [1]:
class WordTokenizer:
    
    def __init__(self, num_words):
        self.num_words = num_words
    
    def tokenize(self, *args):
        words_counts = defaultdict(int)
        
        for text in chain(*args):
            for token in tokenizer(text):
                word = token.text.lower()
                if token.is_stop:
                    continue
                else:
                    words_counts[word] += 1
        
        words_counts = sorted([(word, counts) for word, counts in words_counts.items()], key=lambda x: x[1], reverse=True)
        self.words_dict = {word: i for i, (word, counts) in enumerate(sorted_words, 1)}
    
    def __call__(self, seq):
        int_seq = [self.words_dict[word] for word in ]

SyntaxError: invalid syntax (<ipython-input-1-f588e6cb53ef>, line 21)

In [22]:
#{word: i for i, (word, counts) in enumerate(sorted_words, 1)}

In [38]:
def test_fun(*args):
    for i in chain(*args):
        print(i)

In [27]:
for i in dataset.questions['question1'].values:
    print(i)
    break

What is the step by step guide to invest in share market in india?


In [50]:
word_tokenizer = WordTokenizer(100)

In [40]:
#test_fun(dataset.questions['question1'].values, dataset.questions['question2'].values)

In [51]:
word_tokenizer.tokenize(dataset.questions['question1'].values, dataset.questions['question2'].values)