In [1]:
import time

import numpy as np

import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchqrnn.forget_mult
from torchqrnn import QRNN

from tensorboard_logger import configure, log_value

import re
import datetime

import random

In [2]:
from data_module.data_preprocessor import *

In [3]:
# Seed every random number generator this notebook touches (NumPy, PyTorch,
# PyTorch CUDA, and Python's built-in `random`) with one shared value so that
# repeated runs start from the same RNG state.
SEED = 10
for seed_rng in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all, random.seed):
    seed_rng(SEED)

In [4]:
# Compiled once, at definition time, instead of on every call.
# Alternatives are tried in this order:
#   1. a run of two or more capitals not followed by a lowercase letter
#      (an acronym such as "USB" in "USBPorts"),
#   2. a capitalised word immediately followed by another capital
#      (the "Camel" in "CamelCase"),
#   3. any run of word characters, apostrophes and hyphens.
TOKENIZER_RE = re.compile(r"[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+", re.UNICODE)


def tokenizer(text): # create a tokenizer function
    """Split ``text`` into a list of lower-cased tokens.

    The pattern is case-sensitive: its first two alternatives rely on capital
    letters to split acronyms and CamelCase words. The text must therefore be
    matched in its original case; lower-casing it first (as this function used
    to do) makes those two alternatives unreachable. Tokens are lower-cased
    only after they have been extracted.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list of lower-cased token strings, in order of appearance; an empty
        list when nothing matches.
    """
    return [token.lower() for token in TOKENIZER_RE.findall(text)]

In [5]:
# Field describing the question text: tokenised with the custom `tokenizer`,
# lower-cased, and given a fixed length of 60 (the batches printed further
# down have 60 rows, which presumably corresponds to this setting).
text_field = data.Field(
    tokenize=tokenizer,
    lower=True,
    fix_length=60,
)

# Field describing the label; declared non-sequential, i.e. presumably a
# single value per example rather than a token sequence -- TODO confirm.
label_field = data.Field(sequential=False)

In [6]:
import os

# Location of the labelled FAQ question file, expressed relative to the
# directory the notebook is run from.
DATASET_FOLDER = os.path.join("..", "dataset")
DATASET_FILE = "list_of_questions_train_labeled_new_2.txt"
DATASET_PATH = os.path.join(DATASET_FOLDER, "faqs", DATASET_FILE)

In [7]:
# Load the FAQ file and split it into a training part and a dev part.
# The last argument is passed positionally; the counts printed by this cell
# (8759 of 87599 examples held out) suggest it is the held-out fraction --
# TODO confirm against FAQ.splits.
SPLIT_RATIO = 0.1
train_data, dev_data = FAQ.splits(text_field, label_field, DATASET_PATH, SPLIT_RATIO)

Initial length  87599
Lenght after  10869 10869
train: 78840 test: 8759


In [8]:
# Build the vocabulary of each field from both the training and the dev split
# (text field first, then label field).
for field in (text_field, label_field):
    field.build_vocab(train_data, dev_data)

In [9]:
# Sanity check: show the token list stored on the first training example.
train_data.examples[0].text

['on',
 'what',
 'laptops',
 'are',
 'the',
 'usb',
 'ports',
 'marked',
 'with',
 'a',
 'usb',
 'symbol',
 'with',
 'an',
 'added',
 'lightening',
 'bolt',
 'icon']

In [10]:
# Look up the entry at index 1000 of the text vocabulary's `itos` list
# (recorded here before the pretrained vectors are loaded).
text_field.vocab.itos[1000]

'opened'

In [10]:
# Attach the pretrained vectors identified by 'glove.6B.300d' to the text
# vocabulary. NOTE(review): presumably 300-dimensional GloVe embeddings that
# may need to be downloaded or cached locally on first use -- confirm.
text_field.vocab.load_vectors('glove.6B.300d')

In [11]:
# Same lookup as before load_vectors: the recorded output is unchanged, so
# loading the vectors did not alter the entry at this index.
text_field.vocab.itos[1000]

'opened'

In [12]:
# Batch iterators for both splits. Training uses mini-batches of 32 examples;
# the dev batch size is the size of the dev set itself, so the whole dev split
# fits into a single batch.
TRAIN_BATCH_SIZE = 32
dev_batch_size = len(dev_data)

train_iter, dev_iter = data.Iterator.splits(
    (train_data, dev_data),
    batch_sizes=(TRAIN_BATCH_SIZE, dev_batch_size),
    repeat=False,
    device=None,
)

In [13]:
# Length of the training iterator; presumably the number of batches per epoch
# (78840 training examples / 32 per batch, rounded up, gives the 2464 shown).
len(train_iter)

2464

In [14]:
# NOTE(review): `i` is not used in this cell; kept only in case a later cell
# (not visible here) reads it.
i = 0

# Print the text and label tensors of the first training batch, then stop.
# The loop variable is deliberately NOT called `data`: that name is the
# namespace earlier cells use for `data.Field` and `data.Iterator`, and
# rebinding it to a batch makes those cells fail when they are re-run.
for batch in train_iter:
    print(batch.text)
    print(batch.label)
    break

Variable containing:
    16     16      3  ...       3      7      3
     9      9     71  ...      46      2      9
     2   2987      4  ...     986    308  37645
        ...            ⋱           ...         
     1      1      1  ...       1      1      1
     1      1      1  ...       1      1      1
     1      1      1  ...       1      1      1
[torch.cuda.LongTensor of size 60x32 (GPU 0)]

Variable containing:
 153
  71
 178
 364
 274
 122
  76
 337
  22
  60
 139
 308
 368
  68
 255
 191
 397
 332
 169
 306
 125
 202
  50
 267
 234
  80
 410
  42
 234
  53
 179
 313
[torch.cuda.LongTensor of size 32 (GPU 0)]



In [16]:
# Look up the entry at index 100 of the text vocabulary's `itos` list.
text_field.vocab.itos[100]

'music'

In [27]:
# Display the vocabulary object attached to the text field (shows only its
# default repr).
text_field.vocab

<torchtext.vocab.Vocab at 0x7ff2a8e7fdd8>

In [None]:
# NOTE(review): `last_element` is not defined in any cell visible in this part
# of the notebook, and this cell has no execution count. Unless it is defined
# elsewhere, this raises NameError on a fresh "Restart & Run All" -- confirm
# or remove.
last_element.text

In [None]:
# Length of the dev iterator. Its batch size was set to len(dev_data), so this
# is presumably 1 (cell not executed here -- verify).
len(dev_iter)