In [29]:
import sys
# Add the parent directory to the module search path. Presumably this is what
# lets the `text_classification` imports below resolve when the notebook is run
# from its own folder -- NOTE(review): confirm against the repository layout.
sys.path.append('..')

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors

from text_classification.datautils import ALaCarteCSVDataset
from text_classification.utils import PadCollate, pad_tensor

np.set_printoptions(threshold=np.inf)
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Check ALaCarteCSVDataset

In [2]:
# Toy 3-dimensional embedding table holding four known words. Every word's
# vector repeats one value, which keeps the expected rows below easy to read.
toy_vectors = KeyedVectors(3)
_v = np.array([
    [1, 1, 1],  # he
    [2, 2, 2],  # was
    [3, 3, 3],  # a
    [4, 4, 4]   # big
])
toy_vectors.add(['he', 'was', 'a', 'big'], _v)
# 'something', 'of' and 'spoon' were not added to toy_vectors, so they are the
# out-of-vocabulary tokens of this sentence.
toy_text = 'he was a something of a big spoon he'

# Window half-size handed to the dataset as `window_half_size`.
_w1 = 1
# Hand-written expected per-token vectors for toy_text with window half-size 1.
# The <UNK> rows are consistent with averaging the known neighbours that lie
# within one token of the unknown word -- NOTE(review): confirm this matches
# the induction rule implemented by ALaCarteCSVDataset.
_expected_w1 = [
    [1, 1, 1],  # he
    [2, 2, 2],  # was
    [3, 3, 3],  # a
    [3, 3, 3],  # <UNK> 'something': its only known neighbour is 'a'
    [3, 3, 3],  # <UNK> 'of': its only known neighbour is 'a'
    [3, 3, 3],  # a
    [4, 4, 4],  # big
    [2.5, 2.5, 2.5],   # <UNK> 'spoon': mean of 'big' (4) and 'he' (1)
    [1, 1, 1],  # he
]

In [3]:
# Write the one-row toy corpus to test.csv in the working directory. to_csv is
# called with its defaults, so the frame's index is written as well.
toy_rows = [{'text': toy_text, 'label': 1}]
pd.DataFrame(toy_rows).to_csv('test.csv')

In [4]:
# Dataset over the toy CSV, backed by the toy embedding table. 'text' and
# 'label' are presumably the CSV column names to read -- they match the
# columns written to test.csv.
dataset = ALaCarteCSVDataset(
    'test.csv', 'text', 'label', toy_vectors,
    induce_vectors=True,
    induction_matrix='identity',
    window_half_size=_w1,
)

In [5]:
# Show the dataset's `unk_vec` attribute (by its name, the vector used for
# unknown tokens). NOTE(review): the recorded values look randomly
# initialised -- confirm, and seed the RNG if this output must be reproducible.
dataset.unk_vec

array([0.4125049 , 0.46644373, 0.47615791])

In [6]:
# First (and only) example of the toy dataset: X holds one vector per token,
# y is the label.
X, y = dataset[0]
# Compare against the hand-written fixture so a regression fails loudly
# instead of depending on someone eyeballing the printed array. The small
# tolerance allows for float32 storage of the vectors.
np.testing.assert_allclose(X.numpy(), _expected_w1, rtol=1e-6)
X.numpy()

array([[1. , 1. , 1. ],
       [2. , 2. , 2. ],
       [3. , 3. , 3. ],
       [3. , 3. , 3. ],
       [3. , 3. , 3. ],
       [3. , 3. , 3. ],
       [4. , 4. , 4. ],
       [2.5, 2.5, 2.5],
       [1. , 1. , 1. ]], dtype=float32)

In [7]:
from nltk.tokenize import word_tokenize
from torch.utils.data import DataLoader

## Check pad_tensor

In [30]:
import torch

In [31]:
# Pad a 2x13 tensor of ones along dim 1 with pad=15; the recorded output has
# 15 columns, the last two filled with zeros.
ones_2x13 = torch.ones(2, 13, dtype=torch.long)
pad_tensor(ones_2x13, pad=15, dim=1)

tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])

## Check dataloader

In [11]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this file exists. This cell also rebinds `dataset`, replacing the toy dataset
# built earlier; the toy embedding table is still used for the vectors.
clickbait_valid_csv = '/data/classification/clickbait/valid.csv'
dataset = ALaCarteCSVDataset(
    clickbait_valid_csv, 'text', 'label', toy_vectors,
    induce_vectors=True, induction_matrix='identity',
    window_half_size=3, max_text_len=2048,
)

In [12]:
# Raw text of the first row of the table the dataset exposes as `dataset.data`.
dataset.data.iloc[0].text

'Bob Bryan, Business Insider 9.02.2017, 16:25 182 facebook linkedin twitter email print "We\'re going to be announcing something over the next, I would say, two or three weeks that will be phenomenal in terms of tax," Trump said at a meeting with airline executives on Thursday. The president also said he is "lowering the overall tax burden on American businesses, big league." Trump has long expressed a desire to redo the US\'s corporate and personal tax systems. The president promised in a meeting with manufacturing executives two weeks ago to cut businesses taxes to between 15% and 20%. "We\'re going to be cutting taxes massively for both the middle class and for companies, and that\'s massively," Trump said at the January 23 meeting. Among the proposals Trump has floated are slashing taxes for families and imposing a border tax of possibly 20% on imports, particularly those from Mexico. It\'s unclear which elements of those proposals would be included in the plan. A promise to cut ta

In [13]:
# Tokenise the first document with NLTK and show tokens 43..51, the span
# whose vectors are inspected next.
first_text_tokens = word_tokenize(dataset.data.iloc[0].text)
first_text_tokens[43:52]

["''", 'Trump', 'said', 'at', 'a', 'meeting', 'with', 'airline', 'executives']

In [14]:
# Vectors for the same 43:52 span that the word_tokenize cell slices.
# Presumably row i belongs to token i of that listing -- NOTE(review): this
# holds only if the dataset tokenises the text the same way.
first_vectors = dataset[0][0]
first_vectors[43:52]

tensor([[0.2674, 0.0872, 0.5475],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [3.0000, 3.0000, 3.0000],
        [0.2674, 0.0872, 0.5475]])

In [15]:
# Shape of the first example's vector tensor. The recorded output is
# (266, 3); the trailing 3 matches the toy embedding size, and 266 is
# presumably the token count of the first document -- confirm.
dataset[0][0].shape

torch.Size([266, 3])

In [16]:
# Batches of three examples; PadCollate(dim=0) is the project's collate
# function, presumably padding each example along dim 0 so the batch can be
# stacked -- confirm in text_classification.utils.
padding_collate = PadCollate(dim=0)
dataloader = DataLoader(dataset, batch_size=3, collate_fn=padding_collate)

In [17]:
# Pull only the first batch from the loader. Note that this rebinds `y`,
# which earlier held the label of the toy example.
batch_iterator = iter(dataloader)
x, y = next(batch_iterator)

In [18]:
# Shape of the collated batch. The recorded output is (3, 266, 3): the leading
# 3 matches the batch size given to the DataLoader, and the rest matches the
# shape of the first example shown above.
x.shape

torch.Size([3, 266, 3])