In [56]:
import pandas as pd
import numpy as np
import torch
from torchtext.data import TabularDataset, Field, RawField, BucketIterator

In [99]:
tokenize = lambda x: x.split()
TEXT = Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=False)
LABEL = RawField(preprocessing=lambda x: np.array(list(x)))

In [100]:
train = pd.read_csv("./data/train_real.csv")

In [101]:
train.sample(2)

Unnamed: 0,ID,text,label
701,847,show me thrillers,0000000000000001000000000000000000000000000000
1762,2179,who is the producer of the terminator,0000000000000000000000100000000000000000000000


In [102]:
datafields = [("ID", None), # we won't be needing the id, so we pass in None as the field
              ("text", TEXT),
              ("label", LABEL)]

In [103]:
trn, vld = TabularDataset.splits(
        path="data", 
        train='train_real.csv', validation="val.csv",
        format='csv',
        skip_header=True,
        fields=datafields)

In [104]:
TEXT.build_vocab(trn, vld)

In [105]:
train_iter, val_iter = BucketIterator.splits(
        (trn, vld), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(5, 5),
        device=torch.device("cuda"), # specify the GPU number here
        sort_key=lambda x: len(x.text), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [106]:
batch = next(train_iter.__iter__()); batch


[torchtext.data.batch.Batch of size 5]
	[.text]:[torch.cuda.LongTensor of size 11x5 (GPU 0)]
	[.label]:[array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0',
       '0', '0', '0', '0', '0', '0', '0'], dtype='<U1'), array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0'], dtype='<U1'), array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '1', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0'], dtype='<U1'), array(['0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
    