In [1]:
import torch
from torchtext.datasets import IMDB
from torchtext import data

from spacy.lang.en.stop_words import STOP_WORDS

SEED = 42

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [26]:
# from nltk.corpus import stopwords

# stoplist = stopwords.words('english')
print(stoplist)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:


print(STOP_WORDS) # <- set of Spacy's default stop words

{'something', 'four', 'give', 'all', 'already', 'amount', 'herein', 'else', 'hereby', 'whole', 'fifteen', 'ten', 'why', 'me', 'eight', 'anyhow', 'even', 'hereafter', 'first', 'everyone', 'these', 'using', 'via', 'whose', 'after', 'seem', 'someone', 'last', 'sometimes', 'too', 'us', 'because', 'next', 'almost', 'any', 'under', 'each', 'ever', 'therefore', 'alone', 'see', 'get', 'either', 'in', 'full', 'across', 'he', 'whence', 'wherein', 'being', 'him', 'less', 'whereby', 'twenty', 'towards', 'doing', 'yourselves', 'one', 'yourself', 'on', 'more', 'elsewhere', 'becomes', 'nothing', 'serious', 'put', 'always', 'six', 'may', 'done', 'themselves', 'whom', 'indeed', 'call', 'a', 'herself', 'hundred', 'noone', 'bottom', 'amongst', 'must', 'formerly', 'around', 'once', 'should', 'show', 'when', 'whereupon', 'back', 'for', 'just', 'regarding', 're', 'somehow', 'thus', 'to', 'can', 'above', 'perhaps', 'anywhere', 'down', 'myself', 'mine', 'himself', 'over', 'together', 'well', 'few', 'i', 'anyt

In [30]:
# set(stoplist) & STOP_WORDS
# set(stoplist) - STOP_WORDS

In [2]:
TEXT = data.Field(tokenize='spacy', lower=True, include_lengths=True, stop_words=STOP_WORDS)
# TEXT = data.Field(lower=True)
LABEL = data.Field(sequential=False, unk_token=None)

In [3]:
train_data, test_data = IMDB.splits(TEXT, LABEL)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 25000
Number of testing examples: 25000


In [4]:
print(vars(train_data.examples[-1]))

{'text': ['worst', 'movies', 'production', ',', 'ever.<br', '/><br', '/>1', '.', 'exciting', 'beginning', ',', 'guy', 'walking', '...', 'walking', '...', 'walking', '(', 'spoiler', ')', '.', '15', 'minutes', 'walking', '.', '?', '<', 'br', '/><br', '/>2', '.', 'mention', "'s", 'lot', 'issues', 'lighting', ',', "'s", 'like', 'shot', 'night', 'scenes', 'day', '.', '<', 'br', '/><br', '/>3', '.', 'acting', 'terrible', '.', 'looks', 'like', 'found', 'community', 'theater', '(', 'mexico', ')', '...', 'took', 'people', 'turned', 'away.<br', '/><br', '/>please', ',', 'love', 'holy', ',', "n't", 'rent', 'movie', '.', 'know', 'owns', ',', 'apologize', '.', 'director', 'subject', 'punishment', 'war', 'crimes', 'tribunal', 'foisting', 'public', '.'], 'label': 'neg'}


In [11]:
help(TEXT.build_vocab)

Help on method build_vocab in module torchtext.data.field:

build_vocab(*args, **kwargs) method of torchtext.data.field.Field instance
    Construct the Vocab object for this field from one or more datasets.
    
    Arguments:
        Positional arguments: Dataset objects or other iterable data
            sources from which to construct the Vocab object that
            represents the set of possible values for this field. If
            a Dataset object is provided, all columns corresponding
            to this field are used; individual columns can also be
            provided directly.
        Remaining keyword arguments: Passed to the constructor of Vocab.



In [5]:
TEXT.build_vocab(train_data, max_size=25000)
print(len(TEXT.vocab))
LABEL.build_vocab(train_data)
print(len(TEXT.vocab))

25002
25002


In [7]:
TEXT.vocab.stoi['<unk>'], TEXT.vocab.stoi['<pad>'], TEXT.vocab.stoi['good']

(0, 1, 16)

In [8]:
print(TEXT.vocab.freqs.most_common(20))

[(',', 275277), ('.', 236270), ('"', 63334), ("'s", 62090), ('-', 52864), ('/><br', 50935), ('movie', 43059), ('film', 39280), ('(', 33106), (')', 32848), ("n't", 32846), ('!', 21780), ('like', 20111), ("'", 17015), ('good', 14903), ('?', 14799), ('time', 12335), ('story', 11718), ('...', 9726), (':', 9388)]


In [12]:
print(TEXT.vocab.itos[:10])

['<unk>', '<pad>', ',', '.', '"', "'s", '-', '/><br', 'movie', 'film']


In [13]:
LABEL.vocab.itos

['neg', 'pos']

In [14]:
print(LABEL.vocab.stoi)

defaultdict(<function _default_unk_index at 0x7fcd77f0db70>, {'neg': 0, 'pos': 1})


In [15]:
words = []
for word, cc in TEXT.vocab.freqs.most_common():
    if cc >= 30:
        words.append(word)
len(words)

10270

In [18]:
word2idx = {'<unk>': 0, '<pad>': 1}
for i, word in enumerate(words, 2):
    word2idx[word] = i
len(word2idx)

10272

In [34]:
a = 'neg'
1 if a == 'pos' else 0

0

In [35]:
x_train, y_train = [], []
x_test, y_test = [], []

for item in train_data:
    x_train.append([word2idx.get(word, 0) for word in item.text])
    y_train.append(1 if item.label == 'pos' else 0)

for item in test_data:
    x_test.append([word2idx.get(word, 0) for word in item.text])
    y_test.append(1 if item.label == 'pos' else 0)

In [36]:
assert len(x_train) == len(y_train) and len(y_train) == 25000
assert len(x_test) == len(y_test) and len(y_test) == 25000

In [37]:
import pickle
data = {'train': {'x': x_train, 'y': y_train}, 
        'test': {'x': x_test, 'y': y_test}, 
        'word2idx': word2idx}

with open('imdb_data.pkl', 'wb') as f:
    pickle.dump(data, f)

In [45]:
with open('imdb_data.pkl', 'rb') as f:
    new_data = pickle.load(f)
    
print(new_data['train']['x'][:2])

[[52, 203, 3, 0, 9049, 126, 8512, 2598, 5171, 9813, 0, 9049, 797, 3, 0, 1196, 4002, 1794, 0, 4, 1287, 5406, 4, 3, 965, 9, 837, 419, 13], [1008, 510, 2220, 0, 512, 914, 3, 0, 3132, 2, 151, 373, 3, 503, 5011, 7, 71, 12, 122, 1795, 3, 5348, 1222, 797, 1084, 547, 6, 1274, 502, 6, 0, 7127, 0, 852, 2, 3970, 3, 321, 3971, 1399, 10, 33, 0, 11, 6, 14, 132, 12, 3, 500, 700, 2, 872, 55, 0, 1245, 3, 230, 592, 428, 10, 503, 2181, 653, 11, 611, 13, 50, 35, 4, 984, 5829, 4, 182, 132, 2, 4498, 428, 611, 871, 132, 3, 112, 2311, 39, 65, 2679, 3, 115, 112, 45, 0, 7, 51, 352]]


In [46]:
print(new_data['train']['y'][:20])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [47]:
print(new_data['train']['y'][-20:])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [1]:
import pickle
with open('../data/imdb_data.pkl', 'rb') as rf:
    data_split = pickle.load(rf)
MAX_LEN = 100
x_train, y_train = data_split['train']['x'], data_split['train']['y']
x_test, y_test = data_split['test']['x'], data_split['test']['y']
x_train = [x[:MAX_LEN] for x in x_train]
x_test = [x[:MAX_LEN] for x in x_test]
word2idx = data_split['word2idx']

In [6]:
word2idx['<pad>'], word2idx['<unk>']

(1, 0)

In [5]:
print(x_train[:3])

[[52, 203, 3, 0, 9049, 126, 8512, 2598, 5171, 9813, 0, 9049, 797, 3, 0, 1196, 4002, 1794, 0, 4, 1287, 5406, 4, 3, 965, 9, 837, 419, 13], [1008, 510, 2220, 0, 512, 914, 3, 0, 3132, 2, 151, 373, 3, 503, 5011, 7, 71, 12, 122, 1795, 3, 5348, 1222, 797, 1084, 547, 6, 1274, 502, 6, 0, 7127, 0, 852, 2, 3970, 3, 321, 3971, 1399, 10, 33, 0, 11, 6, 14, 132, 12, 3, 500, 700, 2, 872, 55, 0, 1245, 3, 230, 592, 428, 10, 503, 2181, 653, 11, 611, 13, 50, 35, 4, 984, 5829, 4, 182, 132, 2, 4498, 428, 611, 871, 132, 3, 112, 2311, 39, 65, 2679, 3, 115, 112, 45, 0, 7, 51, 352], [353, 7581, 4907, 2, 810, 674, 2, 8216, 984, 668, 972, 2, 1525, 2, 9, 0, 10, 559, 2890, 11, 7457, 0, 6, 776, 0, 2204, 7828, 4409, 3, 393, 1370, 4908, 972, 3615, 0, 272, 1015, 6, 1197, 559, 0, 3, 5, 9, 6, 1117, 42, 206, 16, 1515, 32, 340, 7030, 3268, 2121, 8, 3, 5889, 6736, 668, 0, 3269, 3042, 4108, 10, 6737, 1408, 11, 1097, 96, 622, 532, 4370, 3, 302, 517, 3268, 2040, 95, 1536, 1125, 966, 3, 96, 0, 13]]


In [10]:
from torch.utils.data import Dataset
import numpy as np
import torch

# def collate_fn(insts, PAD_token=1):
#     # if seq_pad in class then all seqs with same length
#     maxlen = max([len(x) for x in insts])
#     #maxlen = 24
#     seq = np.array([x + [PAD_token] * (maxlen - len(x)) for x in insts])
#     seq_lens = np.array([len(x) for x in insts])
#     return torch.LongTensor(seq), torch.LongTensor(seq_lens)

def paired_collate_fn(insts):
    #src_insts, tgt_insts = list(zip(*insts))
    seq_pairs = sorted(insts, key=lambda p: len(p[0]), reverse=True)
    src_insts, tgt_insts = zip(*seq_pairs)
    src_len = np.array([len(x) for x in src_insts])
    # tgt_insts = collate_fn(tgt_insts)
    return (src_insts, src_len, tgt_insts)

class IMDBdatasets(Dataset):
    def __init__(self, src, tgt):
        # self.device = device
        self.src = src
        self.tgt = tgt

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx):
        return self.src[idx], self.tgt[idx]

train_loader = torch.utils.data.DataLoader(
                        IMDBdatasets(x_train, y_train),
                        num_workers = 2,
                        batch_size = 8,
                        collate_fn = paired_collate_fn,
                        shuffle = True,
                        drop_last = True)

for batch in train_loader:
    src, src_lens, tgt = batch
    print(src)
    print(src_lens)
    print(tgt)
    break

([57, 810, 353, 4, 1278, 4, 2, 1003, 1912, 1191, 385, 674, 0, 1464, 1967, 1181, 0, 2, 111, 5, 4, 0, 4, 3, 2977, 932, 0, 10, 6, 3253, 11, 0, 838, 6, 1478, 89, 798, 0, 0, 9632, 36, 444, 685, 2, 323, 8951, 72, 4064, 2, 656, 4, 1258, 537, 5647, 14, 4072, 3, 1347, 1239, 0, 2750, 9512, 6, 107, 2542, 14, 97, 180, 18, 10, 117, 25, 2911, 511, 6556, 6, 3304, 11, 2, 448, 973, 212, 6, 2656, 222, 2, 3892, 0, 59, 3828, 0, 3, 387, 2397, 0, 98, 999, 0, 0, 10], [8, 250, 0, 0, 10, 4027, 6, 0, 0, 11, 3, 14, 411, 0, 48, 0, 3, 173, 662, 6195, 0, 2, 4, 14, 0, 26, 27, 7, 71, 148, 14, 1154, 6240, 4, 193, 3075, 4, 111, 3, 2221, 486, 991, 2, 604, 4235, 8, 68, 857, 0, 3, 604, 0, 2323, 2259, 54, 435, 482, 435, 2, 0, 7, 426, 592, 8, 133, 2207, 621, 3, 2, 2, 8, 5, 37, 16, 3, 5, 213, 2, 205, 200, 12, 1109, 3, 125, 205, 200, 601, 197, 16, 19, 2, 606, 360, 41, 3, 78, 12, 2088, 3, 162], [341, 2, 810, 545, 2349, 4146, 3131, 0, 7, 0, 848, 28, 278, 9634, 334, 2508, 3, 0, 979, 3, 979, 6217, 2313, 3, 979, 1678, 272, 44, 165

In [41]:
import numpy as np
lengths = [len(ss) for ss in (new_data['train']['x'] + new_data['test']['x'])]

np.min(lengths), np.mean(lengths), np.std(lengths), np.max(lengths)

(5, 144.62662, 110.63362783247958, 1624)

In [43]:
import pickle

with open('/data/charley/crawl-300d-2M.pkl', 'rb') as pf:
    fb_w2v = pickle.load(pf)
    
embed_dim = 300
embed_matrix = np.zeros((len(word2idx), embed_dim))
flag = 0
for word, idx in word2idx.items():
    try:
        word_vec = fb_w2v[word]
    except:
        word_vec = None
    if word_vec is not None:
        embed_matrix[idx] = word_vec
        flag += 1
print("There are {} words with pre-trained vector in vocab_size = {}.".format(flag, len(word2idx)))
np.save('imdb_EmbeddingMatrix', embed_matrix) # saved as EmbeddingMatrix.npy

There are 9789 words with pre-trained vector in vocab_size = 10272.


In [36]:
BATCH_SIZE = 8

device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

train_iters, test_iters = data.BucketIterator.splits((train_data, test_data), batch_size=BATCH_SIZE, device=device)

In [37]:
batch = next(iter(train_iters))
print(batch.text) # 527 x 8 = seq_len x B
print(batch.label)

tensor([[ 5945,    10,    12,  ...,    10,     5,   669],
        [    6,     7,    90,  ...,    20,   161,    10],
        [ 1106,    30,    26,  ...,     7,     8,    14],
        ...,
        [   87,     1,     1,  ...,     1,     1,     1],
        [   72,     1,     1,  ...,     1,     1,     1],
        [16698,     1,     1,  ...,     1,     1,     1]], device='cuda:1')
tensor([1, 1, 0, 1, 1, 0, 1, 0], device='cuda:1')


In [38]:
batch.text.size()

torch.Size([527, 8])

In [21]:
for x in train_data.examples:
    label = vars(x)['label'][0]
    if label == '<unk>':
        print(vars(x))

In [13]:
for x in train_data.examples:
    if (x.label != 'pos' and x.label != 'neg'):
        print(x.text)
        break