In [1]:
import torch
from torch import nn
from torch.autograd import Variable as V


# Lookup table holding 10000 rows of 20-dimensional vectors; index 0 is
# declared as the padding index.
emb = nn.Embedding(num_embeddings=10000, embedding_dim=20, padding_idx=0)
# Five ids to look up. Id 2 appears twice, so the 2nd and 4th rows of the
# result are the same vector.
inp = V(torch.LongTensor([1, 2, 5, 2, 10]))
out = emb(inp)
# Bare expression: the notebook displays the resulting 5x20 tensor.
out

Variable containing:

Columns 0 to 9 
-1.9254  0.2848  0.2735  0.1440  1.5715  1.2846 -1.3029 -0.3067  0.0190 -2.1332
-0.6516 -1.9692 -0.8449  0.1270  1.1009  1.3408  1.8494 -0.8904 -1.3814  0.1601
 0.9403  0.1200 -2.7859  0.6495 -2.1209  0.7630 -0.4578  0.0622 -0.6080 -1.3222
-0.6516 -1.9692 -0.8449  0.1270  1.1009  1.3408  1.8494 -0.8904 -1.3814  0.1601
 0.0884  0.7957  1.3944  0.3304  1.2315 -0.5400  1.2227  0.0030  1.0206  2.1087

Columns 10 to 19 
-1.2497 -1.8067 -0.4380 -0.4508 -0.2988 -1.0211  1.0691 -0.2588 -0.5100 -1.4161
-1.8485  2.1402  0.5949 -0.3385 -0.2733 -0.2851  0.0870 -1.7878 -0.9565 -0.3677
-0.3339  0.6801 -0.6570  0.8762 -0.3547 -0.5849 -0.6250  0.2425 -0.6209 -1.7259
-1.8485  2.1402  0.5949 -0.3385 -0.2733 -0.2851  0.0870 -1.7878 -0.9565 -0.3677
 1.8358 -0.0632  0.1345  0.3971 -1.6751 -0.2942 -1.6081 -0.4600 -0.6743  1.7126
[torch.FloatTensor of size 5x20]

In [2]:
# http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
import glob
import pathlib
import re


# Characters that are deleted outright: , . ( ) [ ] * : ; and anything that
# looks like a tag (e.g. "<br />"). Raw string so the backslashes reach the
# regex engine without triggering invalid-escape warnings.
remove_marks_regex = re.compile(r'[,\.\(\)\[\]\*:;]|<.*?>')
# '?' and '!' are kept, but captured so they can be split off as own tokens.
shift_marks_regex = re.compile(r'([?!])')

def text2ids(text, vacab_dict):
    """Convert a piece of text into a list of integer token ids.

    The text is cleaned with ``remove_marks_regex``, every '?' / '!' is
    surrounded by spaces so that it becomes a separate token, and the result
    is split on whitespace.

    Args:
        text: Input string. No case folding is done here.
        vacab_dict: Mapping from token string to integer id.

    Returns:
        A list with one id per token; tokens missing from ``vacab_dict``
        are mapped to 0.
    """
    text = remove_marks_regex.sub('', text)
    # r' \1 ' re-inserts the captured mark (group 1) padded with spaces.
    # The previous template used '\l' (letter l), which is not a valid
    # group reference and raises re.error on Python 3.7+.
    text = shift_marks_regex.sub(r' \1 ', text)
    return [vacab_dict.get(token, 0) for token in text.split()]

def list2tensor(token_idxes, max_len=100, padding=True):
    """Turn a list of token ids into a LongTensor of bounded length.

    Args:
        token_idxes: List of integer token ids.
        max_len: Maximum number of ids kept; longer inputs are cut off at
            the end.
        padding: When true, the kept ids are right-padded with 0 up to
            ``max_len`` entries.

    Returns:
        ``(tensor, n_tokens)`` where ``tensor`` is a ``torch.LongTensor`` of
        the (possibly truncated and padded) ids and ``n_tokens`` is the
        number of ids kept before padding.
    """
    too_long = len(token_idxes) > max_len
    kept = token_idxes[:max_len] if too_long else token_idxes
    n_tokens = len(kept)
    if padding:
        # Fill the remaining slots with the padding id 0.
        kept = kept + [0] * (max_len - n_tokens)
    return torch.LongTensor(kept), n_tokens

In [5]:
from torch.utils.data import Dataset


class IMDBDataset(Dataset):
    """Dataset over a directory laid out like the aclImdb archive.

    Expected layout under ``dir_path``::

        imdb.vocab            one vocabulary word per line
        train/pos/*.txt       label 1
        train/neg/*.txt       label 0
        test/pos/*.txt        label 1
        test/neg/*.txt        label 0

    Words are numbered from 1 in vocabulary-file order; id 0 is left free
    and is what ``text2ids`` / ``list2tensor`` use for unknown tokens and
    padding.

    Args:
        dir_path: Root directory of the dataset.
        train: Use the ``train`` sub-directory when true, ``test`` otherwise.
        max_len: Maximum number of token ids per sample (passed to
            ``list2tensor``).
        padding: Whether samples are zero-padded to ``max_len`` (passed to
            ``list2tensor``).
    """

    def __init__(self, dir_path, train=True, max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding
        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath('imdb.vocab')
        # Context manager closes the handle deterministically; explicit
        # UTF-8 keeps decoding independent of the platform locale.
        with vocab_path.open(encoding='utf-8') as vocab_file:
            self.vocab_array = vocab_file.read().strip().splitlines()
        # Ids start at 1 so that 0 stays reserved for unknown/padding.
        self.vocab_dict = {
            word: idx for idx, word in enumerate(self.vocab_array, start=1)
        }
        target_path = path.joinpath('train' if train else 'test')
        pos_files = sorted(glob.glob(str(target_path.joinpath('pos/*.txt'))))
        neg_files = sorted(glob.glob(str(target_path.joinpath('neg/*.txt'))))
        # (label, file path) pairs: all negatives (0) first, then positives (1).
        self.labeled_files = (
            [(0, f) for f in neg_files] + [(1, f) for f in pos_files]
        )

    @property
    def vocab_size(self):
        """Number of words in the vocabulary file.

        The reserved id 0 is not counted, so the largest id produced is
        equal to this value.
        """
        return len(self.vocab_array)

    def __len__(self):
        """Return the number of labeled files in the selected split."""
        return len(self.labeled_files)

    def __getitem__(self, idx):
        """Load, lower-case and encode the sample at position ``idx``.

        Returns:
            ``(data, label, n)``: the id tensor and kept-token count from
            ``list2tensor``, and the integer label (0 negative, 1 positive).
        """
        label, f = self.labeled_files[idx]
        # Close the file right after reading instead of leaking the handle.
        with open(f, encoding='utf-8') as review_file:
            text = review_file.read().lower()
        token_ids = text2ids(text, self.vocab_dict)
        data, n = list2tensor(token_ids, self.max_len, self.padding)
        return data, label, n

In [None]:
from torch.utils.data import DataLoader


# One dataset per split. The test split gets its own name; previously it was
# assigned to `train_data` as well, which discarded the training set.
train_data = IMDBDataset('./aclImdb/')
test_data = IMDBDataset('./aclImdb/', train=False)
