In [1]:
import torch
from torch import nn, optim
from torch.utils.data import (Dataset, DataLoader, TensorDataset)

import tqdm
import import_ipynb
from RNN_Basic import *

importing Jupyter notebook from RNN_Basic.ipynb


In [2]:
class IMDBDataset(Dataset):
    """Map-style dataset over a directory of labeled review text files.

    ``dir_path`` must contain a vocabulary file ``imdb.vocab`` (one word per
    line) and ``train/`` and ``test/`` sub-directories, each holding
    ``pos/*.txt`` and ``neg/*.txt`` files.

    Indexing yields ``(data, label, n_tokens)`` where ``label`` is 1 for a
    file found under ``pos/`` and 0 for a file found under ``neg/``, and
    ``data`` / ``n_tokens`` are whatever ``list2tensor`` returns for the
    file's word IDs.

    Args:
        dir_path: Root directory of the corpus.
        train: If true, read the ``train`` split; otherwise the ``test`` split.
        max_len: Forwarded to ``list2tensor`` for every sample.
        padding: Forwarded to ``list2tensor`` for every sample.
    """

    def __init__(self, dir_path, train=True,
                 max_len=100, padding=True):
        self.max_len = max_len
        self.padding = padding

        path = pathlib.Path(dir_path)
        vocab_path = path.joinpath("imdb.vocab")

        # Read the vocabulary file and split it into one word per line.
        # A context manager closes the handle; the explicit encoding keeps the
        # result independent of the platform locale.
        # NOTE(review): assumes the corpus is UTF-8 encoded - confirm.
        with vocab_path.open(encoding="utf-8") as vocab_file:
            self.vocab_array = vocab_file.read().strip().splitlines()

        # Map each word to its ID. IDs start at 1, so ID 0 is never assigned
        # here (presumably reserved by text2ids / list2tensor - confirm).
        self.vocab_dict = {
            w: i for (i, w) in enumerate(self.vocab_array, start=1)
        }

        target_path = path.joinpath("train" if train else "test")

        # Sorting makes the sample order deterministic across runs.
        pos_files = sorted(glob.glob(
            str(target_path.joinpath("pos/*.txt"))
        ))
        neg_files = sorted(glob.glob(
            str(target_path.joinpath("neg/*.txt"))
        ))

        # List of (label, file_path) tuples: negatives are labeled 0 and
        # positives 1. (Previously both groups were labeled 0.)
        self.labeled_files = \
            [(0, f) for f in neg_files] + \
            [(1, f) for f in pos_files]

    @property
    def vocab_size(self):
        """Number of entries read from the vocabulary file.

        IDs in ``vocab_dict`` run from 1 to ``vocab_size`` inclusive.
        """
        return len(self.vocab_array)

    def __len__(self):
        """Number of labeled files in the selected split."""
        return len(self.labeled_files)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(data, label, n_tokens)``."""
        label, f = self.labeled_files[idx]

        # Read the file and lower-case its text; the handle is closed
        # promptly because this runs once per sample.
        with open(f, encoding="utf-8") as text_file:
            data = text_file.read().lower()

        # Convert the text into a list of word IDs.
        data = text2ids(data, self.vocab_dict)

        # Convert the ID list into a tensor, applying max_len / padding.
        data, n_tokens = list2tensor(data, self.max_len, self.padding)
        return data, label, n_tokens

In [3]:
# Build both splits from the same corpus directory; only the `train` flag differs.
train_data = IMDBDataset(dir_path="../05/aclImdb/", train=True)
test_data = IMDBDataset(dir_path="../05/aclImdb/", train=False)

In [4]:
# Training batches are drawn in shuffled order; evaluation keeps dataset order.
# Both loaders use a batch size of 32 and 4 worker processes.
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=4)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False, num_workers=4)