The IMDB dataset comes from https://ai.stanford.edu/~amaas/data/sentiment/

In [1]:
import os
import re
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset

**Download Dataset**

In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
!tar -zxvf aclImdb_v1.tar.gz

**Tokenlization**  
Fileter special characters and return a list with each word to be its element

In [2]:
def tokenlize(content):
  content = re.sub('<.*?>', ' ',content)
  filters = ['\.', ':', '\t', '\n', '\x97', '\x96', '#', '$', '%', '&']
  content = re.sub('|'.join(filters), ' ', content)
  tokens = [i.strip().lower() for i in content.split()]
  return tokens

In [3]:
class ImdbDataset(Dataset):
  def __init__(self, train_path, test_path, train=True):
    self.train_data_path = train_path
    self.test_data_path = test_path
    data_path  = self.train_data_path if train else self.test_data_path

    # add all file names into a list
    temp_data_path = [os.path.join(data_path, 'pos'), os.path.join(data_path, 'neg')]
    self.total_file_path = []  # path to all dataset
    for path in temp_data_path:
      file_name_list = os.listdir(path)
      file_path_list = [os.path.join(path, file_name) for file_name in file_name_list if file_name.endswith('.txt')]
      self.total_file_path.extend(file_path_list)



  def __getitem__(self, index):
    # get label
    file_path = self.total_file_path[index]
    label_str = file_path.split("/")[-2]
    label = 0 if label_str == 'neg' else 1
    # get content
    content = open(file_path).read()
    tokens = tokenlize(content)
    return tokens, label
  
  def __len__(self):
    return len(self.total_file_path)

In [4]:
def collate_fn(batch):
  '''
  :param batch: (retVal of getitem [tokens,label], retVal of getitem ...)
  :return:
  '''
  content, label = list(zip(*batch))
  return content, label

In [5]:
imdb_dataset = ImdbDataset(train_path='/content/aclImdb/train', test_path='/content/aclImdb/test')
data_loader = DataLoader(imdb_dataset, batch_size=1, shuffle=True)

In [6]:
for idx, (input, target) in enumerate(data_loader):
  print(idx)
  print(input)
  print(target)
  break

0
[('young',), ('mr',), ('lincoln',), ('marks',), ('the',), ('first',), ('film',), ('of',), ('the',), ('director/star',), ('collaboration',), ('of',), ('john',), ('ford',), ('and',), ('henry',), ('fonda',), ('i',), ('recall',), ('years',), ('ago',), ('fonda',), ('telling',), ('that',), ('as',), ('a',), ('young',), ('actor',), ('he',), ('was',), ('understandably',), ('nervous',), ('about',), ('playing',), ('abraham',), ('lincoln',), ('and',), ('scared',), ('he',), ("wouldn't",), ('live',), ('up',), ('to',), ('the',), ('challenge',), ('john',), ('ford',), ('before',), ('the',), ('shooting',), ('starts',), ('put',), ('him',), ('at',), ('ease',), ('by',), ('saying',), ('he',), ("wasn't",), ('going',), ('to',), ('be',), ('playing',), ('the',), ('great',), ('emancipator,',), ('but',), ('just',), ('a',), ('jack-leg',), ('prairie',), ('lawyer',), ('that',), ('being',), ('settled',), ('fonda',), ('headed',), ('a',), ('cast',), ('that',), ('john',), ('ford',), ('directed',), ('into',), ('a',), (

**Word to Sequence**

In [7]:
class Word2Sequence():
  UNK_TAG = 'UNK'
  PAD_TAG = 'PAD'

  UNK = 0
  PAD = 1

  def __init__(self):
    self.dict = {
        self.UNK_TAG : self.UNK,
        self.PAD_TAG : self.PAD
    }
    self.count = {}
  
  def fit(self, sentence):
    '''save sentence into dict
    :param sentence: [word1, word2, word3 ...]
    '''
    for word in sentence:
      self.count[word] = self.count.get(word, 0) + 1
  
  def build_vocab(self, min=None, max=None, max_features=None):
    '''
    Build dictionary
    :param min:
    :param max:
    :param max_features:
    :return:
    '''
    # delete words in count where frequency is less than min
    if min is not None:
      self.count = {word : value for word, value in self.count.items() if value > min}

    # delete words in count where frequency is greater than max
    if max is not None:
      self.count = {word : value for word, value in self.count.items() if value < max}
    
    # limit number of vocobulary in count
    if max_features is not None:
      temp = sorted(self.count.items(), key=lambda x : x[-1], reverse=True)[:max_features]
      self.count = dict(temp)
    
    for word in self.count:
      self.dict[word] = len(self.dict)
    
    self.inversed_dict = dict(zip(self.dict.values(), self.dict.keys()))
  
  def transform(self, sentence, max_len=None):
    '''
    Sentence 2 Sequence
    :param sentence: [word1, word2, ...]
    :param max_len: int, if add dummy / cut the sentence
    :return:
    '''
    if max_len is not None:
      if max_len > len(sentence):
        sentence += [self.PAD_TAG]*(max_len-len(sentence))
      elif max_len < len(sentence):
        sentence = sentence[:max_len]
    
    return [self.dict.get(word, self.UNK) for word in sentence]
  
  def inverse_transform(self, indices):
    '''
    Sequence 2 Sentence
    :param indices: [1, 2, 3, 4, ...]
    :return:
    '''
    return [self.inversed_dict.get(idx) for idx in indices]

In [8]:
ws = Word2Sequence()
str = 'I like machine learning'
tok = tokenlize(str)
ws.fit(tok)

str = 'Today\'s weather is good'
tok = tokenlize(str)
ws.fit(tok)
ws.build_vocab()
print(ws.dict)

{'UNK': 0, 'PAD': 1, 'i': 2, 'like': 3, 'machine': 4, 'learning': 5, "today's": 6, 'weather': 7, 'is': 8, 'good': 9}


In [9]:
str = 'I like today\'s beautiful weather'
tok = tokenlize(str)
print(tok)
ret = ws.transform(tok, max_len=10)
print(ret)

['i', 'like', "today's", 'beautiful', 'weather']
[2, 3, 6, 0, 7, 1, 1, 1, 1, 1]


In [10]:
ret = ws.inverse_transform(ret)
print(ret)

['i', 'like', "today's", 'UNK', 'weather', 'PAD', 'PAD', 'PAD', 'PAD', 'PAD']
