In [3]:
from torchtext.datasets import IMDB

# Load only the training split of the IMDB dataset.
train_iter = IMDB(split='train')



In [4]:
# Peek at the first (label, text) pair only: `break` stops after one item.
for label, line in train_iter:
    print(label, line)
    break

neg I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between,

## 1. PyTorch Tutorial: EmbeddingBag with Linear
#### Source: https://pytorch.org/tutorials/beginner/text_sentiment_ngrams_tutorial.html

### Tokenizer and Vocab

In [5]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

# Tokenizer built from torchtext's 'basic_english' preset; shared by the
# vocabulary construction and the text pipeline in later cells.
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    """Lazily yield the token list for every text in ``data_iter``.

    Each item of ``data_iter`` is unpacked as a 2-tuple; the first element is
    ignored and the second is passed to the notebook-level ``tokenizer``.
    """
    yield from (tokenizer(raw_text) for _unused, raw_text in data_iter)

# Build the vocabulary from the tokenized training texts, registering
# "<unk>" as a special token.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
# Tokens missing from the vocabulary resolve to the index of "<unk>".
vocab.set_default_index(vocab["<unk>"])



In [4]:
# Tokenize a sample sentence, then look up the ids of the same tokens.
# In the recorded output the made-up word 'fdfaf' maps to 0, the index of '<unk>'.
print(tokenizer("here is an example the fdfaf"))
print(vocab(['here', 'is', 'an', 'example', 'the', 'fdfaf']))
# First ten index-to-token entries, shown as the cell output.
vocab.get_itos()[:10]

['here', 'is', 'an', 'example', 'the', 'fdfaf']
[131, 9, 40, 464, 1, 0]


['<unk>', 'the', '.', ',', 'and', 'a', 'of', 'to', "'", 'is']

In [5]:
def text_pipeline(line):
    """Tokenize ``line`` and return the list of vocabulary ids of its tokens."""
    return vocab(tokenizer(line))


def label_pipeline(label):
    """Map the label ``"neg"`` to 0 and any other label to 1."""
    if label == "neg":
        return 0
    return 1

# Show the encoded form of the first training sample only.
for label, line in train_iter:
    print(text_pipeline(line))
    print(label_pipeline(label))
    break

[12, 1567, 12, 245, 35467, 42, 63, 397, 1134, 91, 6, 36, 1, 7125, 14, 3362, 10, 59, 10, 16, 93, 628, 11, 6920, 2, 12, 86, 552, 14, 37, 93, 10, 16, 20192, 39, 1224, 2, 15, 2, 9262, 50, 10, 130, 779, 7, 2479, 13, 681, 3, 1574, 117, 5, 341, 6, 113, 1159, 3051, 12, 71, 74, 7, 73, 13, 18, 536, 2, 1, 120, 9, 5958, 193, 5, 190, 3861, 473, 1423, 765, 4313, 41, 488, 7, 833, 286, 60, 57, 49, 126, 2, 11, 825, 60, 488, 7, 1131, 46, 11858, 7, 256, 55, 440, 6, 668, 27, 53, 1, 862, 29736, 208, 49, 780, 1000, 1303, 146, 17, 1, 2674, 336, 4, 1509, 1303, 11, 1, 2358, 1591, 2, 11, 202, 2181, 7270, 4, 1918, 19585, 6, 21477, 49, 72, 4655, 27, 2380, 3, 60, 51, 401, 19, 46, 473, 1691, 3, 8134, 3, 4, 998, 346, 2, 53, 1079, 77, 49, 12, 245, 35467, 9, 14, 1613, 160, 586, 3, 13, 16, 1159, 8205, 2, 71, 3, 1, 401, 4, 999, 144, 30, 174, 4, 241, 202, 3, 62, 100, 10, 8, 15, 28, 329, 44, 55, 6654, 99, 4460, 2, 142, 63, 23464, 347, 171, 10, 1573, 3, 11, 634, 401, 4, 999, 30, 5, 662, 10197, 11, 3861, 436, 2, 62, 14515, 

### Dataloader

In [6]:
import torch
from torch.utils.data import DataLoader
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def collate_batch(batch):
    """Collate ``(label, text)`` pairs into tensors for an ``EmbeddingBag`` model.

    Args:
        batch: sequence of ``(label, text)`` pairs.

    Returns:
        A 3-tuple, each element moved to the notebook-level ``device``:
        float labels reshaped to one column, the token ids of all texts
        concatenated into one 1-D int64 tensor, and the start offset of each
        text inside that concatenated tensor.
    """
    # Encode every sample once, keeping label and token tensor side by side.
    encoded = [
        (label_pipeline(raw_label),
         torch.tensor(text_pipeline(raw_text), dtype=torch.int64))
        for raw_label, raw_text in batch
    ]
    labels = torch.tensor([lbl for lbl, _ in encoded], dtype=torch.float).view(-1, 1)
    token_tensors = [tokens for _, tokens in encoded]
    # Sample i starts where samples 0..i-1 end, so prepend 0, drop the last
    # length, and take the running sum.
    lengths_shifted = ([0] + [tokens.size(0) for tokens in token_tensors])[:-1]
    offsets = torch.tensor(lengths_shifted).cumsum(dim=0)
    flat_text = torch.cat(token_tensors)
    return labels.to(device), flat_text.to(device), offsets.to(device)

# Demo loader: batches of 8, no shuffling, collated by collate_batch.
dataloader = DataLoader(train_iter, batch_size=8, shuffle=False, collate_fn=collate_batch)

### Model

In [7]:
from torch import nn

class TextClassificationModel(nn.Module):
    """Binary text classifier: an ``EmbeddingBag`` followed by one ``Linear`` unit.

    Args:
        vocab_size: number of rows in the embedding table.
        embed_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embed_dim):
        super().__init__()
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=True)
        self.fc = nn.Linear(embed_dim, 1)
        self.init_weights()

    def init_weights(self):
        """Draw embedding and linear weights from U(-0.5, 0.5); zero the bias."""
        bound = 0.5
        nn.init.uniform_(self.embedding.weight, -bound, bound)
        nn.init.uniform_(self.fc.weight, -bound, bound)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, offsets):
        """Return one raw score per bag delimited by ``offsets`` within ``text``."""
        bag_vectors = self.embedding(text, offsets)
        scores = self.fc(bag_vectors)
        return scores
vocab_size = len(vocab)
emsize = 64  # embedding dimension passed to the model
model = TextClassificationModel(vocab_size, emsize).to(device)
vocab_size  # shown as the cell output

100683

### Training

In [8]:
import time
import torch.nn.functional as F

def train(dataloader):
    """Run one training pass over ``dataloader`` and print windowed accuracy.

    Relies on the notebook-level globals ``model``, ``optimizer``,
    ``criterion`` and ``epoch`` (``epoch`` is only used in the log line).

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches, as
            produced by ``collate_batch``; must support ``len()``.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 500

    for idx, (label, text, offsets) in enumerate(dataloader):
        optimizer.zero_grad()
        # The model outputs raw logits; they become probabilities only after
        # the sigmoid below.
        logits = model(text, offsets)
        loss = criterion(logits, label)
        loss.backward()
        # Cap the overall gradient norm at 0.1 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()
        predicted_label = (torch.sigmoid(logits) >= 0.5).type(torch.int64)
        total_acc += (predicted_label == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print('| epoch {:3d} | {:5d}/{:5d} batches '
                  '| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
                                              total_acc/total_count))
            # Reset so each log line covers only the batches since the last one.
            total_acc, total_count = 0, 0

def evaluate(dataloader):
    """Return the accuracy of the notebook-level ``model`` on ``dataloader``.

    The model is switched to eval mode and gradients are disabled. No loss is
    computed, because only the accuracy is returned.

    Args:
        dataloader: iterable yielding ``(label, text, offsets)`` batches, as
            produced by ``collate_batch``.

    Returns:
        Fraction of samples whose thresholded prediction equals the label.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for label, text, offsets in dataloader:
            # Raw logits; a sigmoid value >= 0.5 counts as the positive class.
            logits = model(text, offsets)
            predicted_label = (torch.sigmoid(logits) >= 0.5).type(torch.int64)
            total_acc += (predicted_label == label).sum().item()
            total_count += label.size(0)
    return total_acc/total_count

In [9]:
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

# Hyperparameters
EPOCHS = 10 # number of passes over the training split
LR = 5  # initial learning rate for SGD
BATCH_SIZE = 64 # batch size for training

# The loss is fed the model's raw output (no sigmoid applied beforehand).
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
# StepLR with step size 1 and gamma=0.1; stepped manually in the epoch loop.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
# Reference validation accuracy for the epoch loop; None until the first epoch ends.
total_accu = None

# Load both splits; this rebinds train_iter, which earlier cells also use.
train_iter, test_iter = IMDB()
# Convert to map-style datasets so len() and random_split can be used.
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)
# Keep 95% of the training data for training and hold out the rest for validation.
# NOTE(review): no random seed is set, so the split differs between runs.
num_train = int(len(train_dataset) * 0.95)
split_train_, split_valid_ = \
    random_split(train_dataset, [num_train, len(train_dataset) - num_train])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
# NOTE(review): the validation and test loaders are only used for accuracy,
# which does not depend on sample order, so shuffle=True looks unnecessary.
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE,
                             shuffle=True, collate_fn=collate_batch)

# `epoch` is also read as a global inside train() for its log line.
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    accu_val = evaluate(valid_dataloader)
    # Step the scheduler only when validation accuracy falls below the stored
    # reference; otherwise store the latest accuracy as the new reference.
    if total_accu is not None and total_accu > accu_val:
        scheduler.step()
    else:
        total_accu = accu_val
    print('-' * 59)
    print('| end of epoch {:3d} | time: {:5.2f}s | '
          'valid accuracy {:8.3f} '.format(epoch,
                                           time.time() - epoch_start_time,
                                           accu_val))
    print('-' * 59)



-----------------------------------------------------------
| end of epoch   1 | time:  7.24s | valid accuracy    0.736 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   2 | time:  7.18s | valid accuracy    0.677 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   3 | time:  7.27s | valid accuracy    0.827 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   4 | time:  7.28s | valid accuracy    0.840 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   5 | time:  7.16s | valid accuracy    0.838 
-----------------------------------------------------------
-----------------------------------------------------------
| end of epoch   6 | time:  7.18s |

In [10]:
# Final accuracy on the test split, using the model state left by training.
print('Checking the results of test dataset.')
accu_test = evaluate(test_dataloader)
print('test accuracy {:8.3f}'.format(accu_test))

Checking the results of test dataset.
test accuracy    0.827


In [13]:
ex_text_str = "The movie started cool, but went downhill close to the end."
text = torch.tensor(text_pipeline(ex_text_str))
# Move the model to the CPU so it matches the CPU tensors created in this cell.
# NOTE(review): collate_batch still moves batches to `device`, so re-running the
# training/evaluation cells after this one may hit a device mismatch on CUDA.
model = model.to("cpu")
# A single offset of 0 makes the whole token sequence one bag; the sigmoid
# converts the resulting logit into a probability.
print("Positive probability", torch.nn.Sigmoid()(model(text, torch.tensor([0]))[0]).item())

Positive probability 0.04854077473282814


## 2. Bag of words

In [9]:
import re

def preprocess_text(text):
    """Lowercase ``text``, pad punctuation with spaces and squash other characters.

    Each of ``. , ! ?`` is surrounded by spaces, then every run of characters
    that is neither a letter nor one of those marks is replaced by a single
    space. Leading and trailing spaces are not stripped.
    """
    cleaned = text.lower()
    substitutions = (
        (r"([.,!?])", r" \1 "),
        (r"[^a-zA-Z.,!?]+", r" "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned
# Quick check on a sample sentence; the result is shown as the cell output.
preprocess_text("This is a, by the way, coolish review!!?")

'this is a , by the way , coolish review ! ! ? '

In [34]:
# NOTE(review): this rebinds train_dataloader (built earlier with
# collate_fn=collate_batch) to a loader over the raw train_iter that uses the
# default collate function.
train_dataloader = DataLoader(train_iter, batch_size=64, shuffle=True)
print(type(train_iter))
train_dataloader

<class 'torch.utils.data.datapipes.iter.callable.MapperIterDataPipe'>


<torch.utils.data.dataloader.DataLoader at 0x1342f35e0>

In [None]:
class Vocabulary():
    """Two-way mapping between tokens and integer ids.

    Args:
        token_to_idx: optional existing ``{token: id}`` mapping to start from.
            It is copied, so the caller's dict is never modified.
        add_unk: when true, ensure ``unk_token`` is part of the vocabulary.
        unk_token: string used as the unknown-token placeholder.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        # Copy so that adding tokens never mutates the caller's mapping.
        self.token_to_idx = dict(token_to_idx) if token_to_idx else {}
        # Reverse lookup, kept in sync with token_to_idx by add_token.
        self.idx_to_token = {idx: token for token, idx in self.token_to_idx.items()}
        self.add_unk = add_unk
        self.unk_token = unk_token
        # Next free id: one past the largest id in use (0 for an empty
        # vocabulary), so new tokens never collide with supplied ones.
        self.index = max(self.idx_to_token, default=-1) + 1
        if add_unk:
            # With an empty starting vocabulary the unk token gets id 0; an
            # unk token already present in the supplied mapping keeps its id.
            self.add_token(unk_token)

    def add_token(self, token):
        """Register ``token`` if it is new and return its id.

        Adding a token that is already known leaves the vocabulary unchanged.
        """
        if token not in self.token_to_idx:
            self.token_to_idx[token] = self.index
            self.idx_to_token[self.index] = token
            self.index += 1
        return self.token_to_idx[token]

    def lookup_id(self, token):
        """Return the id of ``token``, or ``None`` when the token is unknown."""
        return self.token_to_idx.get(token)

    def lookup_token(self, idx):
        """Return the token stored under ``idx``, or ``None`` when unused."""
        return self.idx_to_token.get(idx)

    def __len__(self):
        """Return the number of tokens in the vocabulary."""
        return len(self.token_to_idx)

class Vectorizer():
    def __init__(self, vocab):
        self.vocab = vocab
    
    def 
    