In [17]:
import time

import torch
import torch.nn as nn
import torchtext

import torch.nn.functional as functional

In [18]:
start = time.time()
TEXT = torchtext.data.Field(lower=True, fix_length=200, batch_first=False)
LABEL = torchtext.data.Field(sequential=False)

In [19]:
train_data, test_data = torchtext.datasets.IMDB.splits(TEXT, LABEL)

In [20]:
print(vars(train_data.examples[0]))

{'text': ['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', '"teachers".', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', "high's", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"teachers".', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', "teachers'", 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', '.........', 'at', '..........', 'high.', 'a', 'classic', 'line:', 'inspector:', "i'm", 'here', 'to', 'sack', 'one', 'of', '

In [21]:
import string

for exam in train_data.examples:
    text = [x.lower() for x in vars(exam)["text"]]
    text = [x.replace("<br", "") for x in text]
    text = ["".join(c for c in x if c not in string.punctuation) for x in text]
    text = [s for s in text if s]
    vars(exam)["text"] = text

In [22]:
import random

train_data, valid_data = train_data.split(random_state=random.seed(0), split_ratio=0.8)

In [23]:
print(f"Number of training examples: {len(train_data)}")
print(f"Number of validation examples: {len(valid_data)}")
print(f"Number of testing examples: {len(test_data)}")

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


In [24]:
TEXT.build_vocab(train_data, max_size=10000, min_freq=10, vectors=None)
LABEL.build_vocab(train_data)

print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 10002
Unique tokens in LABEL vocabulary: 3


In [25]:
print(LABEL.vocab.stoi)

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x000001DE7C697550>>, {'<unk>': 0, 'pos': 1, 'neg': 2})


In [26]:
BATCH_SIZE = 64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

embedding_dim = 100
hidden_size = 300

train_iterator, valid_iterator, test_iterator = torchtext.data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device
)

In [27]:
class RNNCell_Encoder(nn.Module):
    def __init__(self, input_dim, hidden_size):
        super().__init__()
        self.rnn = nn.RNNCell(input_dim, hidden_size)

    def forward(self, inputs):
        bz = inputs.shape[1]
        ht = torch.zeros(bz, hidden_size, device=device)
        for w in inputs:
            ht = self.rnn(w, ht)
        return ht


class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.em = nn.Embedding(len(TEXT.vocab.stoi), embedding_dim)
        self.rnn = RNNCell_Encoder(embedding_dim, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 256)
        self.fc2 = nn.Linear(256, 3)

    def forward(self, x):
        x = self.em(x)
        x = self.rnn(x)
        x = functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [28]:
model = Net()
model.to(device)

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [29]:
def training(epoch, model, trainloader, validloader):
    correct = 0
    total = 0
    running_loss = 0

    model.train()
    for b in trainloader:
        x, y = b.text, b.label
        x, y = x.to(device), y.to(device)
        y_pred = model(x)
        loss = loss_fn(y_pred, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        with torch.no_grad():
            predicted = torch.argmax(y_pred, dim=1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()

    epoch_loss = running_loss / len(trainloader)
    epoch_acc = correct / total

    valid_correct = 0
    valid_total = 0
    valid_running_loss = 0

    model.eval()
    with torch.no_grad():
        for b in validloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            predicted = torch.argmax(y_pred, dim=1)
            valid_correct += (predicted == y).sum().item()
            valid_total += y.size(0)
            valid_running_loss += loss.item()

    epoch_valid_loss = valid_running_loss / len(validloader)
    epoch_valid_acc = valid_correct / valid_total

    print(
        f"epoch: {epoch}",
        f"train loss: {epoch_loss:.4f}",
        f"train acc: {epoch_acc:.4f}",
        f"valid loss: {epoch_valid_loss:.4f}",
        f"valid acc: {epoch_valid_acc:.4f}",
    )
    return epoch_loss, epoch_acc, epoch_valid_loss, epoch_valid_acc

In [30]:
epochs = 5
train_loss = []
train_acc = []
valid_loss = []
valid_acc = []

for epoch in range(epochs):
    loss, acc, v_loss, v_acc = training(epoch, model, train_iterator, valid_iterator)
    train_loss.append(loss)
    train_acc.append(acc)
    valid_loss.append(v_loss)
    valid_acc.append(v_acc)

end = time.time()
print(f"Time: {end - start}")

epoch: 0 train loss: 0.7302 train acc: 0.4922 valid loss: 0.6973 valid acc: 0.5046
epoch: 1 train loss: 0.6963 train acc: 0.5019 valid loss: 0.6943 valid acc: 0.4944
epoch: 2 train loss: 0.6944 train acc: 0.5121 valid loss: 0.7066 valid acc: 0.4938
epoch: 3 train loss: 0.6940 train acc: 0.5141 valid loss: 0.7025 valid acc: 0.4918
epoch: 4 train loss: 0.6920 train acc: 0.5184 valid loss: 0.7000 valid acc: 0.5116
Time: 359.28865933418274


In [31]:
def evaluate(epoch, model, testloader):
    correct = 0
    total = 0
    running_loss = 0

    model.eval()
    with torch.no_grad():
        for b in testloader:
            x, y = b.text, b.label
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)
            predicted = torch.argmax(y_pred, dim=1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            running_loss += loss.item()

    epoch_loss = running_loss / len(testloader)
    epoch_acc = correct / total

    print(
        f"epoch: {epoch}",
        f"test loss: {epoch_loss:.4f}",
        f"test acc: {epoch_acc:.4f}",
    )
    return epoch_loss, epoch_acc

In [32]:
epoch = 5
test_loss = []
test_acc = []

for epoch in range(epochs):
    loss, acc = evaluate(epoch, model, test_iterator)
    test_loss.append(loss)
    test_acc.append(acc)
    
end = time.time()
print(f"Time: {end - start}")

epoch: 0 test loss: 0.7010 test acc: 0.5023
epoch: 1 test loss: 0.7010 test acc: 0.5023
epoch: 2 test loss: 0.7010 test acc: 0.5023
epoch: 3 test loss: 0.7010 test acc: 0.5023
epoch: 4 test loss: 0.7010 test acc: 0.5023
Time: 464.08869791030884
