In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from torchtext import data
from torchtext import datasets
from torch.utils.tensorboard import SummaryWriter

import matplotlib.pyplot as plt
import numpy as np

In [5]:
TEXT = data.Field(tokenize='spacy', tokenizer_language='en_core_web_md')
LABEL = data.LabelField(dtype=torch.float)

train_data, test_data = datasets.IMDB.splits(text_field=TEXT, label_field=LABEL)

In [7]:
print(vars(train_data.examples[0])['label'])
print(vars(train_data.examples[0])['text'])

pos
['It', "'s", 'a', 'colorful', 'slasher', 'movie', '.', 'That', "'s", 'about', 'it.<br', '/><br', '/>It', 'has', 'the', 'mystery', 'element', 'that', 'SCREAM', 'made', 'so', 'popular', 'in', 'slasher', 'movies', ',', 'but', 'I', 'never', 'care', 'for', 'such', 'things', '.', 'Figuring', 'out', 'who', "'s", 'the', 'bad', 'guy', 'is', 'not', 'that', 'interesting', 'considering', 'the', 'clues', 'are', 'all', 'misleading', 'anyway.<br', '/><br', '/>The', 'death', 'scenes', 'were', 'inventive', 'and', 'gorey', ',', 'bringing', 'back', 'memories', 'of', '80', "'s", 'horror', 'movies', 'like', 'Friday', 'the', '13th', '.', '<', 'br', '/><br', '/>Another', 'nice', 'thing', 'about', 'this', 'movie', 'is', 'that', 'it', "'s", 'hard', 'to', 'pinpoint', 'the', 'surviving', 'girl', ',', 'unlike', 'in', 'SCREAM', 'and', 'IKWYDLS', 'where', 'it', 'was', 'obvious', '.', '<', 'br', '/><br', '/>People', 'who', 'do', "n't", 'like', 'slasher', 'movies', 'wo', "n't", 'like', 'this', 'movie', '.', 'As',

In [10]:
writer = SummaryWriter()

for batch in range(5):
    writer.add_text("Text batch", ' '.join(vars(train_data.examples[0])['text']), batch)
    writer.add_text("Text batch", ' '.join(vars(train_data.examples[0])['label']), batch)

writer.close()

In [14]:
TEXT.build_vocab(train_data, max_size=2500, vectors=torchtext.vocab.GloVe(name='6B', dim=50))

.vector_cache/glove.6B.zip: 862MB [06:43, 2.14MB/s]                               
100%|█████████▉| 399999/400000 [00:12<00:00, 30948.21it/s]


In [15]:
LABEL.build_vocab(train_data)

In [16]:
print(TEXT.vocab.itos)
print(len(TEXT.vocab.itos))
print(type(TEXT.vocab.itos))
print(TEXT.vocab.vectors)
print(len(TEXT.vocab.vectors))

2502
<class 'list'>
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.4180,  0.2497, -0.4124,  ..., -0.1841, -0.1151, -0.7858],
        ...,
        [ 0.2337, -0.7152, -0.1912,  ...,  0.1908,  0.3518, -0.0332],
        [ 0.9054,  0.5484, -0.2942,  ..., -0.2478,  0.0399, -0.2427],
        [ 0.6436,  0.0540,  0.0944,  ...,  0.1137,  0.3243,  0.1347]])
2502


In [17]:
writer = SummaryWriter()
writer.add_embedding(TEXT.vocab.vectors, [x.encode('utf-8') for x in TEXT.vocab.itos])
writer.close()

AttributeError: module 'tensorflow._api.v2.io.gfile' has no attribute 'get_filesystem'

In [18]:
import tensorflow as tf
import tensorboard as tb

In [22]:
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile 

In [23]:
writer = SummaryWriter()
writer.add_embedding(TEXT.vocab.vectors, [x.encode('utf-8') for x in TEXT.vocab.itos])
writer.close()

In [33]:
class RNN(nn.Module):
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, text):
        embedded = self.embedding(text)
        output, hidden = self.rnn(embedded)
        return self.fc(hidden.squeeze(0))
        

In [34]:
model = RNN(input_dim=len(TEXT.vocab), embedding_dim=50, hidden_dim=256, output_dim=1)

In [36]:
train_iterator, test_iterator = data.BucketIterator.splits((train_data, test_data), batch_size=64, device='cpu')

In [37]:
for i, batch in enumerate(train_iterator):
    if i < 2:
        print(batch.text)
        print(batch.label)

tensor([[ 647,  156,   54,  ..., 2208,   54,  149],
        [  13,   57,   19,  ...,   39,    0,  163],
        [   0,   30,  998,  ...,    0,   28,  970],
        ...,
        [   2,    1,    1,  ...,    1,    1,    1],
        [   0,    1,    1,  ...,    1,    1,    1],
        [  29,    1,    1,  ...,    1,    1,    1]])
tensor([0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 1., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 1., 0., 0.,
        0., 0., 0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0.,
        0., 1., 1., 1., 1., 1., 1., 1., 1., 0.])
tensor([[ 149,    2,   16,  ...,   66,    0,   11],
        [1267, 1986,    0,  ...,   19,  464,  469],
        [ 758,    7,    0,  ...,   38,    7,  143],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])
tensor([1., 0., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 1.,

In [38]:
writer = SummaryWriter()

writer.add_graph(model, batch.text)
    
writer.close()

In [40]:
len(train_data), len(test_data)

(25000, 25000)

In [61]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print(device)

cuda:0


In [62]:
train_iterator, test_iterator = data.BucketIterator.splits((train_data, test_data), batch_size=64, device=device)

In [63]:
embedding_dim = 100
hidden_dim = 256

model = RNN(input_dim=len(TEXT.vocab), embedding_dim=embedding_dim, hidden_dim=hidden_dim, output_dim=1)

In [70]:
TEXT.build_vocab(train_data, max_size=2500, vectors=torchtext.vocab.GloVe(name='6B', dim=embedding_dim))
LABEL.build_vocab(train_data)

100%|█████████▉| 399999/400000 [00:19<00:00, 20464.51it/s]


In [71]:
model.to(device)

RNN(
  (embedding): Embedding(2502, 100)
  (rnn): RNN(100, 256)
  (fc): Linear(in_features=256, out_features=1, bias=True)
)

In [72]:
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [73]:
def binary_accuracy(y_true, predict):
    rounded_predicts = torch.round(torch.sigmoid(predict))
    correct = (rounded_predicts==y_true).float()
    return correct.sum() / len(correct)

In [74]:
pretraioned_embeddings = TEXT.vocab.vectors

In [75]:
pretraioned_embeddings.shape

torch.Size([2502, 100])

In [76]:
model.embedding.weight.data.copy_(pretraioned_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-0.1699,  0.1631,  0.6325,  ..., -0.0244, -0.5670,  0.1749],
        [-0.1435,  0.6606, -0.0788,  ..., -1.6433,  0.6658,  0.1726],
        [ 0.4098, -0.6922,  0.6605,  ..., -0.3771, -0.0812, -0.4148]],
       device='cuda:0')

In [77]:
model = model.to(device)   
criterion = criterion.to(device)  


for epoch in range(10):  # loop over the dataset multiple times
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in train_iterator:
        # zero the parameter gradients
        optimizer.zero_grad()
        
        outputs = model(batch.text).squeeze(1)
        
        loss = criterion(outputs, batch.label)
        
        acc = binary_accuracy(batch.label, outputs)
        
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    train_loss = epoch_loss / len(train_iterator)
    train_acc = epoch_acc / len(train_iterator)
    
#     writer.add_scalar("Loss", loss.item(), epoch)
    print(f"Epoch: {epoch}, Loss: {train_loss}, Acc: {train_acc}")

print('Finished Training')

Epoch: 0, Loss: 0.6960257462528355, Acc: 0.5036604860249687
Epoch: 1, Loss: 0.6986272444810404, Acc: 0.49660326086956524
Epoch: 2, Loss: 0.697589741795874, Acc: 0.4975143862349908
Epoch: 3, Loss: 0.696794686872331, Acc: 0.4978420716112532
Epoch: 4, Loss: 0.6957235560392785, Acc: 0.5025575447570333
Epoch: 5, Loss: 0.6971365951211251, Acc: 0.5014146419284898
Epoch: 6, Loss: 0.6963967687028754, Acc: 0.49781010233227857
Epoch: 7, Loss: 0.6964495980831058, Acc: 0.5063778772530958
Epoch: 8, Loss: 0.6971018225945476, Acc: 0.5024616369201095
Epoch: 9, Loss: 0.6964242773897508, Acc: 0.49792998724276455
Finished Training


In [78]:
def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    for batch in iterator:       
        outputs = model(batch.text).squeeze(1)
        
        loss = criterion(outputs, batch.label)
        
        acc = binary_accuracy(batch.label, outputs)

        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return  epoch_loss / len(iterator), epoch_acc / len(iterator)


In [79]:
test_loss, test_acc = evaluate(model, test_iterator, criterion)
test_loss, test_acc

(0.6949618602042917, 0.500815217452281)

In [80]:
writer = SummaryWriter()
writer.add_hparams({"Embedding Dim": embedding_dim, "Hidden dim": hidden_dim}, {"hparam/loss": test_loss, "hparam/accuracy": test_acc})
writer.close()