In [None]:
def simple_rnn_cell(i, h):
    """Toy scalar recurrent cell: an affine mix of input and state, clipped at zero.

    Args:
        i: current input value.
        h: previous hidden state.

    Returns:
        The next hidden state, ``max(3*i - 4*h + 1, 0)``.
    """
    pre_activation = 3 * i - 4 * h + 1
    return max(pre_activation, 0)       # ReLU-style clipping: negative values become 0

# Unroll the toy cell over a short sequence, carrying the hidden state from step to step.
in_sequence = [3, 5, 7, 5]
out_sequence = []
h = 0                                   # hidden state before the first step

for i in in_sequence:
    h = simple_rnn_cell(i, h)           # advance the hidden state by one step
    out_sequence += [3 * h - 1]         # emit a linear read-out of the new state

out_sequence

[29, -1, 65, -1]

In [None]:
import numpy as np

def rnn_cell(i, h, w_i, w_h, b):
    """Compute one step of a vanilla RNN with a tanh non-linearity.

    Args:
        i: input vector, shape (input_size,).
        h: previous hidden state, shape (hidden_size,).
        w_i: input projection matrix, shape (hidden_size, input_size).
        w_h: recurrent projection matrix, shape (hidden_size, hidden_size).
        b: bias vector, shape (hidden_size,).

    Returns:
        The next hidden state ``tanh(w_i . i + w_h . h + b)``, shape (hidden_size,).
    """
    # Both matrix-vector products land in hidden space, so they can be summed with the bias.
    pre_activation = np.dot(w_i, i) + np.dot(w_h, h) + b
    return np.tanh(pre_activation)

# Randomly initialised parameters of a single RNN cell.
w_i = np.random.randn(8, 5)             # (hidden_size, input_size)
w_h = np.random.randn(8, 8)             # (hidden_size, hidden_size)
b = np.random.randn(8)                  # (hidden_size,)

# basic linear algebra: matrix multiplication
# (a, b) x (b, c) = (a, c)

in_sequence = np.random.randn(10, 5)    # (sequence_length, input_size)
h = np.zeros(8)                         # (hidden_size,) initial hidden state

out_sequence = np.zeros((10, 8))        # (sequence_length, hidden_size)
for idx, in_vector in enumerate(in_sequence):
    h = rnn_cell(in_vector, h, w_i, w_h, b)     # carry the hidden state to the next step
    out_sequence[idx] = h                       # the hidden state is also this step's output

# A bare `out_sequence.shape` here would be evaluated and thrown away (only the
# cell's last expression is displayed), so the shape is checked explicitly instead.
assert out_sequence.shape == (10, 8)    # (sequence_length, hidden_size)
out_sequence

array([[ 0.99913245, -0.96031546,  0.8909773 , -0.74409246, -0.50816938,
        -0.12867373,  0.17824004,  0.49758119],
       [-0.95736192,  0.80066157, -0.91627799, -0.99933537, -0.99855754,
        -0.72345596, -0.78787275, -0.6225107 ],
       [ 0.99999988, -0.99999291,  0.09598813,  0.62045583, -0.88840848,
        -0.99833561, -0.99927904, -0.01512688],
       [ 0.86433929, -0.99987058,  0.92951771, -0.98475727,  0.94720577,
         0.94013805, -0.96845876, -0.4048926 ],
       [ 0.69076172,  0.99910112, -0.99998237, -0.91172142, -0.85179817,
         0.72422335, -0.99863239,  0.97232844],
       [ 0.9957403 , -0.98587822, -0.37239783,  0.74462708, -0.90732804,
         0.58312442,  0.19085547,  0.98925926],
       [-0.99999207, -0.99594525,  0.88506349, -0.99999001,  0.93484583,
         0.99916708,  0.9940741 , -0.99455887],
       [-0.23154603,  0.94671637, -0.99992379, -0.99769725, -0.72710839,
        -0.98053198, -0.97426395, -0.91914583],
       [ 0.99716018, -0.99558149

In [None]:
import gensim.downloader
# Pre-trained word embeddings used later to turn tokens into vectors.
glove_vectors = gensim.downloader.load('glove-twitter-200')     # GloVe vectors trained on Twitter, dimension 200



In [None]:
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz   # download
!tar -xf aclImdb_v1.tar.gz                                              # unzip / untar

--2022-02-10 11:23:48--  https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2022-02-10 11:23:56 (11.7 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



In [None]:
import nltk
# Fetch NLTK's 'stopwords' corpus; the English list from it is loaded in a later cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import os
import pandas as pd


def load_samples(dir):
    """Read every file in a directory and return their contents as strings.

    Args:
        dir: path of a directory whose entries are all readable text files.

    Returns:
        A list with one string per file (the file's full text), ordered by
        file name so the result is identical on every run and platform.
        An empty directory yields an empty list.
    """
    samples = []
    # os.listdir returns entries in arbitrary order; sort them for reproducibility.
    for file in sorted(os.listdir(dir)):
        # Decode explicitly as UTF-8 rather than with the platform default
        # encoding, so non-ASCII text is read the same way everywhere.
        with open(os.path.join(dir, file), encoding='utf-8') as f:
            samples.append(f.read())
    return samples

# Raw review texts; the IMDB "test" split is used here as the validation set.
train_pos_samples = load_samples('aclImdb/train/pos')
train_neg_samples = load_samples('aclImdb/train/neg')
val_pos_samples = load_samples('aclImdb/test/pos')
val_neg_samples = load_samples('aclImdb/test/neg')

# Pair every review with its sentiment label (1 = positive, 0 = negative), positives first.
train_samples = list(zip(train_pos_samples, [1] * len(train_pos_samples)))
train_samples.extend(zip(train_neg_samples, [0] * len(train_neg_samples)))
val_samples = list(zip(val_pos_samples, [1] * len(val_pos_samples)))
val_samples.extend(zip(val_neg_samples, [0] * len(val_neg_samples)))

# One row per review: the text and its label.
df_train = pd.DataFrame(data=train_samples, columns=['text', 'label'])
df_val = pd.DataFrame(data=val_samples, columns=['text', 'label'])

In [None]:
from nltk.corpus import stopwords

# English stop words, consulted once per token by the text-cleaning function.
en_stopwords = set(stopwords.words('english'))      # a set (not a list) gives O(1) average-case membership tests

In [None]:
import string
import re

def clean_and_tokenize(text, stop_words=None):
    """Normalise a raw review string and split it into word tokens.

    Steps: lowercase, turn ``<br />`` markers and every punctuation character
    into spaces, split on whitespace, then drop stop words and tokens of one
    or two characters.

    Args:
        text: raw review text.
        stop_words: optional collection of lowercase words to discard.
            Defaults to the notebook-level ``en_stopwords`` set.

    Returns:
        List of cleaned tokens, in their original order.
    """
    if stop_words is None:
        stop_words = en_stopwords

    text = text.lower()
    # Swap the HTML line break for a space (not ''), otherwise the words on
    # either side of it are fused into a single token.
    text = text.replace('<br />', ' ')

    # First stop-word pass, before punctuation is stripped, so that entries
    # which themselves contain punctuation (e.g. contractions) match whole.
    words = [w for w in text.split() if w not in stop_words]

    # Every punctuation character becomes a space ("boys-girls" -> "boys girls").
    punct_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    text = ' '.join(words).translate(punct_to_space)

    # split() with no argument already collapses whitespace runs and ignores
    # leading/trailing blanks, so no separate regex or strip step is needed.
    # Second stop-word pass: catches words that had punctuation attached
    # ("this," / "the.") and therefore slipped through the first pass.
    return [w for w in text.split() if len(w) > 2 and w not in stop_words]


def tokens_to_vectors(tokens):
    """Look up the embedding of every in-vocabulary token and stack them row-wise.

    Tokens missing from the notebook-level ``glove_vectors`` are skipped.

    Args:
        tokens: iterable of word strings.

    Returns:
        Array with one row per token found in ``glove_vectors``, in input order.

    Raises:
        ValueError: raised by ``np.stack`` when no token is in the vocabulary.
    """
    embedded = []
    for word in tokens:
        if word in glove_vectors:
            embedded.append(glove_vectors[word])
    return np.stack(embedded, axis=0)

# Sanity-check the preprocessing pipeline on a single hand-picked review.
text = 'I caught this movie on my local movie channel, and i rather enjoyed watching the film. It has all the elements of a good teen film, and more - this film, aside from dealing with boys-girls relationships and sex and the like, also deals with the issue of steroid use by young people.'
tokens = clean_and_tokenize(text)       # list of cleaned word strings
vecs = tokens_to_vectors(tokens)        # one embedding row per in-vocabulary token
print(vecs.shape)                       # (number of kept tokens, embedding dimension)
vecs

(28, 200)


array([[-0.15818  ,  0.14474  , -0.048466 , ..., -0.032991 , -0.072195 ,
        -0.13176  ],
       [ 0.049906 ,  0.65401  , -0.16842  , ...,  0.14038  ,  0.26025  ,
         0.13309  ],
       [-0.47072  ,  0.24035  , -0.087093 , ...,  0.034878 , -0.037317 ,
         0.12169  ],
       ...,
       [ 0.0033451,  0.18548  ,  0.11773  , ..., -0.080602 ,  0.13323  ,
         0.10056  ],
       [-0.30089  , -0.26255  , -0.05564  , ..., -0.2641   , -0.35714  ,
        -0.90862  ],
       [ 0.011112 ,  0.4785   , -0.092164 , ...,  0.11395  , -0.20854  ,
         0.057258 ]], dtype=float32)

In [None]:
# Tokenise every review of both splits ...
train_sent_tokens = list(map(clean_and_tokenize, df_train['text'].tolist()))
val_sent_tokens = list(map(clean_and_tokenize, df_val['text'].tolist()))

# ... then replace each token list by its matrix of word embeddings.
train_vecs = list(map(tokens_to_vectors, train_sent_tokens))
val_vecs = list(map(tokens_to_vectors, val_sent_tokens))

In [None]:
import torch
from torch import nn


class RNNLayer(nn.Module):
    """A single vanilla (tanh) recurrent layer followed by a linear output projection.

    For each time step ``t``::

        h_t = tanh(x_t @ weight_i + weight_h @ h_{t-1} + bias)

    with ``h_0 = 0``; the stacked hidden states are finally multiplied by
    ``weight_out``.  The layer processes one unbatched sequence at a time.
    """

    def __init__(self, in_features, out_features):
        """Create the parameters for ``in_features``-sized inputs and ``out_features``-sized states."""
        super().__init__()
        self.out_features = out_features
        self.activation = nn.Tanh()

        # Learnable parameters (registration order is kept: weight_i, weight_h, weight_out, bias).
        self.weight_i = nn.Parameter(torch.zeros(in_features, out_features))
        self.weight_h = nn.Parameter(torch.zeros(out_features, out_features))
        self.weight_out = nn.Parameter(torch.zeros(out_features, out_features))
        self.bias = nn.Parameter(torch.zeros(out_features))

        # Xavier-normal initialisation for the three weight matrices; the bias stays at zero.
        for weight in (self.weight_i, self.weight_h, self.weight_out):
            nn.init.xavier_normal_(weight.data)

    def forward(self, in_sequence):
        """Run the recurrence over a sequence.

        Args:
            in_sequence: tensor of shape (num_tokens, in_features).

        Returns:
            Tensor of shape (num_tokens, out_features).
        """
        num_steps = in_sequence.size(0)
        device = in_sequence.device

        # The input projection does not depend on the hidden state, so do it for all steps at once.
        projected_inputs = in_sequence @ self.weight_i                  # (num_tokens, out_features)

        hidden = torch.zeros(self.out_features, device=device)         # h_0 = 0
        states = torch.zeros(num_steps, self.out_features, device=device)

        for step in range(num_steps):
            recurrent_term = self.weight_h @ hidden
            hidden = self.activation(projected_inputs[step] + recurrent_term + self.bias)
            states[step] = hidden

        return states @ self.weight_out

# Smoke test: a random 10-token sequence of 200-d embeddings should map to 10 vectors of size 256.
layer = RNNLayer(in_features=200, out_features=256)     # initialise the weights and build the layer
in_seq = torch.rand(10, 200)                            # (num_tokens, input_emb_dim)
layer(in_seq).shape                                     # (num_tokens, output_emb_dim)

torch.Size([10, 256])

In [None]:
class RNNModel(nn.Module):
    """Stack of ``RNNLayer`` blocks topped by a one-unit linear head for binary classification.

    Args:
        in_channels: size of each input token embedding.
        num_channels: iterable with the output size of every recurrent layer, in order.
    """

    def __init__(self, in_channels, num_channels):
        super().__init__()
        # Consecutive pairs of widths give the (input, output) size of each layer.
        widths = [in_channels, *num_channels]
        stack = [RNNLayer(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        self.layers = nn.Sequential(*stack)
        self.linear = nn.Linear(widths[-1], 1)      # single logit: binary classification

    def forward(self, in_seq):
        """Return the raw (pre-sigmoid) logit for one sequence as a 0-dim tensor."""
        hidden_states = self.layers(in_seq)             # (num_tokens, emb_dim)
        last_state = hidden_states[-1]                  # last token's state summarises the text, (emb_dim,)
        logit = self.linear(last_state.unsqueeze(0))    # (1, emb_dim) -> (1, 1)
        return logit.squeeze()                          # drop both singleton axes


# Smoke test: an untrained model should output a probability-like value after the sigmoid.
model = RNNModel(in_channels=200, num_channels=[512, 512])
model(torch.rand(10, 200)).sigmoid()

tensor(0.4811, grad_fn=<SigmoidBackward0>)

In [None]:
from torch.utils.data import Dataset, DataLoader

class ReviewDataset(Dataset):
    """Map-style dataset that pairs the i-th entry of ``vectors`` with the i-th entry of ``labels``."""

    def __init__(self, vectors, labels):
        """Keep references to the two parallel, index-aligned sequences."""
        self.vectors, self.labels = vectors, labels

    def __len__(self):
        """Number of samples, taken from ``vectors``."""
        return len(self.vectors)

    def __getitem__(self, idx):
        """Return the ``(vectors[idx], labels[idx])`` pair."""
        sample = self.vectors[idx]
        target = self.labels[idx]
        return sample, target

# Wrap the per-review embedding matrices and their labels as datasets.
train_ds = ReviewDataset(vectors=train_vecs, labels=df_train['label'].tolist())
val_ds = ReviewDataset(vectors=val_vecs, labels=df_val['label'].tolist())

# batching problem: reviews are fed one at a time (batch_size=1)
train_loader = DataLoader(train_ds, shuffle=True, batch_size=1)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=1)

# Peek at one training sample: its embedding-matrix shape and its label.
vec, label = train_ds[13000]
print(vec.shape, label)

(250, 200) 0


In [None]:
import torch.nn.functional as F
from tqdm import tqdm


# Use the GPU when one is available; otherwise train on the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Two stacked 512-unit recurrent layers on top of 200-dimensional token embeddings.
model = RNNModel(200, [512, 512])
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

n_epochs = 10

for i in range(n_epochs):
    print(f'Epoch{i}')
    pbar = tqdm(train_loader)
    for vecs, labels in pbar:
        # Every batch holds exactly one review, so index 0 strips the batch axis.
        vecs = vecs.to(device)
        target = labels[0].to(device).float()

        optimizer.zero_grad()               # clear gradients left over from the previous step
        logits = model(vecs[0])
        loss = F.binary_cross_entropy_with_logits(logits, target)
        loss.backward()
        optimizer.step()

        pbar.set_description(f'Loss: {loss.item():.4f}')

Epoch0


Loss: 0.2581: 100%|██████████| 25000/25000 [43:24<00:00,  9.60it/s]


Epoch1


Loss: 0.2430:  33%|███▎      | 8165/25000 [14:02<28:57,  9.69it/s]


KeyboardInterrupt: ignored