In [1]:
!pip install portalocker
!pip install torchmetrics

Collecting portalocker
  Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)
Installing collected packages: portalocker
Successfully installed portalocker-2.8.2
Collecting torchmetrics
  Downloading torchmetrics-1.2.0-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.2/805.2 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.10.0-py3-none-any.whl (24 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.10.0 torchmetrics-1.2.0


In [2]:
import argparse
import logging
import time

import torch
from torch.utils.data import DataLoader
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset
from torchtext.data.utils import get_tokenizer, ngrams_iterator
from torchtext.datasets import DATASETS
from torchtext.prototype.transforms import load_sp_model, PRETRAINED_SP_MODEL, SentencePieceTokenizer
from torchtext.utils import download_from_url
from torchtext.vocab import build_vocab_from_iterator
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torchtext.vocab import GloVe
from tqdm import tqdm

# Debugging aid: makes autograd report which forward operation produced a failing backward.
# NOTE(review): the PyTorch docs warn that anomaly detection slows execution down —
# consider turning it off for full training runs.
torch.autograd.set_detect_anomaly(True)

# Placeholder string marking expressions that still have to be filled in
FILL = '_FILL_'

### Information
- torchtext repo: https://github.com/pytorch/text/tree/main/torchtext
- torchtext documentation: https://pytorch.org/text/stable/index.html

### Constants

In [3]:
# Key into torchtext's DATASETS registry selecting the corpus to load
DATASET = "AG_NEWS"
# Root directory handed to the dataset constructors
DATA_DIR = ".data"
# Device that the batches and the loss module are moved to
DEVICE = "cpu"
# Embedding width; the GloVe vectors loaded further down are 300-dimensional
EMBED_DIM = 300
# Initial learning rate of the SGD optimizers (decayed by the StepLR schedulers)
LR = 4.0
# Number of samples per batch for every DataLoader
BATCH_SIZE = 16
# Number of passes over the training split
NUM_EPOCHS = 5
# Value used to pad short sequences inside a batch
# NOTE(review): assumed to equal the vocabulary index of '<pad>' (listed first among the specials) — confirm
PADDING_VALUE = 0
# Index passed as `padding_idx` to the nn.Embedding layers
PADDING_IDX = PADDING_VALUE

### Get the tokenizer
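- Use torchtext's `basic_english` tokenizer, which lowercases the text and splits it into word-level tokens (punctuation becomes separate tokens).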


In [4]:
basic_english_tokenizer = get_tokenizer("basic_english")

In [5]:
basic_english_tokenizer("This is some text ...")

['this', 'is', 'some', 'text', '.', '.', '.']

In [6]:
# Needed later
TOKENIZER = basic_english_tokenizer

### Get the data and get the vocabulary

In [7]:
def yield_tokens(data_iter):
    """Lazily yield the token list of each (label, text) pair in `data_iter`.

    The label of every pair is ignored; only the text is run through TOKENIZER.
    """
    texts = (text for _label, text in data_iter)
    yield from map(TOKENIZER, texts)

In [8]:
# Build the vocabulary from the tokenized training split.
# '<pad>' and '<unk>' are registered as special tokens.
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
VOCAB = build_vocab_from_iterator(yield_tokens(train_iter), specials=('<pad>', '<unk>'))

# Make the default index the same as that of the unk_token,
# so that looking up a token that is not in the vocabulary yields the '<unk>' index
VOCAB.set_default_index(VOCAB['<unk>'])

In [9]:
len(VOCAB)

95812

### Get GloVe embeddings ... This will be slow ...

In [10]:
GLOVE = GloVe() # 13 min

.vector_cache/glove.840B.300d.zip: 2.18GB [06:49, 5.31MB/s]                            
100%|█████████▉| 2196016/2196017 [04:11<00:00, 8716.53it/s]


In [11]:
len(GLOVE), GLOVE.vectors.shape

(2196017, torch.Size([2196017, 300]))

### Helper functions

In [12]:
def text_pipeline(text):
    """Convert a raw text string into the list of vocabulary indices of its tokens."""
    tokens = TOKENIZER(text)
    return VOCAB(tokens)

def label_pipeline(label):
    """Map a 1-based label (int or numeric string) to a 0-based integer class index."""
    zero_based = int(label)
    zero_based -= 1
    return zero_based

Nice link on collate_fn and DataLoader in PyTorch: https://python.plainenglish.io/understanding-collate-fn-in-pytorch-f9d1742647d3

In [13]:
def collate_batch(batch):
    """Collate a list of raw (label, text) samples into the tensors of one batch.

    Each label is shifted from {1, 2, 3, 4} to {0, 1, 2, 3} by `label_pipeline`, and each
    text is turned into a 1-d int64 tensor of vocabulary indices by `text_pipeline`.
    Because the texts have different lengths, they are right-padded with PADDING_VALUE
    up to the length of the longest text in the batch.

    Args:
        batch: iterable of (label, text) pairs as produced by the dataset.

    Returns:
        A tuple (labels, texts), both int64 tensors on DEVICE; labels has one entry per
        sample and texts has one padded row per sample.
    """
    label_list, text_list = [], []
    for _label, _text in batch:
        # Get the label from {1, 2, 3, 4} to {0, 1, 2, 3}
        label_list.append(label_pipeline(_label))

        # torch.tensor already builds a fresh tensor without autograd history,
        # so no extra clone()/detach() copy is needed.
        text_list.append(torch.tensor(text_pipeline(_text), dtype=torch.int64))

    labels = torch.tensor(label_list, dtype=torch.int64)
    # Pad explicitly with PADDING_VALUE so the padding stays in sync with the
    # padding index used by the embedding layers.
    texts = pad_sequence(text_list, batch_first=True, padding_value=PADDING_VALUE)

    return labels.to(DEVICE), texts.to(DEVICE)

### Get the data

In [14]:
# Count the distinct labels that occur in the training split
train_iter = DATASETS[DATASET](root=DATA_DIR, split="train")
num_class = len({label for label, _ in train_iter})
# What are the classes?
print(f"The number of classes is {num_class} ...")

The number of classes is 4 ...


### Set up the model

Good reference on this type of model
- Recurrent CNN: https://ojs.aaai.org/index.php/AAAI/article/view/9513/9372

In [15]:
class CNN1dTextClassificationModel(nn.Module):
    """Text classifier built from three parallel single-filter 1-d convolutions.

    The token indices are embedded, passed through Conv1d layers with kernel sizes 2, 3
    and 4 (one output channel each), max-pooled over the sequence, concatenated into a
    3-dimensional feature vector and mapped to class scores by a linear layer.

    Args:
        vocab_size: number of rows of the embedding table.
        num_class: number of output classes.
        embed_dim: embedding dimension; must match the GLOVE vector size when
            `use_pretrained` is True.
        use_pretrained: if True, copy the GLOVE vector of every vocabulary token that
            GLOVE knows into the embedding table; otherwise call `init_weights`.
        fine_tune_embeddings: if False, the embedding table is frozen.
    """

    def __init__(
        self,
        vocab_size,
        num_class,
        embed_dim = 300,
        use_pretrained = True,
        fine_tune_embeddings = True
    ):

        super(CNN1dTextClassificationModel, self).__init__()

        # Embedding layer of vocab_size rows and embed_dim vector dimension;
        # the row at PADDING_IDX is the padding vector
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=PADDING_IDX)

        # Define 3 Conv1d layers each having 1 filter and kernel sizes 2, 3 and 4
        self.cnn2 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=2)
        self.cnn3 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=3)
        self.cnn4 = nn.Conv1d(in_channels=embed_dim, out_channels=1, kernel_size=4)

        # One max-pooled feature per convolution -> 3 input features
        self.fc = nn.Linear(3, num_class)

        self.dropout = nn.Dropout(p=0.2)

        self.debug = False

        # The weights are set only after every layer exists, because init_weights()
        # touches self.fc as well as self.embedding.
        if use_pretrained:
            # Copy the GloVe vectors without recording the writes in autograd
            with torch.no_grad():
                for i in range(vocab_size):
                    # Get the token for the index i
                    token = VOCAB.lookup_token(i)
                    # Overwrite row i only if the token is in the stoi dictionary of GLOVE
                    if token in GLOVE.stoi:
                        self.embedding.weight[i, :] = GLOVE.vectors[GLOVE.stoi[token]]
        else:
            # Otherwise, initialize the weights
            self.init_weights()

        # Turn off gradients for the embedding table
        if not fine_tune_embeddings:
            for param in self.embedding.parameters():
                param.requires_grad = False

    def init_weights(self):
        """Initialize embedding and fc weights uniformly in [-0.5, 0.5] and zero the fc bias."""
        initrange = 0.5
        with torch.no_grad():
            # Initialize the embedding weight matrix to uniform between the [-0.5, 0.5]
            self.embedding.weight.uniform_(-initrange, initrange)
            # The uniform fill also overwrote the padding row; restore it to zeros
            if self.embedding.padding_idx is not None:
                self.embedding.weight[self.embedding.padding_idx].zero_()
            # Initialize the weight matrix of fc to uniform between the [-0.5, 0.5]
            self.fc.weight.uniform_(-initrange, initrange)
            # Initialize the bias for fc to zero
            self.fc.bias.zero_()

    # B = batch_size, L = sequence length, D = vector dimension
    def forward(self, text):
        """Compute class scores for a batch of token-index sequences.

        Args:
            text: integer tensor of shape B X L holding vocabulary indices.

        Returns:
            Tensor of shape B X num_class with unnormalized class scores.
        """
        # A convolution needs at least `kernel_size` positions; right-pad batches that
        # are shorter than the largest kernel with the padding index.
        min_len = self.cnn4.kernel_size[0]
        if text.size(1) < min_len:
            text = F.pad(text, (0, min_len - text.size(1)), value=self.embedding.padding_idx)

        # B X L X D
        # Get the embeddings for the text passed in
        embedded = self.embedding(text)

        if self.debug:
            print('embedding', embedded.shape)

        # B X D X L
        # Conv1d expects the channel (embedding) dimension before the sequence dimension
        embedded = embedded.transpose(1, 2)

        # B X 1 X L - 1
        cnn2 = self.cnn2(embedded)
        if self.debug:
            print('cnn2', cnn2.shape)

        # B X 1 X L - 2
        cnn3 = self.cnn3(embedded)
        if self.debug:
            print('cnn3', cnn3.shape)

        # B X 1 X L - 3
        cnn4 = self.cnn4(embedded)
        if self.debug:
            print('cnn4', cnn4.shape)

        # B X 1 in all cases
        # Max pooling over the whole sequence for each of cnn2, cnn3 and cnn4
        cnn2 = F.max_pool1d(cnn2, cnn2.size(2)).squeeze(2)
        cnn3 = F.max_pool1d(cnn3, cnn3.size(2)).squeeze(2)
        cnn4 = F.max_pool1d(cnn4, cnn4.size(2)).squeeze(2)
        if self.debug:
            print('cnn2 after max', cnn2.shape)

        # B X 3
        # Concatenate and add drop out to the result
        # NOTE: no activation function is applied between pooling and the linear layer
        cnn_concat = torch.cat((cnn2, cnn3, cnn4), 1)
        cnn_concat = self.dropout(cnn_concat)
        if self.debug:
            print('cnn concat', cnn_concat.shape)
            # Only report the shapes for the first forward pass
            self.debug = False

        # B X num_class
        out = self.fc(cnn_concat)

        return out

class RecurrentCNNModel(nn.Module):
    """Recurrent CNN text classifier following the equations of the referenced RCNN paper.

    Every token is represented by its left context, its embedding and its right context;
    the concatenation is projected with tanh, max-pooled over the sequence and mapped to
    class scores.

    Args:
        vocab_size: number of rows of the embedding table.
        num_class: number of output classes.
        e: embedding dimension; must match the GLOVE vector size when `use_pretrained`
            is True.
        use_pretrained: if True, copy the GLOVE vector of every vocabulary token that
            GLOVE knows into the embedding table; otherwise call `init_weights`.
        fine_tune_embeddings: if False, the embedding table is frozen.
        debug: if True, the shapes of the intermediate tensors are printed during the
            first forward pass; the flag is reset to False afterwards.
    """

    def __init__(
        self,
        vocab_size,
        num_class = 4,
        e = 300, # embedding dimension
        use_pretrained = True,
        fine_tune_embeddings = True,
        # If true, this will print out the shapes of data in the forward pass for the first batch
        # This will be set to False after the first forward pass
        debug = True
    ):

        super(RecurrentCNNModel, self).__init__()

        # nn.Embedding layer of vocab_size rows and dimension e
        self.embedding = nn.Embedding(vocab_size, e, padding_idx=PADDING_IDX)

        # Context size c and hidden size h
        self.c = 100
        self.h = 100
        self.initrange = 0.5

        if use_pretrained:
            # Copy the GloVe vectors without recording the writes in autograd
            with torch.no_grad():
                for i in range(vocab_size):
                    # Get the token for the index i
                    token = VOCAB.lookup_token(i)
                    # Overwrite row i only if the token is in the stoi dictionary of GLOVE
                    if token in GLOVE.stoi:
                        self.embedding.weight[i, :] = GLOVE[token]
        else:
            # Otherwise, initialize the weights
            self.init_weights()

        if not fine_tune_embeddings:
            # Turn off gradients for the embedding weight
            for param in self.embedding.parameters():
                param.requires_grad = False

        # Recurrent context-to-context maps, used in equations (1) and (2)
        self.Wl = nn.Linear(self.c, self.c)
        self.Wr = nn.Linear(self.c, self.c)

        # Embedding-to-context maps, used in equations (1) and (2)
        self.Wsl = nn.Linear(e, self.c)
        self.Wsr = nn.Linear(e, self.c)

        # Used in equations (4) and (6)
        self.W2 = nn.Linear((2*self.c + e), self.h)
        self.W4 = nn.Linear(self.h, num_class)

        # Defined but not applied in forward()
        self.dropout = nn.Dropout(p=0.5)
        self.relu = nn.ReLU()

        # Honour the constructor argument instead of always disabling the shape printout
        self.debug = debug

    def init_weights(self):
        """Initialize the embedding table uniformly in [-initrange, initrange]."""
        with torch.no_grad():
            self.embedding.weight.uniform_(-self.initrange, self.initrange)
            # The uniform fill also overwrote the padding row; restore it to zeros
            if self.embedding.padding_idx is not None:
                self.embedding.weight[self.embedding.padding_idx].zero_()

    # B = batch_size, L = sequence length, e = vector dimension
    def forward(self, text):
        """Compute class scores for a batch of token-index sequences.

        Args:
            text: integer tensor of shape B X L holding vocabulary indices.

        Returns:
            Tensor of shape B X num_class with unnormalized class scores.
        """
        # B X L X e
        embedded = self.embedding(text)

        N, L, D = embedded.shape

        # B X L X c
        # The embedding projections do not depend on the recurrence, so compute them
        # once for all positions instead of once per time step.
        left_input = self.Wsl(embedded)
        right_input = self.Wsr(embedded)

        # B X c -- context of the first (left) and of the last (right) position
        zero_context = embedded.new_zeros(N, self.c)

        # Left context, equation (1): cl[l] depends on cl[l - 1] and the embedding at l - 1.
        # Collecting the steps in a list avoids in-place writes into a tensor that
        # autograd still needs (and therefore the clones that would require).
        left_steps = [zero_context]
        for l in range(1, L):
            left_steps.append(F.relu(self.Wl(left_steps[-1]) + left_input[:, l - 1, :]))
        # B X L X c
        cl = torch.stack(left_steps, dim=1)

        # Right context, equation (2): cr[l] depends on cr[l + 1] and the embedding at l + 1.
        # The steps are produced from the last position to the first, then reversed.
        right_steps = [zero_context]
        for l in range(L - 2, -1, -1):
            right_steps.append(F.relu(self.Wr(right_steps[-1]) + right_input[:, l + 1, :]))
        right_steps.reverse()
        # B X L X c
        cr = torch.stack(right_steps, dim=1)
        if self.debug:
            print('cr ', cr.shape)

        # B X L X (2c + e)
        # Set x as in the paper; this is equation (3)
        x = torch.cat((cl, embedded, cr), dim=2)
        if self.debug:
            print('x ', x.shape)

        # B X L X h
        # Set y2 as in equation (4)
        y2 = torch.tanh(self.W2(x))
        if self.debug:
            print('y2 ', y2.shape)

        # B X h X L
        y2 = y2.transpose(1, 2)
        if self.debug:
            print('y2 ', y2.shape)

        # B X h
        # Max over the sequence dimension; this is equation (5)
        y3, _ = torch.max(y2, dim=2)
        if self.debug:
            print('y3 ', y3.shape)

        # B X num_class
        # Set y4 from W4 and y3; this is equation (6)
        y4 = self.W4(y3)
        if self.debug:
            print('y4 ', y4.shape)
            # Only report the shapes for the first forward pass
            self.debug = False

        return y4

### Configure the model, loss and optimizer

In [16]:
# If this is True, we will initialize the Embedding layer with GLOVE
# (a trailing comma here would turn the flag into the tuple `(True,)`, which is always truthy)
USE_PRETRAINED = True
# Alias that keeps the original (misspelled) name available
USE_PRETRANED = USE_PRETRAINED

# If this is True, we will allow for gradient updates on the nn.Embedding layer
FINE_TUNE_EMBEDDINGS = True

# Cross-entropy loss over the class scores returned by the models
criterion = torch.nn.CrossEntropyLoss().to(DEVICE)

In [17]:
# Select the Recurrent CNN Model, wiring in the configuration defined above
# instead of silently relying on the constructor defaults
model = RecurrentCNNModel(
    vocab_size=len(VOCAB),
    num_class=num_class,
    e=EMBED_DIM,
    use_pretrained=USE_PRETRANED,
    fine_tune_embeddings=FINE_TUNE_EMBEDDINGS,
).to(DEVICE)

# Set the optimizer to SGD
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Set the scheduler to StepLR with gamma=0.1 and step_size = 1,
# i.e. the learning rate is multiplied by 0.1 on every scheduler step
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

### Set up the data

In [18]:
# Load both splits from the same root directory as the other dataset calls
train_iter, test_iter = DATASETS[DATASET](root=DATA_DIR)
train_dataset = to_map_style_dataset(train_iter)
test_dataset = to_map_style_dataset(test_iter)

# Hold out 5% of the training data for validation; the seeded generator makes the
# split identical on every run
num_train = int(len(train_dataset) * 0.95)
split_generator = torch.Generator().manual_seed(0)
split_train_, split_valid_ = random_split(
    train_dataset,
    [num_train, len(train_dataset) - num_train],
    generator=split_generator,
)

# Only the training loader is shuffled; accuracy on the validation and test sets
# does not depend on the order of the batches
train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

### Train the model

In [19]:
def train(dataloader, model, optimizer, criterion, epoch):
    """Train `model` for one pass over `dataloader`.

    Args:
        dataloader: iterable of (label, text) batches; must support len() for logging.
        model: module mapping a text batch to class scores.
        optimizer: optimizer over the parameters of `model`.
        criterion: loss taking (class scores, labels).
        epoch: epoch number, used only in the progress messages.

    Returns:
        None. The running training accuracy is printed every 100 batches and then reset.
    """
    model.train()
    total_acc, total_count = 0, 0
    log_interval = 100

    # Wrap the dataloader itself (not enumerate) so tqdm can see its length
    for idx, (label, text) in enumerate(tqdm(dataloader)):
        optimizer.zero_grad()
        predicted_label = model(text)

        # Get the loss
        loss = criterion(predicted_label, label)

        # Do back propagation
        loss.backward()

        # Clip the gradient norm at 0.1
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

        # Do an optimization step
        optimizer.step()
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            print(
                "| epoch {:3d} | {:5d}/{:5d} batches "
                "| accuracy {:8.3f}".format(epoch, idx, len(dataloader), total_acc / total_count)
            )
            # Restart the running accuracy for the next logging window
            total_acc, total_count = 0, 0

In [20]:
def evaluate(dataloader, model):
    """Return the classification accuracy of `model` over `dataloader`.

    Args:
        dataloader: iterable of (label, text) batches.
        model: module mapping a text batch to class scores of shape B X num_class.

    Returns:
        Fraction of samples whose highest-scoring class equals the label, as a float;
        0.0 if the dataloader yields no samples.
    """
    model.eval()
    total_acc, total_count = 0, 0

    # No gradients are needed for evaluation
    with torch.no_grad():
        for label, text in dataloader:
            predicted_label = model(text)
            # Count the samples whose arg-max class matches the label
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)

    # Avoid a ZeroDivisionError on an empty dataloader
    if total_count == 0:
        return 0.0
    return total_acc / total_count

In [None]:
# Train the RNNCNN model
# Each epoch: one training pass, one validation pass, then one scheduler step.
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    # Advance the learning-rate schedule once per epoch
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

# Final check on the held-out test split after all epochs have finished
print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

101it [02:45,  1.69s/it]

| epoch   1 |   100/ 7125 batches | accuracy    0.709


201it [05:22,  1.56s/it]

| epoch   1 |   200/ 7125 batches | accuracy    0.848


301it [07:50,  1.41s/it]

| epoch   1 |   300/ 7125 batches | accuracy    0.854


401it [10:35,  1.52s/it]

| epoch   1 |   400/ 7125 batches | accuracy    0.873


501it [13:10,  1.42s/it]

| epoch   1 |   500/ 7125 batches | accuracy    0.873


601it [15:51,  1.60s/it]

| epoch   1 |   600/ 7125 batches | accuracy    0.889


701it [18:22,  1.44s/it]

| epoch   1 |   700/ 7125 batches | accuracy    0.869


801it [21:08,  1.59s/it]

| epoch   1 |   800/ 7125 batches | accuracy    0.884


901it [24:05,  1.87s/it]

| epoch   1 |   900/ 7125 batches | accuracy    0.879


1001it [26:50,  2.03s/it]

| epoch   1 |  1000/ 7125 batches | accuracy    0.882


1082it [28:57,  1.61s/it]


KeyboardInterrupt: ignored

In [21]:
# Make a Conv Text model, wiring in the configuration defined above
# instead of a hard-coded class count and the constructor defaults
model = CNN1dTextClassificationModel(
    vocab_size=len(VOCAB),
    num_class=num_class,
    embed_dim=EMBED_DIM,
    use_pretrained=USE_PRETRANED,
    fine_tune_embeddings=FINE_TUNE_EMBEDDINGS,
).to(DEVICE)

# Set the optimizer to SGD
optimizer = torch.optim.SGD(model.parameters(), lr=LR)

# Set the scheduler to StepLR with gamma=0.1 and step_size = 1,
# i.e. the learning rate is multiplied by 0.1 on every scheduler step
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [25]:
# Train the Conv1d model
# Each epoch: one training pass, one validation pass, then one scheduler step.
for epoch in range(1, NUM_EPOCHS + 1):
    epoch_start_time = time.time()
    train(train_dataloader, model, optimizer, criterion, epoch)
    accu_val = evaluate(valid_dataloader, model)
    # Advance the learning-rate schedule once per epoch
    scheduler.step()
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(epoch, time.time() - epoch_start_time, accu_val)
    )
    print("-" * 59)

# Final check on the held-out test split after all epochs have finished
print("Checking the results of test dataset.")
accu_test = evaluate(test_dataloader, model)
print("test accuracy {:8.3f}".format(accu_test))

102it [00:08, 13.30it/s]

| epoch   1 |   100/ 7125 batches | accuracy    0.743


202it [00:15, 13.79it/s]

| epoch   1 |   200/ 7125 batches | accuracy    0.758


302it [00:23, 13.51it/s]

| epoch   1 |   300/ 7125 batches | accuracy    0.757


402it [00:31, 10.48it/s]

| epoch   1 |   400/ 7125 batches | accuracy    0.749


502it [00:38, 13.50it/s]

| epoch   1 |   500/ 7125 batches | accuracy    0.732


602it [00:46, 13.70it/s]

| epoch   1 |   600/ 7125 batches | accuracy    0.758


702it [00:54, 13.26it/s]

| epoch   1 |   700/ 7125 batches | accuracy    0.755


802it [01:02, 13.43it/s]

| epoch   1 |   800/ 7125 batches | accuracy    0.726


902it [01:09, 10.78it/s]

| epoch   1 |   900/ 7125 batches | accuracy    0.743


1002it [01:17, 13.53it/s]

| epoch   1 |  1000/ 7125 batches | accuracy    0.725


1102it [01:25, 13.17it/s]

| epoch   1 |  1100/ 7125 batches | accuracy    0.768


1202it [01:32, 13.79it/s]

| epoch   1 |  1200/ 7125 batches | accuracy    0.736


1302it [01:40, 13.75it/s]

| epoch   1 |  1300/ 7125 batches | accuracy    0.752


1402it [01:48, 10.22it/s]

| epoch   1 |  1400/ 7125 batches | accuracy    0.724


1435it [01:51, 12.91it/s]


KeyboardInterrupt: ignored

Why do you think this CNN does not do very well on this data? Also, please explain why. (Hint: the answer is fairly short)

This CNN has very little capacity: each of the three convolutions has a single filter, so after max pooling the classifier sees only 3 features per document, and no non-linear activation is applied before the linear layer. The model is therefore probably not complex enough to capture the patterns in the dataset.
In addition, this CNN cannot capture long-range information. The kernel sizes are 2, 3 and 4, and AG_NEWS is a news-topic classification task, so a window of at most 4 words carries only a little information about the topic. That is why the CNN does not do well on this data.
