In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

if torch.cuda.is_available():
    print("Good to go!")
    DEVICE = torch.device("cuda")
else:
    print("Please set GPU via Edit -> Notebook Settings.")
    DEVICE = torch.device("cpu")

Good to go!


In [3]:
import torch
import json
import numpy as np
import tiktoken
import torch.nn.functional as F
from sklearn.model_selection import train_test_split

class CategoryDataset:

  def __init__(self, data_dir="News_Category_Dataset_v3.json", batch_size=32):

    total_dataset = []
    # load dataset & get all unique words in the category
    with open(data_dir) as f:
        for data in f:
            total_dataset.append(json.loads(data))
    
    # GPT2 Tokenizer to encode text
    tokenizer = tiktoken.get_encoding("gpt2") # vocab_size=50257
    x = [tokenizer.encode(f"{d['headline']} {d['short_description']}".lower()) for d in total_dataset]
    y = [d['category'] for d in total_dataset]
    
    self.cat2idx = self._get_cat2idx(set(y))
    
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1)
  
    self.x_train = x_train
    self.y_train = torch.tensor([self.cat2idx[cat] for cat in y_train])
    self.x_val = x_val
    self.y_val = torch.tensor([self.cat2idx[cat] for cat in y_val])

    self.batch_size = batch_size
    self.num_classes = len(set(self.y_train))
      
    print(f"Training Samples: {len(self.x_train)}")    
    print(f"Validation Samples: {len(self.x_val)}")
    print(f"Categories: {len(set(self.y_train))}")
      
    self.train_start_idx = 0
    self.val_start_idx = 0
    
  @staticmethod   
  def _padding(tokens, max_padding):
    x = []
    for token in tokens:
        padded = torch.cat([torch.tensor(token), torch.zeros(max_padding-len(token)).fill_(50257)], dim=-1)
        x.append(padded)
    return torch.stack(x, dim=0)

  @staticmethod
  def _get_cat2idx(category):
    unique_cat = list(set(category))
    cat2idx = {}
    for idx, cat in enumerate(unique_cat):
      cat2idx[cat] = idx
    return cat2idx

  def get_next_batch(self):
    self.train_start_idx += 1
    if self.train_start_idx >= len(self.x_train):
        self.train_start_idx = 0
    x = torch.tensor(self.x_train[self.train_start_idx:self.train_start_idx+1]).to(torch.long)
    y = self.y_train[self.train_start_idx:self.train_start_idx+1]
    return x, y
  
  def get_next_val_batch(self):
    
    self.val_start_idx += 1
    if self.val_start_idx >= len(self.x_val):
      self.val_start_idx = 0
    
    x = torch.tensor(self.x_val[self.val_start_idx:self.val_start_idx+1]).to(torch.long)
    y = self.y_val[self.val_start_idx:self.val_start_idx+1]

    return x, y
  
ds = CategoryDataset()

Training Samples: 167621
Validation Samples: 41906
Categories: 167621


In [4]:
import torch.nn as nn

class RNNClassifier(nn.Module):
    def __init__(self, vocab_size, input_size=256, hidden_size=512, num_classes=42):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, input_size) 
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size, num_layers=1, batch_first=True)
        
        self.fc = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x, y=None):
        emb = self.embedding(x)
        out, hn = self.rnn(emb)
        logits = self.fc(out.mean(dim=1))
        
        if y is None:
          loss = None
        else:
          loss = F.cross_entropy(logits, y)
        
        return loss, logits

In [5]:
import torch
import torch.optim as optim

learning_rate = 2e-4
iters = 167621 * 10 # epochs = 10

ds = CategoryDataset()

model = RNNClassifier(50257, 256)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

model = model.to(DEVICE)
for i, iter in enumerate(range(iters)):
  x, y = ds.get_next_batch()
  x = x.to(DEVICE)
  y = y.to(DEVICE)
  optimizer.zero_grad()

  loss, _ = model(x, y)
  if i % 10000 == 0:
    print(f"{iter}/{iters} || Loss: {loss}")

  loss.backward()
  optimizer.step()

Training Samples: 167621
Validation Samples: 41906
Categories: 167621
0/167621 || Loss: 3.6858811378479004
10000/167621 || Loss: 0.9011767506599426
20000/167621 || Loss: 1.8772355318069458
30000/167621 || Loss: 4.318739891052246
40000/167621 || Loss: 0.09421233087778091
50000/167621 || Loss: 0.05923443287611008
60000/167621 || Loss: 0.5570294857025146
70000/167621 || Loss: 2.066330909729004
80000/167621 || Loss: 3.658010959625244
90000/167621 || Loss: 2.0739264488220215
100000/167621 || Loss: 1.9398200511932373
110000/167621 || Loss: 0.6870312094688416
120000/167621 || Loss: 0.715524435043335
130000/167621 || Loss: 0.07046086341142654
140000/167621 || Loss: 0.5110667943954468
150000/167621 || Loss: 1.3033519983291626
160000/167621 || Loss: 0.32936033606529236


In [6]:
params = sum(parameter.numel() for parameter in model.parameters())

print(f"Number of Parameters: {params}")

Number of Parameters: 13281578


In [6]:
x_val, y_val = ds.x_val, ds.y_val
total_n = len(y_val)
count = 0

for iter in range(41906): # validation_size(41906) / batch_size(1) = 1309.xx 
  x, y = ds.get_next_val_batch()
  x = x.to(DEVICE)
  y = y.to(DEVICE)

  _, logits = model(x)

  pred = torch.argmax(logits, dim=-1)

  for predict, gt in zip(pred, y):
    if predict == gt:
      count += 1

print(f"RNN Classifier Accuracy: {(count * 100/total_n):.2f}%")

RNN Classifier Accuracy: 59.97%
