In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import torch

os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():
    print("Good to go!")
    DEVICE = torch.device("cuda")
else:
    print("Please set GPU via Edit -> Notebook Settings.")
    DEVICE = torch.device("cpu")

Good to go!


# Load Dataset

In [8]:
import json
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

class CategoryDataset:

  def __init__(self, data_dir="News_Category_Dataset_v3.json", batch_size=32):

    total_dataset = []
    # load dataset & get all unique words in the category
    with open(data_dir) as f:
        for data in f:
            total_dataset.append(json.loads(data))
      
    x = [f"{d['headline']} {d['short_description']}".lower() for d in total_dataset]
    y = [d['category'] for d in total_dataset]
    
    # use TFIDF to vectorize the texts
    vectorizer = TfidfVectorizer() 
    x = vectorizer.fit_transform(x)
    
    self.cat2idx = self._get_cat2idx(set(y))
    
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=1)
  
    self.x_train = x_train.toarray()
    self.y_train = np.array([self.cat2idx[cat] for cat in y_train])
    self.x_val = x_val.toarray()
    self.y_val = np.array([self.cat2idx[cat] for cat in y_val])

    self.batch_size = batch_size
    self.num_classes = len(set(self.y_train))
      
    print(f"Training Samples: {len(self.x_train)}")    
    print(f"Validation Samples: {len(self.x_val)}")
    print(f"Categories: {len(set(self.y_train))}")
      
    self.val_start_idx = 0

  @staticmethod
  def _get_cat2idx(category):
    unique_cat = list(set(category))
    cat2idx = {}
    for idx, cat in enumerate(unique_cat):
      cat2idx[cat] = idx
    return cat2idx

  def get_next_batch(self):
    rand_idx = np.random.randint(0, self.x_train.shape[0], (self.batch_size,))
    x = self.x_train[rand_idx]
    y = self.y_train[rand_idx]
    return torch.from_numpy(x).to(torch.float), torch.from_numpy(y)
  
  def get_next_val_batch(self):
    
    self.val_start_idx += self.batch_size
    if self.val_start_idx >= len(self.x_val):
      x = self.x_val[self.val_start_idx: ]
      y = self.y_val[self.val_start_idx: ]
      self.val_start_idx = 0
    else:
      x = self.x_val[self.val_start_idx: self.val_start_idx+self.batch_size]
      y = self.y_val[self.val_start_idx: self.val_start_idx+self.batch_size]

    return torch.from_numpy(x).to(torch.float), torch.from_numpy(y)
  
ds = CategoryDataset()

Training Samples: 167621
Validation Samples: 41906
Categories: 42


# Linear Classifier

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class LinearClassifier(nn.Module):

  def __init__(self, in_dim, hidden_dim=256, num_classes=42):
    super().__init__()
    self.linear1 = nn.Linear(in_dim, hidden_dim)
    self.linear2 = nn.Linear(hidden_dim, hidden_dim)
    self.final_proj = nn.Linear(hidden_dim, num_classes)

    self.relu = nn.ReLU()


  def forward(self, x, y=None):
    """
    Input
    - x: (B, d),
    - y: (B,)
    """
    out = self.relu(self.linear1(x))
    out = self.relu(self.linear2(out))
    logits = self.final_proj(out)

    if y is None:
      loss = None
    else:
      loss = F.cross_entropy(logits, y)

    return loss, logits

### Training

In [5]:
import time
import torch
import torch.optim as optim

learning_rate = 2e-3
iters = 5239 * 10 

ds = CategoryDataset()

model = LinearClassifier(in_dim=ds.x_train.shape[1], num_classes=ds.num_classes)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

model = model.to(DEVICE)
for i, iter in enumerate(range(iters)):
  x, y = ds.get_next_batch()
  x = x.to(DEVICE)
  y = y.to(DEVICE)
  optimizer.zero_grad()

  loss, _ = model(x, y)
  if i % 100 == 0:
    print(f"{iter}/{iters} || Loss: {loss}")

  loss.backward()
  optimizer.step()

(209527, 88507)
Training Samples: 167621
Validation Samples: 41906
Categories: 42
0/52390 || Loss: 3.7425692081451416
100/52390 || Loss: 3.272230863571167
200/52390 || Loss: 2.7504000663757324
300/52390 || Loss: 2.2660880088806152
400/52390 || Loss: 2.1233232021331787
500/52390 || Loss: 2.0287086963653564
600/52390 || Loss: 1.7908399105072021
700/52390 || Loss: 1.9219212532043457
800/52390 || Loss: 1.6207417249679565
900/52390 || Loss: 1.5896824598312378
1000/52390 || Loss: 1.3233721256256104
1100/52390 || Loss: 1.970096230506897
1200/52390 || Loss: 1.5842950344085693
1300/52390 || Loss: 1.7772951126098633
1400/52390 || Loss: 2.064671516418457
1500/52390 || Loss: 1.5487754344940186
1600/52390 || Loss: 1.4786111116409302
1700/52390 || Loss: 1.5240954160690308
1800/52390 || Loss: 1.5251762866973877
1900/52390 || Loss: 1.5602601766586304
2000/52390 || Loss: 1.1026169061660767
2100/52390 || Loss: 1.4163180589675903
2200/52390 || Loss: 1.473536729812622
2300/52390 || Loss: 1.058171153068542

In [6]:
params = sum(parameter.numel() for parameter in model.parameters())

print(f"Number of Parameters: {params}")

Number of Parameters: 22734634


### Testing

In [7]:
x_val, y_val = ds.x_val, ds.y_val
total_n = len(y_val)
count = 0

for iter in range(1310): # validation_size(41906) / batch_size(32) = 1309.xx 
  x, y = ds.get_next_val_batch()
  x = x.to(DEVICE)
  y = y.to(DEVICE)

  _, logits = model(x)

  pred = torch.argmax(logits, dim=-1)

  for predict, gt in zip(pred, y):
    if predict == gt:
      count += 1

print(f"Linear Classifier Accuracy: {(count * 100/total_n):.2f}%")

Linear Classifier Accuracy: 55.80%
