In [1]:
import re # for regular expressions
import random # we used this for setting the seed => reproducibility
from collections import Counter # used to build vocabulary => counting how often each word appears
import numpy as np # used for reproducibility as well
import pandas as pd # read csv => our dataset is in csv format
import torch # => moves data to cpu/gpu
import torch.nn as nn # neural networks building blocks
from torch.utils.data import Dataset, DataLoader # to feed text to the model
from sklearn.model_selection import train_test_split  # we want to split data into training, validation, and testing
from sklearn.metrics import accuracy_score, f1_score # to measure accuracy and how good our model is 

In [2]:
SEED = 42  # conventional default seed (a nod to The Hitchhiker's Guide to the Galaxy)


def set_seed(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed handed unchanged to each generator.

    The same value is applied, in order, to Python's ``random`` module,
    NumPy's global generator, PyTorch, and all CUDA devices.
    """
    seeders = (
        random.seed,                 # Python standard library
        np.random.seed,              # NumPy global generator
        torch.manual_seed,           # PyTorch
        torch.cuda.manual_seed_all,  # all CUDA devices (kept even on CPU-only machines)
    )
    for apply_seed in seeders:
        apply_seed(seed)


set_seed(SEED)
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
def load_agnews(train_path="data/train.csv"):
    """Load an AG News style CSV and return a clean ``text`` / ``label`` frame.

    The file's first row is skipped (treated as a header) and only the first
    three columns are used, interpreted as label, title and description.

    Args:
        train_path: Path of the CSV file to read.

    Returns:
        A DataFrame with two columns:
        ``text``  - title and description joined by a space, stripped;
        ``label`` - the integer label shifted down by one, so labels that
        start at 1 in the file start at 0 here.
        Rows without a label, or whose title and description are both
        missing/blank, are dropped. The original row index is kept.
    """
    raw = pd.read_csv(train_path, header=None, skiprows=1)
    raw = raw.iloc[:, :3]  # only the first three columns are used
    raw.columns = ["label", "title", "description"]

    # Missing values must be handled BEFORE any casting: astype(str) would turn
    # NaN into the literal text "nan", and astype(int) raises on a NaN label.
    raw = raw.dropna(subset=["label"])
    title = raw["title"].fillna("").astype(str)
    description = raw["description"].fillna("").astype(str)

    # The title alone may be too short and the description may lack context,
    # so both are joined into a single text field.
    text = (title + " " + description).str.strip()
    # Shift labels so they start at 0.
    label = raw["label"].astype(int) - 1

    cleaned = pd.DataFrame({"text": text, "label": label}, index=raw.index)
    # Drop rows that carry no text at all.
    return cleaned[cleaned["text"] != ""]
# Load the training CSV once so its structure can be inspected in the next cells.
df = load_agnews("data/train.csv")
df.head(3)

Unnamed: 0,text,label
0,Wall St. Bears Claw Back Into the Black (Reute...,2
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2


In [4]:
# Sanity check: frame shape, number of distinct labels, and the sorted label values.
df.shape, df["label"].nunique(), sorted(df["label"].unique())

((120000, 2), 4, [0, 1, 2, 3])

In [5]:
# The same three values as the previous cell, printed one per line with a caption.
print("Shape:", df.shape)
print("Num classes:", df["label"].nunique())
print("Labels:", sorted(df["label"].unique()))

Shape: (120000, 2)
Num classes: 4
Labels: [0, 1, 2, 3]


In [6]:
# Load both CSV files with the same loader so they share the (text, label) columns.
train_all = load_agnews("data/train.csv")
test_df   = load_agnews("data/test.csv")

train_all.shape, test_df.shape

((120000, 2), (7600, 2))

In [7]:
def split_train_val(df, val_size=0.1, seed=SEED):
    """Split ``df`` into a training part and a validation part.

    Args:
        df: Frame with a ``label`` column used for stratification.
        val_size: Passed to ``train_test_split`` as ``test_size``
            (default 0.1, i.e. 10% validation / 90% training).
        seed: Random state for the split.

    Returns:
        ``(train_df, val_df)``, each with a fresh 0..n-1 index.
    """
    train_part, val_part = train_test_split(
        df,
        test_size=val_size,
        random_state=seed,
        stratify=df["label"],  # keep class proportions similar in both parts
    )
    train_part = train_part.reset_index(drop=True)
    val_part = val_part.reset_index(drop=True)
    return train_part, val_part
# Hold out 10% of the training data for validation; the test set stays untouched.
train_df, val_df = split_train_val(train_all, val_size=0.1)
train_df.shape, val_df.shape, test_df.shape

((108000, 2), (12000, 2), (7600, 2))

In [8]:
# Fraction of training samples per label, ordered by label id.
train_df["label"].value_counts(normalize=True).sort_index()

label
0    0.25
1    0.25
2    0.25
3    0.25
Name: proportion, dtype: float64

In [9]:
# Fraction of validation samples per label, ordered by label id.
val_df["label"].value_counts(normalize=True).sort_index()

label
0    0.25
1    0.25
2    0.25
3    0.25
Name: proportion, dtype: float64

In [10]:
# Fraction of test samples per label, ordered by label id.
test_df["label"].value_counts(normalize=True).sort_index()

label
0    0.25
1    0.25
2    0.25
3    0.25
Name: proportion, dtype: float64

In [11]:
def tokenize(text):
    """Lower-case ``text`` and split it into tokens.

    A token is a maximal run of the characters ``a-z``, ``0-9`` and the
    apostrophe; every other character acts as a separator and is discarded.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance.
    """
    token_pattern = r"[a-z0-9']+"
    lowered = text.lower()
    return re.findall(token_pattern, lowered)

In [12]:
PAD_TOKEN = "<pad>" # fills sequences up to a common length => id 0
UNK_TOKEN = "<unk>" # stands in for unknown (out-of-vocabulary) words => id 1

def build_vocab(texts, max_vocab=30000, min_freq=1):
    """Build a token -> id mapping from an iterable of texts.

    Ids 0 and 1 are reserved for ``PAD_TOKEN`` and ``UNK_TOKEN``. The remaining
    ids are assigned to tokens from most to least frequent.

    Args:
        texts: Iterable of strings; each one is split with ``tokenize``.
        max_vocab: Upper bound on the vocabulary size, counting the two
            special tokens.
        min_freq: Tokens occurring fewer than this many times are left out.

    Returns:
        A dict mapping each kept token to its integer id.
    """
    # Count every token occurrence over the whole corpus.
    frequencies = Counter(tok for text in texts for tok in tokenize(text))

    vocab = {PAD_TOKEN: 0, UNK_TOKEN: 1}
    # most_common() yields tokens from most to least frequent, so frequent
    # words are kept and rare ones fall off the end.
    for word, count in frequencies.most_common():
        if len(vocab) >= max_vocab:
            break
        if count >= min_freq:
            vocab[word] = len(vocab)
    return vocab

# The vocabulary is built from the training split only.
vocab = build_vocab(train_df["text"], max_vocab=30000) 
len(vocab), list(vocab.items())[:10] # show vocab size and the first few entries

(30000,
 [('<pad>', 0),
  ('<unk>', 1),
  ('the', 2),
  ('to', 3),
  ('a', 4),
  ('of', 5),
  ('in', 6),
  ('and', 7),
  ('on', 8),
  ('for', 9)])

In [13]:
def numericalize(tokens, vocab):
    """Translate tokens into their vocabulary ids.

    Args:
        tokens: Iterable of token strings.
        vocab: Mapping from token to id; it must contain ``UNK_TOKEN``.

    Returns:
        A list of ids, one per token. Tokens missing from ``vocab`` get the
        id of ``UNK_TOKEN``.
    """
    fallback_id = vocab[UNK_TOKEN]
    ids = []
    for token in tokens:
        ids.append(vocab.get(token, fallback_id))
    return ids

In [14]:
class NewsDataset(Dataset):
    """Dataset that turns rows of a ``text`` / ``label`` frame into tensors.

    Each item is a triple ``(x, y, length)``:
    ``x``      - ``max_len`` token ids (truncated or right-padded with the pad id),
    ``y``      - the row's label,
    ``length`` - number of real (non-padding) ids in ``x``.
    All three are ``torch.long`` tensors.
    """

    def __init__(self, df, vocab, max_len=200):
        """Store texts and labels as plain lists, plus the encoding settings.

        Args:
            df: Frame with ``text`` and ``label`` columns.
            vocab: Token -> id mapping containing ``PAD_TOKEN``.
            max_len: Fixed length of every returned id sequence.
        """
        self.vocab = vocab
        self.max_len = max_len
        self.pad_id = vocab[PAD_TOKEN]
        self.texts = df["text"].tolist()    # texts[i] is the i-th article
        self.labels = df["label"].tolist()  # labels[i] is its label

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize, encode, truncate and pad the sample at position ``idx``."""
        token_ids = numericalize(tokenize(self.texts[idx]), self.vocab)
        token_ids = token_ids[:self.max_len]  # cut sequences that are too long
        n_real = len(token_ids)               # length before padding

        missing = self.max_len - n_real
        if missing > 0:  # right-pad sequences that are too short
            token_ids = token_ids + [self.pad_id] * missing

        x = torch.tensor(token_ids, dtype=torch.long)
        y = torch.tensor(self.labels[idx], dtype=torch.long)
        length = torch.tensor(n_real, dtype=torch.long)
        return x, y, length

In [15]:
def make_loaders(train_df, val_df, test_df, vocab, max_len=200, batch_size=64):
    """Wrap the three splits in ``DataLoader`` objects.

    Args:
        train_df, val_df, test_df: Frames accepted by ``NewsDataset``.
        vocab: Token -> id mapping shared by all three datasets.
        max_len: Sequence length passed to ``NewsDataset``.
        batch_size: Number of samples per batch.

    Returns:
        ``(train_loader, val_loader, test_loader)``. Only the training loader
        shuffles its samples.
    """
    def _loader(frame, shuffle):
        # One dataset + loader pair with the shared settings.
        dataset = NewsDataset(frame, vocab, max_len)
        return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

    train_loader = _loader(train_df, True)
    val_loader = _loader(val_df, False)
    test_loader = _loader(test_df, False)
    return train_loader, val_loader, test_loader
train_loader, val_loader, test_loader = make_loaders(train_df, val_df, test_df, vocab, max_len=200, batch_size=64)

# Peek at one training batch: x holds batch_size rows of max_len token ids,
# y holds one label per sample, and lengths holds each sample's unpadded token count.
x, y, lengths = next(iter(train_loader))
x.shape, y.shape, lengths.shape, y[:10], lengths[:10]


(torch.Size([64, 200]),
 torch.Size([64]),
 torch.Size([64]),
 tensor([0, 2, 1, 2, 3, 2, 0, 2, 1, 2]),
 tensor([39, 37, 37, 39, 45, 50, 49, 49, 41, 33]))

In [16]:
def prepare_agnews_core(train_csv_path="data/train.csv", test_csv_path="data/test.csv", max_vocab=30000, max_len=200, batch_size=64, seed=SEED, val_size=0.1):
    """Run the whole data pipeline: seed, load, split, build vocab, make loaders.

    The split and loader construction reuse ``split_train_val`` and
    ``make_loaders`` instead of repeating their code inline.

    Args:
        train_csv_path: CSV used for the training and validation splits.
        test_csv_path: CSV used for the test split.
        max_vocab: Maximum vocabulary size passed to ``build_vocab``.
        max_len: Fixed sequence length of every sample.
        batch_size: Number of samples per batch.
        seed: Seed for ``set_seed`` and for the train/validation split.
        val_size: Share of the training CSV held out for validation.

    Returns:
        ``(vocab, train_loader, val_loader, test_loader,
        (train_df, val_df, test_df))``.
    """
    set_seed(seed)
    train_all = load_agnews(train_csv_path)
    test_df = load_agnews(test_csv_path).reset_index(drop=True)

    # Stratified split; both parts come back with a fresh index.
    train_df, val_df = split_train_val(train_all, val_size=val_size, seed=seed)

    # The vocabulary is built from the training split only.
    vocab = build_vocab(train_df["text"], max_vocab=max_vocab)

    train_loader, val_loader, test_loader = make_loaders(
        train_df, val_df, test_df, vocab,
        max_len=max_len, batch_size=batch_size
    )
    return vocab, train_loader, val_loader, test_loader, (train_df, val_df, test_df)

In [17]:
def compute_metrics(y_true, y_pred):
    """Score predictions against the true labels.

    Args:
        y_true: True labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        ``(accuracy, macro_f1)`` as computed by ``accuracy_score`` and
        ``f1_score(..., average="macro")``.
    """
    return (
        accuracy_score(y_true, y_pred),
        f1_score(y_true, y_pred, average="macro"),
    )

In [18]:
def run_epoch(model, loader, optimizer=None, device=device):
    """Run one pass over ``loader``, training or evaluating ``model``.

    Passing an optimizer means "train": the model is put in training mode and
    its weights are updated after every batch. With ``optimizer=None`` the
    model is put in evaluation mode and autograd is switched off, so no
    computation graph is built during validation/testing.

    Args:
        model: Module called as ``model(x, lengths)`` and returning one row
            of class logits per sample.
        loader: Iterable yielding ``(x, y, lengths)`` batches.
        optimizer: Optimizer to step, or ``None`` for evaluation.
        device: Device every batch is moved to.

    Returns:
        ``(avg_loss, acc, f1)``: the per-sample mean cross-entropy loss plus
        the accuracy and macro F1 from ``compute_metrics``.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    is_train = optimizer is not None
    model.train(is_train)  # train(False) is the same as eval()
    loss_fn = nn.CrossEntropyLoss()  # multi-class classification
    total_loss = 0.0
    n_seen = 0
    all_preds, all_labels = [], []

    # Gradients are only tracked while training.
    with torch.set_grad_enabled(is_train):
        for x, y, lengths in loader:
            x, y, lengths = x.to(device), y.to(device), lengths.to(device)
            if is_train:
                optimizer.zero_grad()  # clear gradients left by the previous batch
            logits = model(x, lengths)  # raw, unnormalised class scores
            loss = loss_fn(logits, y)
            if is_train:
                loss.backward()   # compute gradients
                optimizer.step()  # update the weights

            # loss is the batch mean; weight it by the batch size so the final
            # average is per sample even when the last batch is smaller.
            batch_size = x.size(0)
            total_loss += loss.item() * batch_size
            n_seen += batch_size

            preds = torch.argmax(logits, dim=1)  # class with the highest score
            all_preds.extend(preds.detach().cpu().tolist())
            all_labels.extend(y.detach().cpu().tolist())

    if n_seen == 0:
        raise ValueError("loader yielded no samples")

    # Normalise by the samples actually processed, not len(loader.dataset):
    # the two differ when the loader drops or subsamples data.
    avg_loss = total_loss / n_seen
    acc, f1 = compute_metrics(all_labels, all_preds)
    return avg_loss, acc, f1

In [19]:
def train_model(model, train_loader, val_loader, epochs=6, lr=1e-3, device=device):
    """Train ``model`` with Adam and restore the weights of its best epoch.

    After every epoch the model is evaluated on ``val_loader``. The weights of
    the epoch with the highest validation accuracy are snapshotted and loaded
    back into the model before returning.

    Args:
        model: Module to train; it is moved to ``device``.
        train_loader: Batches used for training.
        val_loader: Batches used for validation.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for Adam.
        device: Device used for training and evaluation.

    Returns:
        ``(model, history)``. ``history`` holds one row per epoch:
        ``[epoch, tr_loss, tr_acc, tr_f1, va_loss, va_acc, va_f1]``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_val_acc = -1  # best validation accuracy seen so far
    best_state = None  # snapshot of the weights that achieved it
    history = []

    for epoch in range(1, epochs + 1):
        tr_loss, tr_acc, tr_f1 = run_epoch(model, train_loader, optimizer, device)
        va_loss, va_acc, va_f1 = run_epoch(model, val_loader, None, device)
        # kept for plotting later on
        history.append([epoch, tr_loss, tr_acc, tr_f1, va_loss, va_acc, va_f1])

        print(f"Epoch {epoch:02d} | "
              f"train loss {tr_loss:.4f} acc {tr_acc:.4f} f1 {tr_f1:.4f} || "
              f"val loss {va_loss:.4f} acc {va_acc:.4f} f1 {va_f1:.4f}")
        if va_acc > best_val_acc:
            best_val_acc = va_acc
            # clone() is essential: state_dict() returns references to the live
            # parameters and .cpu() does not copy tensors already on the CPU, so
            # without it later epochs would overwrite this "best" snapshot.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    # No snapshot exists when no epoch ran (epochs <= 0) or no epoch improved
    # on the initial value; keep the current weights in that case.
    if best_state is not None:
        model.load_state_dict(best_state)
    return model, history

In [20]:
def prepare_core(train_csv_path="data/train.csv", test_csv_path="data/test.csv", val_size=0.1, max_vocab=30000, max_len=200, batch_size=64, seed=SEED):
    """Build the vocabulary and the three data loaders.

    Thin wrapper around ``prepare_agnews_core``, which performs the same
    pipeline (seed, load, stratified split, vocabulary, loaders). This
    function only discards the data frames that one additionally returns.

    Args:
        train_csv_path: CSV used for the training and validation splits.
        test_csv_path: CSV used for the test split.
        val_size: Share of the training CSV held out for validation.
        max_vocab: Maximum vocabulary size.
        max_len: Fixed sequence length of every sample.
        batch_size: Number of samples per batch.
        seed: Seed for the random number generators and the split.

    Returns:
        ``(vocab, train_loader, val_loader, test_loader)``.
    """
    # Arguments go by keyword because the two functions order them differently.
    vocab, train_loader, val_loader, test_loader, _frames = prepare_agnews_core(
        train_csv_path=train_csv_path,
        test_csv_path=test_csv_path,
        max_vocab=max_vocab,
        max_len=max_len,
        batch_size=batch_size,
        seed=seed,
        val_size=val_size,
    )
    return vocab, train_loader, val_loader, test_loader