In [1]:
!pip install transformers[sentencepiece]



In [38]:
# Imports
import os
import gc
import time
from pathlib import Path
import json
from warnings import filterwarnings

filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from tqdm.auto import tqdm

import re
import nltk
from nltk.corpus import stopwords

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

from transformers import RobertaTokenizer, RobertaConfig, RobertaModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
nltk.download("stopwords")
STOPWORDS = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
DATASET_LOC = "/content/news_dataset.csv"
df = pd.read_csv(DATASET_LOC)
df

Unnamed: 0,Title,Publisher,DateTime,Link,Category
0,"Chainlink (LINK) Falters, Hedera (HBAR) Wobble...",Analytics Insight,2023-08-30T06:54:49Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business
1,Funds punished for owning too few Nvidia share...,ZAWYA,2023-08-30T07:15:59Z,https://news.google.com/articles/CBMigwFodHRwc...,Business
2,Crude oil prices stalled as hedge funds sold: ...,ZAWYA,2023-08-30T07:31:31Z,https://news.google.com/articles/CBMibGh0dHBzO...,Business
3,Grayscale's Bitcoin Win Is Still Only Half the...,Bloomberg,2023-08-30T10:38:40Z,https://news.google.com/articles/CBMib2h0dHBzO...,Business
4,"I'm a Home Shopping Editor, and These Are the ...",Better Homes & Gardens,2023-08-30T11:00:00Z,https://news.google.com/articles/CBMiPWh0dHBzO...,Business
...,...,...,...,...,...
51297,Slovakia's Election Could Echo in Ukraine. Her...,The New York Times,2023-09-30T04:01:14Z,https://news.google.com/articles/CBMiU2h0dHBzO...,Worldwide
51298,Things to know about the Nobel Prizes - The Wa...,The Washington Post,2023-09-30T04:26:44Z,https://news.google.com/articles/CBMimQFodHRwc...,Worldwide
51299,"After brief calm, protests against killing of ...",Hindustan Times,2023-09-30T04:51:51Z,https://news.google.com/articles/CBMikgFodHRwc...,Worldwide
51300,‘No one is safe’: France vows action as bedbug...,CNN,2023-09-30T04:58:00Z,https://news.google.com/articles/CBMiTmh0dHBzO...,Worldwide


## Prepare Data

In [5]:
def prepare_data(df):
    df = df[["Title", "Category"]]
    df.rename(columns={"Title": "Text"}, inplace=True)
    df, headlines_df = df[df["Category"] != "Headlines"].reset_index(drop=True), df[df["Category"] == "Headlines"].reset_index(drop=True)

    return df, headlines_df

In [6]:
def clean_text(text):
    # lower case the text
    text = text.lower()  # necessary to do before as stopwords are in lower case

    # remove stopwords
    stp_pattern = re.compile(r"\b(" + r"|".join(STOPWORDS) + r")\b\s*")
    text = stp_pattern.sub("", text)

    # custom cleaning
    text = text.strip()  # remove space at start or end if any
    text = re.sub(" +", " ", text)  # remove extra spaces
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove characters that are not alphanumeric

    return text

In [7]:
def tokenize(batch):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    encoded_inputs = tokenizer(batch["Text"].tolist(), return_tensors="np", padding="longest")
    return dict(ids=encoded_inputs["input_ids"], masks=encoded_inputs["attention_mask"], targets=np.array(batch["Category"]))

In [8]:
def preprocess(df):
    """Preprocess the data."""
    df, headlines_df = prepare_data(df)

    # label encoding
    cats = df["Category"].unique().tolist()
    num_classes = len(cats)
    class_to_index = {tag: i for i, tag in enumerate(cats)}
    index_to_class = {v: k for k, v in class_to_index.items()}

    df["Text"] = df["Text"].apply(clean_text)  # clean text
    df = df[["Text", "Category"]]
    df["Category"] = df["Category"].map(class_to_index)  # label encoding
    return df, class_to_index, index_to_class

In [9]:
ds, class_to_index, index_to_class = preprocess(df)
ds

Unnamed: 0,Text,Category
0,chainlink link falters hedera hbar wobbles yet...,0
1,funds punished owning nvidia shares stunning 2...,0
2,crude oil prices stalled hedge funds sold kemp,0
3,grayscale bitcoin win still half battle,0
4,home shopping editor miss labor day deals eyeing,0
...,...,...
44142,slovakia election could echo ukraine expect,6
44143,things know nobel prizes washington post,6
44144,brief calm protests killing 2 students rock im...,6
44145,one safe france vows action bedbugs sweep paris,6


In [10]:
train_ds, val_ds = train_test_split(ds, test_size=0.2, stratify=ds["Category"])

In [11]:
# max_len = 0
# for i in range(train_ds.shape[0]):
#     size = len(train_ds['Text'].reset_index(drop=True)[i])
#     if size > max_len:
#         max_len = size

# max_len

In [12]:
def prepare_input(tokenizer, text):
    inputs = tokenizer.encode_plus(
        text,
        return_tensors=None,
        add_special_tokens=True,
        max_length=50,
        pad_to_max_length=True,
        truncation=True,
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class NewsDataset(Dataset):
    def __init__(self, ds):
        self.texts = ds["Text"].values
        self.labels = ds["Category"].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(tokenizer, self.texts[item])
        labels = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, labels


def collate(inputs):
    max_len = int(inputs["input_ids"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:, :max_len]
    return inputs


tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

i = prepare_input(tokenizer, train_ds["Text"].values[2])
i

{'input_ids': tensor([    0,  9426, 21028, 11687,   907, 11901,  4291,   918,  4403,  1482,
         3361,  3595,   195,   231,   325,     2,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0])}

## Model

In [13]:
class CustomModel(nn.Module):
    def __init__(self, num_classes, change_config=False, dropout_pb=0.0):
        super(CustomModel, self).__init__()
        if change_config:
            pass
        self.model = RobertaModel.from_pretrained("roberta-base")
        self.hidden_size = self.model.config.hidden_size
        self.num_classes = num_classes
        self.dropout_pb = dropout_pb
        self.dropout = torch.nn.Dropout(self.dropout_pb)
        self.fc = nn.Linear(self.hidden_size, self.num_classes)

    def forward(self, inputs):
        output = self.model(**inputs)
        z = self.dropout(output[1])
        z = self.fc(z)
        return z

    @torch.inference_mode()
    def predict(self, inputs):
        self.eval()
        z = self(inputs)
        y_pred = torch.argmax(z, dim=1).cpu().numpy()
        return y_pred

    @torch.inference_mode()
    def predict_proba(self, inputs):
        self.eval()
        z = self(inputs)
        y_probs = F.softmax(z, dim=1).cpu().numpy()
        return y_probs

    def save(self, dp):
        with open(Path(dp, "args.json"), "w") as fp:
            contents = {
                "dropout_pb": self.dropout_pb,
                "hidden_size": self.hidden_size,
                "num_classes": self.num_classes,
            }
            json.dump(contents, fp, indent=4, sort_keys=False)
        torch.save(self.state_dict(), os.path.join(dp, "model.pt"))

    @classmethod
    def load(cls, args_fp, state_dict_fp):
        with open(args_fp, "r") as fp:
            kwargs = json.load(fp=fp)
        llm = RobertaModel.from_pretrained("roberta-base")
        model = cls(llm=llm, **kwargs)
        model.load_state_dict(torch.load(state_dict_fp, map_location=torch.device("cpu")))
        return model

In [14]:
# Initialize model
num_classes = len(ds["Category"].unique())
model = CustomModel(num_classes=num_classes, dropout_pb=0.5)
print(model.named_parameters)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<bound method Module.named_parameters of CustomModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=T

## Training

In [15]:
def train_step(train_loader, model, num_classes, loss_fn, optimizer):
    """Train step."""
    model.train()
    loss = 0.0
    for step, (inputs, labels) in tqdm(enumerate(train_loader)):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()  # reset gradients
        y_pred = model(inputs)  # forward pass
        targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot (for loss_fn)
        J = loss_fn(y_pred, targets)  # define loss
        J.backward()  # backward pass
        optimizer.step()  # update weights
        loss += (J.detach().item() - loss) / (step + 1)  # cumulative loss
    return loss


def eval_step(val_loader, model, num_classes, loss_fn):
    """Eval step."""
    model.eval()
    loss = 0.0
    y_trues, y_preds = [], []
    with torch.inference_mode():
        for step, (inputs, labels) in tqdm(enumerate(val_loader)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            targets = F.one_hot(labels.long(), num_classes=num_classes).float()  # one-hot (for loss_fn)
            J = loss_fn(y_pred, targets).item()
            loss += (J - loss) / (step + 1)
            y_trues.extend(targets.cpu().numpy())
            y_preds.extend(torch.argmax(y_pred, dim=1).cpu().numpy())
    return loss, np.vstack(y_trues), np.vstack(y_preds)

In [16]:
def train_loop():
    # ====================================================
    # loader
    # ====================================================
    train_ds, val_ds = train_test_split(ds, test_size=0.2, stratify=ds["Category"])

    train_dataset = NewsDataset(train_ds)
    valid_dataset = NewsDataset(val_ds)

    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)

    # ====================================================
    # model
    # ====================================================
    num_classes = 7
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = CustomModel(num_classes=num_classes, dropout_pb=0.5)
    model.to(device)

    # ====================================================
    # Training components
    # ====================================================
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", factor=0.8, patience=3)

    # ====================================================
    # loop
    # ====================================================
    max_loss = np.inf

    for epoch in range(10):
        start_time = time.time()

        # Step
        train_loss = train_step(train_loader, model, num_classes, criterion, optimizer)
        val_loss, _, _ = eval_step(valid_loader, model, num_classes, criterion)
        scheduler.step(val_loss)

        # scoring
        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {train_loss:.4f}  avg_val_loss: {val_loss:.4f}  time: {elapsed:.0f}s")

        if max_loss > val_loss:
            max_loss = val_loss
            print(f"Epoch {epoch+1} - Save Best Score: {max_loss:.4f} Model")
            model.save("/content/")

    #     predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth",
    #                              map_location=torch.device('cpu'))['predictions']
    #     valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

In [17]:
train_loop()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 1 - avg_train_loss: 0.1396  avg_val_loss: 0.0845  time: 301s
Epoch 1 - Save Best Score: 0.0845 Model


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 2 - avg_train_loss: 0.0711  avg_val_loss: 0.0699  time: 301s
Epoch 2 - Save Best Score: 0.0699 Model


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 3 - avg_train_loss: 0.0512  avg_val_loss: 0.0674  time: 301s
Epoch 3 - Save Best Score: 0.0674 Model


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 4 - avg_train_loss: 0.0401  avg_val_loss: 0.0625  time: 301s
Epoch 4 - Save Best Score: 0.0625 Model


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 5 - avg_train_loss: 0.0327  avg_val_loss: 0.0663  time: 301s


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 6 - avg_train_loss: 0.0302  avg_val_loss: 0.0721  time: 301s


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 7 - avg_train_loss: 0.0270  avg_val_loss: 0.0681  time: 301s


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 8 - avg_train_loss: 0.0221  avg_val_loss: 0.0695  time: 300s


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 9 - avg_train_loss: 0.0199  avg_val_loss: 0.0684  time: 301s


0it [00:00, ?it/s]

0it [00:00, ?it/s]

Epoch 10 - avg_train_loss: 0.0151  avg_val_loss: 0.0749  time: 300s


## Inference

In [29]:
with open("/content/args.json", "r") as fp:
    kwargs = json.load(fp=fp)

llm = RobertaModel.from_pretrained("roberta-base")
model = CustomModel(num_classes=7)
model.load_state_dict(torch.load("/content/model.pt", map_location=torch.device("cpu")))
model.to(device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomModel(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm

In [34]:
def test_step(test_loader, model, num_classes):
    """Eval step."""
    model.eval()
    y_trues, y_preds = [], []
    with torch.inference_mode():
        for step, (inputs, labels) in tqdm(enumerate(test_loader)):
            inputs = collate(inputs)
            for k, v in inputs.items():
                inputs[k] = v.to(device)
            labels = labels.to(device)
            y_pred = model(inputs)
            y_trues.extend(labels.cpu().numpy())
            y_preds.extend(torch.argmax(y_pred, dim=1).cpu().numpy())
    return np.vstack(y_trues), np.vstack(y_preds)

In [104]:
test_dataset = NewsDataset(val_ds)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True, drop_last=False)

y_true, y_pred = test_step(test_loader, model, 7)

0it [00:00, ?it/s]

In [105]:
print(
    f'Precision: {precision_score(y_true, y_pred, average="weighted")} \n Recall: {recall_score(y_true, y_pred, average="weighted")} \n F1: {f1_score(y_true, y_pred, average="weighted")} \n Accuracy: {accuracy_score(y_true, y_pred)}'
)

Precision: 0.971201571786853 
 Recall: 0.9708946772366931 
 F1: 0.9709574157101106 
 Accuracy: 0.9708946772366931


## Prediction

In [111]:
sample = 0

cats = df["Category"].unique().tolist()
num_classes = len(cats)
class_to_index = {tag: i for i, tag in enumerate(cats)}
index_to_class = {v: k for k, v in class_to_index.items()}

label = test_dataset.__getitem__(sample)[1].item()
input_ids = torch.unsqueeze(test_dataset.__getitem__(sample)[0]["input_ids"], 0).to(device)
attention_masks = torch.unsqueeze(test_dataset.__getitem__(sample)[0]["attention_mask"], 0).to(device)
test_sample = dict(input_ids=input_ids, attention_mask=attention_masks)

with torch.no_grad():
    y_pred_test_sample = model.predict_proba(test_sample)
    print(f"Ground Truth: {label}, {index_to_class[int(label)]}")
    print(f"Predicted: {np.argmax(y_pred_test_sample)}, {index_to_class[int(np.argmax(y_pred_test_sample))]}")
    print(f"Predicted Probabilities: {np.round(y_pred_test_sample, 8)[0]}")

Ground Truth: 4.0, Science
Predicted: 4, Science
Predicted Probabilities: [2.30000e-06 1.36000e-06 8.20000e-07 1.23000e-06 9.99992e-01 1.27000e-06
 1.15000e-06]
