In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device: ", device)

Using device:  cuda


In [19]:
def train_evaluate_bert(texts, labels, model_name: str = "bert-base-uncased", epochs: int = 3, batch_size: int = 8, lr: float = 2e-5, max_length: int = 512, test_size: float = 0.2, random_state: int = 42):
    """Fine-tune a BERT sequence classifier and score it on a held-out split.

    Labels are mapped to contiguous integer ids in sorted order, the data is
    split with stratification, every text is tokenized up front (padded and
    truncated to ``max_length``), and the model is trained with AdamW for
    ``epochs`` passes before being evaluated on the test split.

    Args:
        texts: Iterable of raw strings (e.g. a pandas Series), aligned with ``labels``.
        labels: Iterable of hashable, sortable class labels, one per text.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.
        epochs: Number of passes over the training split.
        batch_size: Batch size used for both training and evaluation.
        lr: AdamW learning rate.
        max_length: Token length every sequence is padded / truncated to.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the stratified split; when it is an ``int`` it
            also seeds the training shuffle so batch order is repeatable.

    Returns:
        ``(tokenizer, model, metrics)``. ``metrics`` has the keys
        ``"accuracy"``, ``"f1_weighted"`` and ``"classification_report"``
        (the report is the formatted text). The model is returned in eval mode.
    """
    unique_labels = sorted(set(labels))
    label2id = {lab: i for i, lab in enumerate(unique_labels)}
    id2label = {i: lab for lab, i in label2id.items()}
    label_ids = [label2id[lab] for lab in labels]
    class_ids = list(range(len(unique_labels)))
    target_names = [str(lab) for lab in unique_labels]

    X_tr, X_te, y_tr, y_te = train_test_split(
        texts, label_ids,
        stratify=label_ids,
        test_size=test_size,
        random_state=random_state
    )

    tokenizer = BertTokenizer.from_pretrained(model_name)

    def encode(batch):
        # list() so that a pandas Series (or any iterable) is accepted.
        return tokenizer(
            list(batch),
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    train_enc = encode(X_tr)
    test_enc = encode(X_te)

    train_ds = TensorDataset(
        train_enc.input_ids,
        train_enc.attention_mask,
        torch.tensor(y_tr, dtype=torch.long)
    )
    test_ds = TensorDataset(
        test_enc.input_ids,
        test_enc.attention_mask,
        torch.tensor(y_te, dtype=torch.long)
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Pinned host memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine just makes PyTorch emit a warning.
    pin = device.type == "cuda"

    # A dedicated generator makes the shuffle order repeatable without touching
    # the global torch RNG. Non-int seeds (None, RandomState) keep the default.
    shuffle_rng = None
    if isinstance(random_state, int):
        shuffle_rng = torch.Generator().manual_seed(random_state)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=pin, generator=shuffle_rng)
    test_loader = DataLoader(test_ds, batch_size=batch_size, pin_memory=pin)

    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=len(unique_labels), id2label=id2label, label2id=label2id).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        print(f"\nEpoch {epoch}/{epochs}")
        for input_ids, attn_mask, labs in train_loader:
            input_ids, attn_mask, labs = (input_ids.to(device), attn_mask.to(device), labs.to(device))
            loss = model(input_ids=input_ids, attention_mask=attn_mask, labels=labs).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        print(f"Average loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for input_ids, attn_mask, labs in test_loader:
            input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
            logits = model(
                input_ids=input_ids,
                attention_mask=attn_mask
            ).logits
            preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            trues.extend(labs.tolist())

    accuracy = accuracy_score(trues, preds)
    f1_w = f1_score(trues, preds, average="weighted")
    # Pass the full id list explicitly: without it the report raises when a
    # class is missing from both the test labels and the predictions, because
    # target_names would then be longer than the inferred label set.
    report = classification_report(trues, preds, labels=class_ids, target_names=target_names)

    metrics = {
        "accuracy": accuracy,
        "f1_weighted": f1_w,
        "classification_report": report
    }

    return tokenizer, model, metrics

In [None]:
# Fine-tune and evaluate on the binary-label news dataset.
df = pd.read_csv("./../cleaned_datasets/cleaned_news_binary_bert.csv")
tok, model, stats = train_evaluate_bert(texts=df["text"], labels=df["label"])

# Print each headline metric with its original heading text.
for heading, key in (
    ("\nAccuracy:   ", "accuracy"),
    ("Weighted F1:", "f1_weighted"),
    ("\nReport:\n", "classification_report"),
):
    print(heading, stats[key])

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Average loss: 0.1546

Epoch 2/3
Average loss: 0.1385

Epoch 3/3
Average loss: 0.1082

Accuracy:    0.9152219140083218
Weighted F1: 0.9151204299319851

Report:
               precision    recall  f1-score   support

           0       0.95      0.88      0.91      5825
           1       0.88      0.95      0.92      5711

    accuracy                           0.92     11536
   macro avg       0.92      0.92      0.92     11536
weighted avg       0.92      0.92      0.92     11536



In [22]:
# Fine-tune and evaluate on the multi-class news dataset (no resampling).
df = pd.read_csv("./../cleaned_datasets/cleaned_news_multinomial_bert.csv")
tok1, model1, stats1 = train_evaluate_bert(texts=df["text"], labels=df["label"])

# Print each headline metric with its original heading text.
for heading, key in (
    ("\nAccuracy:   ", "accuracy"),
    ("Weighted F1:", "f1_weighted"),
    ("\nReport:\n", "classification_report"),
):
    print(heading, stats1[key])

Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Average loss: 0.3906

Epoch 2/3
Average loss: 0.3589

Epoch 3/3
Average loss: 0.3037

Accuracy:    0.834864771151179
Weighted F1: 0.8261187088768973

Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.93      5196
           1       0.89      0.95      0.92      4694
           2       0.28      0.19      0.22       491
           3       0.25      0.33      0.29       526
           4       0.24      0.17      0.20       420
           5       0.37      0.11      0.16       209

    accuracy                           0.83     11536
   macro avg       0.49      0.45      0.45     11536
weighted avg       0.82      0.83      0.83     11536



In [2]:
from imblearn.over_sampling import RandomOverSampler
import numpy as np

def train_evaluate_bert_oversampling(texts, labels, model_name: str = "bert-base-uncased", epochs: int = 3, batch_size: int = 8, lr: float = 2e-5, test_size: float = 0.2, random_state: int = 42, max_length: int = 512):
    """Fine-tune a BERT classifier on a randomly oversampled training split.

    Labels are mapped to contiguous integer ids in sorted order and the data is
    split with stratification. Only the training split is then resampled with
    ``RandomOverSampler``; the test split is left untouched. Every text is
    tokenized up front (padded and truncated to ``max_length``), the model is
    trained with AdamW for ``epochs`` passes, and it is scored on the test split.

    Args:
        texts: Iterable of raw strings (e.g. a pandas Series), aligned with ``labels``.
        labels: Iterable of hashable, sortable class labels, one per text.
        model_name: Checkpoint name passed to ``from_pretrained`` for both the
            tokenizer and the model.
        epochs: Number of passes over the resampled training split.
        batch_size: Batch size used for both training and evaluation.
        lr: AdamW learning rate.
        test_size: Fraction of the data held out for evaluation.
        random_state: Seed for the stratified split and the oversampler; when
            it is an integer it also seeds the training shuffle.
        max_length: Token length every sequence is padded / truncated to.

    Returns:
        ``(tokenizer, model, metrics)``. ``metrics`` has the keys
        ``"accuracy"``, ``"f1_weighted"`` and ``"classification_report"``
        (the report is the formatted text). The model is returned in eval mode.
    """
    unique_labels = sorted(set(labels))
    label2id = {lab: i for i, lab in enumerate(unique_labels)}
    id2label = {i: lab for lab, i in label2id.items()}
    label_ids = [label2id[lab] for lab in labels]
    class_ids = list(range(len(unique_labels)))
    target_names = [str(lab) for lab in unique_labels]

    X_tr, X_te, y_tr, y_te = train_test_split(texts, label_ids, stratify=label_ids, test_size=test_size, random_state=random_state)

    # Oversample row positions rather than the strings themselves: the sampler
    # only needs something to index, and this avoids copying every document
    # into a NumPy array (fixed-width unicode when the input is a plain list).
    train_texts = list(X_tr)
    ros = RandomOverSampler(random_state=random_state)
    row_ids = np.arange(len(train_texts)).reshape(-1, 1)
    row_ids_res, y_res = ros.fit_resample(row_ids, y_tr)
    X_res = [train_texts[i] for i in row_ids_res.ravel()]

    # Normalise to an array so counting works whether the sampler handed back
    # a list or an ndarray.
    y_res = np.asarray(y_res)
    class_counts = np.bincount(y_res, minlength=len(unique_labels))
    print("Resampled training counts:", {lab: int(class_counts[i])
          for lab, i in label2id.items()})

    tokenizer = BertTokenizer.from_pretrained(model_name)

    def encode(batch_texts):
        return tokenizer(
            batch_texts,
            padding="max_length",
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

    train_enc = encode(X_res)
    test_enc = encode(list(X_te))

    train_ds = TensorDataset(
        train_enc.input_ids,
        train_enc.attention_mask,
        torch.tensor(y_res, dtype=torch.long)
    )
    test_ds = TensorDataset(
        test_enc.input_ids,
        test_enc.attention_mask,
        torch.tensor(y_te, dtype=torch.long)
    )

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)

    # Pinned host memory only speeds up host-to-GPU copies; requesting it on a
    # CPU-only machine just makes PyTorch emit a warning.
    pin = device.type == "cuda"

    # A dedicated generator makes the shuffle order repeatable without touching
    # the global torch RNG. Non-integer seeds (None, RandomState) keep the default.
    shuffle_rng = None
    if isinstance(random_state, (int, np.integer)):
        shuffle_rng = torch.Generator().manual_seed(int(random_state))

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, pin_memory=pin, generator=shuffle_rng)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False, pin_memory=pin)

    model = BertForSequenceClassification.from_pretrained(
        model_name,
        num_labels=len(unique_labels),
        id2label=id2label,
        label2id=label2id
    ).to(device)
    optimizer = AdamW(model.parameters(), lr=lr)

    model.train()
    for epoch in range(1, epochs + 1):
        total_loss = 0.0
        print(f"\nEpoch {epoch}/{epochs}")
        for input_ids, attn_mask, labs in train_loader:
            input_ids, attn_mask, labs = (
                input_ids.to(device),
                attn_mask.to(device),
                labs.to(device)
            )
            loss = model(
                input_ids=input_ids,
                attention_mask=attn_mask,
                labels=labs
            ).loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        print(f"Average loss: {total_loss/len(train_loader):.4f}")

    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for input_ids, attn_mask, labs in test_loader:
            input_ids, attn_mask = input_ids.to(device), attn_mask.to(device)
            logits = model(
                input_ids=input_ids,
                attention_mask=attn_mask
            ).logits
            preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
            trues.extend(labs.tolist())

    accuracy = accuracy_score(trues, preds)
    f1_w = f1_score(trues, preds, average="weighted")
    # Pass the full id list explicitly: without it the report raises when a
    # class is missing from both the test labels and the predictions, because
    # target_names would then be longer than the inferred label set.
    report = classification_report(trues, preds, labels=class_ids, target_names=target_names)

    metrics = {
        "accuracy": accuracy,
        "f1_weighted": f1_w,
        "classification_report": report
    }

    return tokenizer, model, metrics

In [3]:
# Fine-tune and evaluate on the multi-class news dataset with an oversampled training split.
df = pd.read_csv("./../cleaned_datasets/cleaned_news_multinomial_bert.csv")
tok2, model2, stats2 = train_evaluate_bert_oversampling(texts=df["text"], labels=df["label"])

# Print each headline metric with its original heading text.
for heading, key in (
    ("\nAccuracy:   ", "accuracy"),
    ("Weighted F1:", "f1_weighted"),
    ("\nReport:\n", "classification_report"),
):
    print(heading, stats2[key])

Resampled training counts: {0: 20780, 1: 20780, 2: 20780, 3: 20780, 4: 20780, 5: 20780}
Using device: cuda


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1/3
Average loss: 0.3813

Epoch 2/3
Average loss: 0.1004

Epoch 3/3
Average loss: 0.0702

Accuracy:    0.827756588072122
Weighted F1: 0.826214968332371

Report:
               precision    recall  f1-score   support

           0       0.98      0.91      0.94      5196
           1       0.87      0.95      0.91      4694
           2       0.24      0.31      0.27       491
           3       0.30      0.20      0.24       526
           4       0.20      0.30      0.24       420
           5       0.36      0.06      0.10       209

    accuracy                           0.83     11536
   macro avg       0.49      0.45      0.45     11536
weighted avg       0.83      0.83      0.83     11536



In [25]:
# Write the model and its tokenizer (`model` / `tok`, as defined by an earlier
# cell) into one directory. The tokenizer call stays last so the cell still
# displays the files it wrote.
pipeline_dir = "../models/bert_pipeline"
model.save_pretrained(pipeline_dir)
tok.save_pretrained(pipeline_dir)

('../models/bert_pipeline\\tokenizer_config.json',
 '../models/bert_pipeline\\special_tokens_map.json',
 '../models/bert_pipeline\\vocab.txt',
 '../models/bert_pipeline\\added_tokens.json')