In [1]:
# Import libraries
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
# For demonstration, let's create a dummy dataset
data = [
    {"text": "Scientists discover a new species of butterfly in the Amazon rainforest.", "label": 0},
    {"text": "NASA announces plans for a manned mission to Mars by 2030.", "label": 0},
    {"text": "World Health Organization declares a global pandemic due to a new virus outbreak.", "label": 0},
    {"text": "BREAKING: Unicorns spotted in Central Park, New York!", "label": 1},
    {"text": "Elvis Presley found alive and well in a small town in Texas.", "label": 1},
    {"text": "New study claims eating chocolate every day leads to weight loss.", "label": 0},
    {"text": "Major earthquake hits the moon; lunar colonies in danger!", "label": 1},
    {"text": "International Space Station crew discovers evidence of alien life.", "label": 1},
    {"text": "Local cat wins Nobel Prize in Physics for groundbreaking research on string theory.", "label": 1},
    {"text": "Apple to release a smartphone that can teleport users to any location.", "label": 1},
    {"text": "Scientists develop a pill that grants superhuman intelligence.", "label": 1},
    {"text": "Study finds that people who own dogs live longer than those who don't.", "label": 0},
    {"text": "Government announces plans to build a floating city in the Pacific Ocean.", "label": 0},
    {"text": "BREAKING: Giant robots emerge from the ocean; cities on high alert!", "label": 1},
    {"text": "New research suggests that the Earth is flat.", "label": 1},
    {"text": "Researchers discover a cure for the common cold.", "label": 0},
    {"text": "World's largest pizza delivered to space station for astronauts.", "label": 0},
    {"text": "BREAKING: Time-traveling cat predicts the outcome of next year's elections.", "label": 1},
    {"text": "Invention of an invisibility cloak announced by a group of teenage prodigies.", "label": 1},
    {"text": "Robot becomes the first non-human to run for political office.", "label": 1},
]

In [3]:
# Split the dataset into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx]["text"]
        label = self.data[idx]["label"]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        return {
            "input_ids": encoding["input_ids"].flatten(),
            "attention_mask": encoding["attention_mask"].flatten(),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [5]:
# Define the tokenizer and model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=2
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define the dataset and dataloaders
max_length = 128
train_dataset = NewsDataset(train_data, tokenizer, max_length)
test_dataset = NewsDataset(test_data, tokenizer, max_length)

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=8, shuffle=False)


In [7]:
# Define training parameters
epochs = 100
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

In [8]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

Epoch 1/100: 100%|██████████| 2/2 [00:03<00:00,  1.77s/it]
Epoch 2/100: 100%|██████████| 2/2 [00:00<00:00,  4.76it/s]
Epoch 3/100: 100%|██████████| 2/2 [00:02<00:00,  1.17s/it]
Epoch 4/100: 100%|██████████| 2/2 [00:00<00:00,  7.03it/s]
Epoch 5/100: 100%|██████████| 2/2 [00:00<00:00,  5.78it/s]
Epoch 6/100: 100%|██████████| 2/2 [00:01<00:00,  1.99it/s]
Epoch 7/100: 100%|██████████| 2/2 [00:00<00:00,  9.25it/s]
Epoch 8/100: 100%|██████████| 2/2 [00:00<00:00,  5.77it/s]
Epoch 9/100: 100%|██████████| 2/2 [00:00<00:00,  4.58it/s]
Epoch 10/100: 100%|██████████| 2/2 [00:00<00:00,  7.74it/s]
Epoch 11/100: 100%|██████████| 2/2 [00:00<00:00,  5.72it/s]
Epoch 12/100: 100%|██████████| 2/2 [00:00<00:00,  4.26it/s]
Epoch 13/100: 100%|██████████| 2/2 [00:00<00:00,  5.86it/s]
Epoch 14/100: 100%|██████████| 2/2 [00:00<00:00,  5.65it/s]
Epoch 15/100: 100%|██████████| 2/2 [00:00<00:00,  2.39it/s]
Epoch 16/100: 100%|██████████| 2/2 [00:00<00:00,  2.08it/s]
Epoch 17/100: 100%|██████████| 2/2 [00:00<00:00, 

In [9]:
# Evaluation
model.eval()
predictions = []
true_labels = []

with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

Evaluating: 100%|██████████| 1/1 [00:00<00:00, 28.72it/s]


In [10]:
# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy: {accuracy}")

Accuracy: 0.5
