In [1]:
from sklearn.datasets import fetch_20newsgroups
from transformers import BertTokenizer
import numpy as np

In [2]:
# Fetch the 'train' and 'test' subsets of 20 Newsgroups, in that order
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Tokenizer belonging to the 'bert-base-uncased' checkpoint (transformers library)
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [3]:
# Function to tokenize and preprocess text
def preprocess(data, max_length: int = 512):
    """Tokenize all raw documents of one dataset split into PyTorch tensors.

    Args:
        data: Mapping-like object whose ``'data'`` entry holds the raw text
            documents (here, the object returned by ``fetch_20newsgroups``).
        max_length: Upper bound on the number of tokens kept per document;
            longer documents are truncated. Defaults to 512, the value that
            was previously hard-coded, so existing calls behave the same.
            Lower it for faster experiments.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the split, requested
        as PyTorch tensors (``return_tensors='pt'``). Later cells read its
        ``'input_ids'`` and ``'attention_mask'`` entries.
    """
    return tokenizer(
        data['data'],
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )

# Run preprocess() over each split: training data first, then test data
train_encodings, test_encodings = (
    preprocess(split) for split in (newsgroups_train, newsgroups_test)
)

In [4]:
from transformers import BertForSequenceClassification
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader, TensorDataset

In [5]:
# Mini-batch size shared by the training and test loaders
BATCH_SIZE = 8

# Load BERT model for sequence classification. The size of the classification
# head is taken from the dataset's label names rather than a hard-coded count,
# so it always matches the labels that are actually trained on.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(newsgroups_train.target_names),
)


def _build_dataset(encodings, split):
    """Bundle token ids, attention masks and labels of one split into a TensorDataset.

    Args:
        encodings: Tokenizer output providing ``'input_ids'`` and ``'attention_mask'``.
        split: Dataset object whose ``'target'`` entry holds the integer class labels.

    Returns:
        A ``TensorDataset`` yielding ``(input_ids, attention_mask, label)`` per example.
    """
    # Explicit integer dtype: the labels are later passed to the model as class indices.
    labels = torch.tensor(split['target'], dtype=torch.long)
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)


# Create Tensor datasets
train_dataset = _build_dataset(train_encodings, newsgroups_train)
test_dataset = _build_dataset(test_encodings, newsgroups_test)

# Create DataLoaders (only the training data is shuffled)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Training hyperparameters
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Setting up the optimizer (built once the model sits on its final device)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# Training loop
model.train()
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        # Each batch is (input_ids, attention_mask, labels), in the order the
        # tensors were passed to TensorDataset.
        input_ids, attention_mask, labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean over the whole epoch; the loss of the final mini-batch
    # alone is too noisy to say how the epoch went.
    if num_batches:
        print(f"Epoch {epoch + 1}, Mean loss: {running_loss / num_batches:.4f}")
    else:
        print(f"Epoch {epoch + 1}: train_loader produced no batches")

KeyboardInterrupt: 

In [None]:
from sklearn.metrics import classification_report

# Put the model in evaluation mode and collect predictions over the test set
model.eval()
predictions = []
true_labels = []
with torch.no_grad():  # no gradients are needed while scoring
    for input_ids, attention_mask, labels in test_loader:
        logits = model(
            input_ids=input_ids.to(device),
            attention_mask=attention_mask.to(device),
        ).logits
        # The predicted class is the index of the largest logit in each row
        predictions.extend(logits.argmax(dim=-1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Per-class precision / recall / F1, labelled with the newsgroup names
report = classification_report(
    true_labels,
    predictions,
    target_names=newsgroups_train.target_names,
)
print(report)