In [17]:
# Import libraries
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, DataCollatorWithPadding
from torch.optim import AdamW

In [19]:
# Load the sample news data from a CSV in the working directory and preview
# the first rows. The preview below shows the columns used later:
# 'Class Index' (numeric class id), 'Title' and 'Description' (free text).
df = pd.read_csv("sample_data.csv")
df.head()

Unnamed: 0,Class Index,Title,Description,desc_length,clean_desc
0,3,"BBC set for major shake-up, claims newspaper","London - The British Broadcasting Corporation,...",39,london british broadcast corpor world 39 bigge...
1,3,Marsh averts cash crunch,Embattled insurance broker #39;s banks agree t...,24,embattl insur broker 39 bank agre waiv claus m...
2,2,"Jeter, Yankees Look to Take Control (AP)",AP - Derek Jeter turned a season that started ...,23,ap derek jeter turn season start terribl slump...
3,4,Flying the Sun to Safety,When the Genesis capsule comes back to Earth w...,29,genesi capsul come back earth sampl sun helico...
4,3,Stocks Seen Flat as Nortel and Oil Weigh,NEW YORK (Reuters) - U.S. stocks were set to ...,37,new york reuter us stock set open near unchang...


In [21]:
# Tokenizer setup
# Load the pretrained 'bert-base-uncased' tokenizer. It is kept at notebook
# level because later cells use it for encoding, batch collation and saving.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Dataset + Dataloader
class NewsDataset(Dataset):
    """Torch dataset that tokenizes all texts once and serves one example per index.

    Examples are stored *unpadded*; batching code is expected to pad per batch
    (this notebook does so with ``DataCollatorWithPadding``), which avoids
    padding every example to the longest text in the whole corpus.

    Args:
        texts: Iterable of raw strings to encode.
        labels: Indexable sequence of per-example targets, same length as
            ``texts``.
        text_tokenizer: Optional callable used to encode the texts. It is
            invoked as ``text_tokenizer(list_of_str, truncation=True)`` and
            must return a mapping of feature name -> list of per-example
            values. Defaults to the notebook-level ``tokenizer``.

    Raises:
        ValueError: If ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, text_tokenizer=None):
        texts = list(texts)
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(texts)} and {len(labels)})"
            )
        # Fall back to the notebook-level tokenizer so existing two-argument
        # calls keep working; passing one explicitly removes the hidden
        # dependency on cell execution order.
        encode = tokenizer if text_tokenizer is None else text_tokenizer
        # Truncate over-long inputs but do not pad here (see class docstring).
        self.encodings = encode(texts, truncation=True)
        self.labels = labels

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors, plus its ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

# Prepare data
# Inputs are the article descriptions. Targets come from the numeric
# 'Class Index' column -- 'Title' holds free-text headlines, which cannot be
# converted to label tensors.
texts = df['Description'].tolist()
# Class ids are assumed to be 1-based (the preview shows values 2-4); shift
# them to the 0-based ids that the classification loss expects.
labels = (df['Class Index'].astype(int) - 1).tolist()
assert min(labels) >= 0, "Expected 1-based 'Class Index' values"

# Create dataset and DataLoader
dataset = NewsDataset(texts, labels)
# The collator pads each batch to the length of its longest example.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True, collate_fn=data_collator)


In [25]:
# Model + Optimizer Setup

# Initialize model
# The classification head needs one output per class. 'Class Index' ids are
# assumed to be 1-based, so the largest id equals the number of classes (the
# data preview already shows ids up to 4, so a 2-way head would be too small).
num_labels = int(df['Class Index'].max())
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training loop

# Set model to training mode
model.train()
# Keep each batch on the same device as the model, so the loop also works if
# the model has been moved to a GPU.
device = model.device

for epoch in range(1):  # Set to more epochs if needed
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # The dataset stores targets under 'labels' (there is no 'Title' key in
        # a batch). A distinct local name avoids overwriting the notebook-level
        # `labels` list.
        batch_labels = batch['labels'].to(device)

        # Passing labels makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"Loss: {loss.item()}")

In [29]:
# Persist the fine-tuned model and its tokenizer into the same directory
output_dir = 'fine_tuned_bert'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to '{output_dir}'")

Model saved to 'fine_tuned_bert'
