In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.metrics import classification_report
from tqdm import tqdm
import json
import re

# File paths
# NOTE: the "dev" split used below is read from test.csv.
train_file_path = r"C:\Users\Hikari\OneDrive\Desktop\cs180\train.json1"
dev_file_path = r"C:\Users\Hikari\OneDrive\Desktop\cs180\test.csv"

# Load training data: the file holds one JSON object per line; lines that are
# empty or whitespace-only are skipped before parsing.
with open(train_file_path, "r", encoding="utf-8") as f:
    non_blank_lines = filter(str.strip, f)
    lines = list(map(json.loads, non_blank_lines))
train_df = pd.DataFrame(lines)

# Clean text
def clean_text(text):
    """Normalise raw text: lowercase it, strip punctuation, collapse whitespace.

    Args:
        text: Raw text. ``None`` and float NaN (the value pandas uses for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        str: Lowercased text containing only word characters (letters, digits,
        underscore) separated by single spaces, with no leading or trailing
        whitespace.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()
    # Punctuation must be removed *before* whitespace is collapsed: deleting a
    # free-standing symbol ("a - b") otherwise leaves a double space behind,
    # and a leading/trailing symbol leaves a stray edge space.
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return " ".join(without_punctuation.split())

train_df["text"] = train_df["text"].apply(clean_text)

# Load dev data
dev_df = pd.read_csv(dev_file_path)
dev_df["text"] = dev_df["text"].apply(clean_text)

# Label names and encoding: position in label_names is the integer class id.
label_names = ["none", "metrics", "strategy", "risk", "governance"]
label2id = {label: idx for idx, label in enumerate(label_names)}
id2label = {idx: label for label, idx in label2id.items()}

# Encode string labels as integer ids.
# - Columns that are already numeric are left untouched.
# - is_numeric_dtype is used instead of `dtype == object` so that pandas'
#   dedicated string dtypes are encoded as well.
# - Series.map() yields NaN for any value missing from label2id; fail fast
#   here rather than letting NaN labels reach the model as float tensors.
for split_name, split_df in (("train", train_df), ("dev", dev_df)):
    if pd.api.types.is_numeric_dtype(split_df["label"]):
        continue
    encoded_labels = split_df["label"].map(label2id)
    unmapped = encoded_labels.isna()
    if unmapped.any():
        unknown = sorted(split_df.loc[unmapped, "label"].astype(str).unique())
        raise ValueError(
            f"Unknown label(s) in {split_name} data: {unknown}; "
            f"expected one of {label_names}"
        )
    split_df["label"] = encoded_labels.astype("int64")

# Initialize tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Dataset class
class TextDataset(Dataset):
    """Map-style dataset over a frame with ``"text"`` and ``"label"`` columns.

    All texts are tokenized once, up front, with the module-level
    ``tokenizer`` (truncation on, ``max_length=64``, ``padding='longest'``);
    individual items are converted to tensors on access.
    """

    def __init__(self, data):
        """Tokenize every row of ``data`` and keep its labels as a list."""
        self.labels = data["label"].tolist()
        self.encodings = tokenizer(
            data["text"].tolist(),
            truncation=True,
            padding='longest',
            max_length=64,
        )

    def __len__(self):
        """Number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict of tensors.

        Keys, in order: ``'input_ids'``, ``'attention_mask'``, ``'labels'``.
        """
        sample = {
            field: torch.tensor(self.encodings[field][idx])
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Reproducibility: seed torch's global RNG before anything random happens, so
# the shuffled batch order and the freshly initialised classifier head are the
# same on every Restart & Run All.
seed = 42
torch.manual_seed(seed)

# Create datasets
train_dataset = TextDataset(train_df)
dev_dataset = TextDataset(dev_df)

# DataLoaders (only the training split is shuffled)
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, num_workers=0)

# Set device to CPU only
device = torch.device("cpu")

# Initialize model. The label mappings are stored in the model config so that
# the config (and anything saved from it) carries the real class names.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)
model.to(device)

# Optimizer and scheduler.
# num_epochs is the single source of truth for both the scheduler length and
# the training loop, so the two cannot drift apart.
num_epochs = 3
optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # first 10% of steps
    num_training_steps=num_training_steps,
)

# Early stopping params
best_dev_loss = float('inf')
patience = 2
patience_counter = 0
best_model_path = "best_model.pt"

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule advances once per optimizer step

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} - Train Loss: {avg_train_loss:.4f}")

    # Validation on dev set
    model.eval()
    dev_loss = 0
    preds, targets = [], []

    with torch.no_grad():
        for batch in dev_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            dev_loss += loss.item()
            predictions = torch.argmax(logits, dim=-1)

            preds.extend(predictions.cpu().numpy())
            targets.extend(labels.cpu().numpy())

    # Mean of the per-batch losses (not weighted by batch size).
    avg_dev_loss = dev_loss / len(dev_loader)
    print(f"Epoch {epoch+1} - Dev Loss: {avg_dev_loss:.4f}")

    # Early stopping check: checkpoint on every improvement, stop after
    # `patience` consecutive epochs without one.
    if avg_dev_loss < best_dev_loss:
        best_dev_loss = avg_dev_loss
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print("Early stopping triggered")
            break

# Load best model and evaluate on both train and dev sets

# map_location keeps the reload valid regardless of the device the checkpoint
# tensors were saved from.
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.eval()

def evaluate(model, data_loader, label_names):
    """Print the mean batch loss and a per-class classification report.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...,
            labels=...)`` whose output exposes ``.loss`` and ``.logits``.
        data_loader: Sized iterable of batches, each a mapping with
            ``"input_ids"``, ``"attention_mask"`` and ``"labels"`` tensors.
        label_names: Class names; the name at position ``i`` is reported for
            class id ``i``.

    Returns:
        None. The loss and the report are printed.

    Notes:
        Tensors are moved to the module-level ``device``. The model is put in
        eval mode for the duration of the call and its previous train/eval
        mode is restored afterwards.
    """
    num_batches = len(data_loader)
    if num_batches == 0:
        # Nothing to average over; avoid a ZeroDivisionError below.
        print("Loss: n/a (no batches to evaluate)")
        return

    all_preds, all_labels = [], []
    total_loss = 0

    # Do not rely on the caller having switched to eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for batch in data_loader:
                input_ids = batch["input_ids"].to(device)
                attention_mask = batch["attention_mask"].to(device)
                labels = batch["labels"].to(device)

                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

                total_loss += outputs.loss.item()
                preds = torch.argmax(outputs.logits, dim=-1)

                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    finally:
        model.train(was_training)

    # Mean of the per-batch losses (not weighted by batch size).
    avg_loss = total_loss / num_batches
    print(f"Loss: {avg_loss:.4f}")

    # Pass the class ids explicitly: with target_names alone the report raises
    # when a class is absent from both the labels and the predictions, because
    # the number of observed classes no longer matches len(label_names).
    print(classification_report(
        all_labels,
        all_preds,
        labels=list(range(len(label_names))),
        target_names=label_names,
        zero_division=0,  # absent classes score 0 without a warning per class
    ))


# Report the restored model's loss and per-class metrics on each split.
final_eval_runs = (
    ("Final evaluation on train set:", train_loader),
    ("Final evaluation on dev set:", dev_loader),
)
for heading, loader in final_eval_runs:
    print(heading)
    evaluate(model, loader, label_names)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|███████████████████████████████████████████████████████████████████████| 163/163 [14:43<00:00,  5.42s/it]


Epoch 1 - Train Loss: 1.3070
Epoch 1 - Dev Loss: 0.9955


Epoch 2: 100%|███████████████████████████████████████████████████████████████████████| 163/163 [10:46<00:00,  3.97s/it]


Epoch 2 - Train Loss: 0.7506
Epoch 2 - Dev Loss: 0.7647


Epoch 3:   6%|████▍                                                                   | 10/163 [00:34<08:55,  3.50s/it]

In [None]:
# Saving bert kibbler: persist the fine-tuned model, its tokenizer and the
# label mappings into one directory.
import os
directory = r"C:\Users\Hikari\OneDrive\Desktop\cs180"
os.makedirs(directory, exist_ok=True)  # no-op when the directory already exists

# NOTE(review): this saves whatever weights `model` holds right now. If the
# training cell was interrupted before its "Load best model" step ran, that is
# the in-progress state rather than the best_model.pt checkpoint - confirm
# before relying on the saved files.

# Record the class names in the model config so the saved config.json carries
# them instead of leaving them only in the side-car JSON files below.
model.config.id2label = id2label
model.config.label2id = label2id

# Save model and tokenizer
model.save_pretrained(directory)
tokenizer.save_pretrained(directory)

# Save label mappings.
# JSON object keys are always strings, so id2label's integer keys are written
# as "0", "1", ...; convert them back with int() after loading.
with open(os.path.join(directory, "id2label.json"), "w", encoding="utf-8") as f:
    json.dump(id2label, f)

with open(os.path.join(directory, "label2id.json"), "w", encoding="utf-8") as f:
    json.dump(label2id, f)

print("Model, tokenizer, and label mappings saved .")
