# BERT Project with Hugging Face
-Joshith Reddy Aleti

This notebook walks through fine-tuning, debugging, evaluating, and creatively applying a BERT model using the Hugging Face library.

## Part 1: Fine-Tuning BERT
We will fine-tune `bert-base-uncased` on the IMDb sentiment analysis dataset.

In [None]:
# Install necessary libraries
!pip install transformers datasets torch scikit-learn

In [None]:
import os
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Load IMDb dataset
# The returned object is indexed by split name further down ('train' and 'test').
dataset = load_dataset('imdb')

# Bare last expression: lets the notebook display the loaded object.
dataset

In [None]:
# Preprocess: tokenize
# Fast tokenizer for the 'bert-base-uncased' checkpoint — the same checkpoint name the
# classification model is created from later in this notebook.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tokenize(batch):
    """Tokenize the 'text' field of a batch with the notebook-level `tokenizer`.

    Args:
        batch: mapping with a 'text' entry holding the raw review strings.

    Returns:
        Whatever `tokenizer` returns for those texts, with truncation enabled and
        padding set to 'max_length'.
    """
    review_texts = batch['text']
    encoded = tokenizer(review_texts, truncation=True, padding='max_length')
    return encoded

# Tokenize every split in batches of 32, then expose the columns the Trainer needs.
tokenized = dataset.map(tokenize, batched=True, batch_size=32)
# The model's forward pass expects the target column to be called 'labels'.
tokenized = tokenized.rename_column('label', 'labels')
tokenized.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

# Keep a small, reproducibly shuffled subset so training finishes quickly.
# `shuffle`/`select` return new datasets, so the result has to be stored back;
# otherwise the Trainer cells below silently run on the full splits.
tokenized['train'] = tokenized['train'].shuffle(seed=42).select(range(2000))
tokenized['test'] = tokenized['test'].shuffle(seed=42).select(range(500))

# Display the two subsets that the rest of the notebook will use.
tokenized['train'], tokenized['test']

In [None]:
# Initialize model
# Sequence-classification model built from the 'bert-base-uncased' checkpoint;
# `num_labels=2` requests two output classes (IMDb sentiment is assumed to be binary here).
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

In [None]:
# Set up trainer
import numpy as np


def _compute_accuracy(eval_pred):
    """Return classification accuracy for one Trainer evaluation pass.

    Args:
        eval_pred: the (logits, label_ids) pair the Trainer hands to `compute_metrics`.

    Returns:
        dict with a single 'accuracy' entry: the fraction of argmax predictions
        that equal the reference labels.
    """
    eval_logits, eval_labels = eval_pred
    hits = np.argmax(eval_logits, axis=-1) == eval_labels
    return {'accuracy': float(hits.mean())}


training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',   # evaluate at the end of every epoch
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    logging_steps=50,
    save_total_limit=1,            # keep only the most recent checkpoint on disk
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    # Without a metrics callback, evaluation reports loss only; the debugging
    # step that follows reasons about accuracy, so compute it here.
    compute_metrics=_compute_accuracy,
)

# Start training
trainer.train()

## Part 2: Debugging Issues
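If you encounter poor validation performance or overfitting, try adjusting the hyperparameters (for example the learning rate, batch size, or number of epochs) or the size of the training subset, then re-evaluate.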

In [None]:
# Example: evaluate before and after adjusting learning rate
# `trainer.evaluate()` is run on the Trainer's eval dataset; the returned metrics are printed as-is.
metrics = trainer.evaluate()
print(metrics)
# If accuracy < 0.8, reduce learning rate or increase batch size
# NOTE(review): an accuracy entry is only present if the Trainer was created with a
# `compute_metrics` callback; without one, expect loss/runtime entries only — confirm
# against the printed dict before applying the rule above.

## Part 3: Evaluation Metrics
**Tasks:**
- Generate predictions on test set
- Compute Accuracy, F1-Score, Exact Match (QA), Mean Squared Error (Regression), Log Loss
- Reflect on results and refine model


In [None]:
# Install evaluation libraries
!pip install evaluate scipy

In [None]:
import numpy as np
import evaluate
from scipy.special import softmax
from sklearn.metrics import log_loss

# Load metrics
# The QA and regression metrics are loaded here as well; later cells use them.
f1_metric = evaluate.load('f1')
accuracy_metric = evaluate.load('accuracy')
mse_metric = evaluate.load('mse')
squad_metric = evaluate.load('squad')      # for QA EM and F1

# Run the fine-tuned classifier over the evaluation subset.
predictions = trainer.predict(tokenized['test'])
logits, labels = predictions.predictions, predictions.label_ids
probs = softmax(logits, axis=1)      # class probabilities, needed for log loss
preds = np.argmax(logits, axis=1)    # hard class decisions, needed for accuracy / F1

# Classification metrics
clf_inputs = dict(predictions=preds, references=labels)
acc = accuracy_metric.compute(**clf_inputs)['accuracy']
f1 = f1_metric.compute(**clf_inputs)['f1']
ll = log_loss(labels, probs)

print(
    f"Classification Accuracy: {acc:.4f}",
    f"Classification F1-Score: {f1:.4f}",
    f"Classification Log Loss: {ll:.4f}\n",
    sep="\n",
)

In [None]:
# Question-Answering metrics using pipeline
from transformers import pipeline

# NOTE(review): assumes a QA model fine-tuned from an uncased BERT checkpoint has been
# saved to ./bert-finetuned-qa beforehand — nothing in this notebook creates it.
qa_pipeline = pipeline('question-answering', model='./bert-finetuned-qa', tokenizer='bert-base-uncased')
# Example QA data
qa_examples = [
    {'id': '1', 'context': 'Paris is the capital of France.', 'question': 'What is the capital of France?', 'answers': {'text': ['Paris'], 'answer_start': [0]}},
]
predictions_qa = []
for ex in qa_examples:
    # Keyword arguments are the documented way to pass a single question/context pair.
    out = qa_pipeline(question=ex['question'], context=ex['context'])
    predictions_qa.append({'id': ex['id'], 'prediction_text': out['answer']})

# Compute QA metrics
# The squad metric documents its references as {'id', 'answers'} records, so the
# extra 'context' / 'question' fields are stripped before scoring.
references_qa = [{'id': ex['id'], 'answers': ex['answers']} for ex in qa_examples]
results_qa = squad_metric.compute(predictions=predictions_qa, references=references_qa)
print(f"QA Exact Match: {results_qa['exact_match']:.2f}")
print(f"QA F1: {results_qa['f1']:.2f}\n")

In [None]:
# Regression metrics on STS-B (Semantic Textual Similarity)
from datasets import load_dataset
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

import torch

# Load STS-B dataset
sts = load_dataset('glue', 'stsb')
label_key = 'label'
# Score on the validation split: the GLUE test split ships without real labels,
# so an MSE computed against it would be meaningless.
eval_split = sts['validation']

# Assume model fine-tuned for regression exists at ./bert-regression-stsb
# NOTE(review): also assumes it has a single-output head and matches the uncased
# `tokenizer` created earlier in this notebook — confirm.
reg_model = BertForSequenceClassification.from_pretrained('./bert-regression-stsb')
reg_trainer = Trainer(model=reg_model)  # kept for compatibility; may move the model to an accelerator
reg_model.eval()  # disable dropout for inference

# STS-B is a sentence-pair task, so both sentences are encoded together.
# Inference runs in small batches without gradient tracking to bound memory use.
batch_size = 32
pred_chunks = []
with torch.no_grad():
    for start in range(0, len(eval_split), batch_size):
        batch = eval_split[start:start + batch_size]
        encodings = tokenizer(
            batch['sentence1'], batch['sentence2'],
            padding=True, truncation=True, return_tensors='pt',
        ).to(reg_model.device)  # inputs must live on the same device as the model
        outputs = reg_model(**encodings)
        # squeeze only the trailing output dimension so a batch of one stays 1-D
        pred_chunks.append(outputs.logits.squeeze(-1).cpu())

preds_reg = torch.cat(pred_chunks).numpy()
refs = np.array(eval_split[label_key], dtype=np.float32)

mse_val = mse_metric.compute(predictions=preds_reg, references=refs)['mse']
print(f"Regression MSE: {mse_val:.4f}")

## Part 4: Creative Application – Named Entity Recognition
Fine-tune BERT for NER on the CoNLL-2003 dataset.

In [None]:
# Install seqeval for NER evaluation
!pip install seqeval evaluate

In [None]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer

# Load dataset
dataset_ner = load_dataset('conll2003')
# Tag names read from the 'ner_tags' feature; the code below indexes this list by integer tag id.
# NOTE(review): this rebinds the notebook-level name `labels`, which the classification
# metrics cell also assigns — re-running that cell afterwards changes what the NER code sees.
labels = dataset_ner['train'].features['ner_tags'].feature.names
# Cased tokenizer, paired with the 'bert-base-cased' model loaded further down.
tokenizer_ner = BertTokenizerFast.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and project word-level NER tags onto sub-word tokens.

    Only the first sub-token of each word keeps the word's tag. Positions that map
    to no word, and continuation sub-tokens of a word, are labelled -100.

    Args:
        examples: batch mapping with 'tokens' (word lists) and 'ner_tags'
            (one tag id per word).

    Returns:
        The tokenizer output with an added 'labels' entry holding one aligned
        label list per example.
    """
    encoded = tokenizer_ner(examples['tokens'], is_split_into_words=True, truncation=True)

    aligned = []
    for row, word_tags in enumerate(examples['ner_tags']):
        row_word_ids = encoded.word_ids(batch_index=row)
        # Pair every position with the word id of the position just before it
        # (None ahead of the first position) to detect where a new word starts.
        preceding = [None] + row_word_ids[:-1]
        aligned.append([
            word_tags[current] if current is not None and current != prior else -100
            for current, prior in zip(row_word_ids, preceding)
        ])

    encoded['labels'] = aligned
    return encoded

# Map the tokenize-and-align function over the NER dataset in batched mode.
tokenized_ner = dataset_ner.map(tokenize_and_align_labels, batched=True)
# Token-classification model with one output class per tag name.
model_ner = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels))
# Evaluate once per epoch; 3 epochs with a per-device train batch size of 16.
args_ner = TrainingArguments(
    output_dir='./results-ner', evaluation_strategy='epoch', per_device_train_batch_size=16, num_train_epochs=3
)
# Sequence-labelling metric loaded through `evaluate`; used by the metrics callback below.
seqeval = evaluate.load('seqeval')

def compute_metrics_ner(p):
    """Score NER predictions with seqeval, skipping positions labelled -100.

    Args:
        p: pair of (per-token class scores, reference label ids).

    Returns:
        dict with 'precision', 'recall', 'f1' and 'accuracy', taken from the
        corresponding 'overall_*' entries of the seqeval result.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=-1)

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept_preds, kept_golds = [], []
        for pred_id, gold_id in zip(pred_row, gold_row):
            if gold_id == -100:
                # masked position: excluded from both sequences
                continue
            kept_preds.append(labels[pred_id])
            kept_golds.append(labels[gold_id])
        true_preds.append(kept_preds)
        true_labels.append(kept_golds)

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    summary = {}
    summary['precision'] = results['overall_precision']
    summary['recall'] = results['overall_recall']
    summary['f1'] = results['overall_f1']
    summary['accuracy'] = results['overall_accuracy']
    return summary

from transformers import DataCollatorForTokenClassification

# Train on a 2000-example subset and evaluate on 1000 validation examples to keep the run short.
trainer_ner = Trainer(
    model=model_ner, args=args_ner,
    train_dataset=tokenized_ner['train'].select(range(2000)),
    eval_dataset=tokenized_ner['validation'].select(range(1000)),
    tokenizer=tokenizer_ner,
    # The examples are not padded at tokenization time, so each one carries a
    # 'labels' list of a different length. This collator pads the labels along
    # with the inputs; the default padding collator leaves them ragged and
    # batching fails.
    data_collator=DataCollatorForTokenClassification(tokenizer_ner),
    compute_metrics=compute_metrics_ner
)
trainer_ner.train()
metrics_ner = trainer_ner.evaluate()
print(metrics_ner)