# EEBO-BERT Fine-Tuning on Google Colab

**Steps:**
1. Upload `eebo_cleaned_corpus.txt` (7.6GB) to Google Drive.
2. Select a GPU runtime (Runtime → Change runtime type), then run all cells below in order.
3. Download the trained model once training completes.

**Estimated time:** 6-8 hours on Colab GPU

## 1. Setup: Install Dependencies

In [None]:
!pip install transformers datasets accelerate -q

## 2. Mount Google Drive (to access corpus file)

In [None]:
# Make Google Drive visible inside the Colab VM so the corpus file can be read.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

## 3. Check GPU Availability

In [None]:
import torch

# Ask for CUDA availability once and reuse the answer for the report and the branch.
cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if cuda_ok:
    # Device 0 is the only device inspected here.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {gpu_props.total_memory / 1e9:.2f} GB")

## 4. Training Configuration

In [None]:
# Where the corpus lives inside the mounted Drive; edit to match your upload location.
CORPUS_PATH = "/content/drive/MyDrive/eebo_cleaned_corpus.txt"

# Batch / sequence shape
MAX_LENGTH = 512   # token cap per corpus line (passed to the tokenizer as max_length)
BATCH_SIZE = 8     # per-device training batch size

# Optimisation schedule
LEARNING_RATE = 5e-5
NUM_EPOCHS = 3

# Trainer housekeeping intervals, in training steps
LOGGING_STEPS = 100
SAVE_STEPS = 1_000

## 5. Load and Tokenize Corpus

In [None]:
from transformers import (
    BertTokenizer,
    BertTokenizerFast,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset
import logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load model and tokenizer.
# The fast (Rust-backed) tokenizer is used instead of the pure-Python
# BertTokenizer: the corpus is several GB and is tokenized in batches, which
# is where the fast implementation pays off.
print("Loading BERT model...")
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
print(f"✓ Model loaded: {model.num_parameters():,} parameters")

# Load corpus: the 'text' loader yields one dataset row per line of the file.
print(f"\nLoading corpus from {CORPUS_PATH}...")
dataset = load_dataset('text', data_files={'train': CORPUS_PATH}, split='train')
print(f"✓ Loaded {len(dataset):,} lines")

# Tokenize
print("\nTokenizing corpus...")
def tokenize_function(examples):
    """Tokenize a batch of corpus lines for masked-language-model training.

    Args:
        examples: Batch mapping as handed over by ``Dataset.map``; only the
            ``'text'`` entry is read.

    Returns:
        The tokenizer output for ``examples['text']``, truncated to
        ``MAX_LENGTH`` tokens and including the special-tokens mask.
    """
    # Deliberately no padding here: padding every line to MAX_LENGTH bloats
    # the cached dataset and makes each step process mostly pad tokens. The
    # MLM data collator used for training pads each batch to its longest
    # sequence instead.
    return tokenizer(
        examples['text'],
        truncation=True,
        max_length=MAX_LENGTH,
        return_special_tokens_mask=True,
    )

# Apply tokenize_function to the corpus in batches; the raw 'text' column is
# dropped so only the tokenizer outputs remain.
map_kwargs = {
    'batched': True,
    'remove_columns': ['text'],
    'desc': "Tokenizing",
}
tokenized_dataset = dataset.map(tokenize_function, **map_kwargs)
print("✓ Tokenization complete")

## 6. Setup Training

In [None]:
from transformers import Trainer, TrainingArguments

# Batch collator configured for masked-language-model training
# (mlm=True, masking probability 0.15).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, mlm=True)

# Training arguments
import torch

# NOTE(review): ./bert_checkpoints is on the Colab VM's local disk, not on the
# mounted Drive; consider pointing output_dir at Drive if checkpoints must
# survive a runtime reset.
training_args = TrainingArguments(
    output_dir="./bert_checkpoints",
    overwrite_output_dir=True,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    save_steps=SAVE_STEPS,
    save_total_limit=2,  # keep only the two most recent checkpoints
    logging_steps=LOGGING_STEPS,
    learning_rate=LEARNING_RATE,
    warmup_steps=500,
    weight_decay=0.01,
    # Mixed precision for speed, but only when a CUDA device is present:
    # requesting fp16 without one makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
    logging_dir='./logs',
    report_to='none',
)

# Wire the model, arguments, tokenized corpus and MLM collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

import math

print("✓ Training setup complete")
# The last, partial batch of an epoch is still a training step, so round up
# rather than floor. Assumes a single device and no gradient accumulation,
# matching the training arguments configured in this notebook.
steps_per_epoch = math.ceil(len(tokenized_dataset) / BATCH_SIZE)
print(f"Total training steps: {steps_per_epoch * NUM_EPOCHS:,}")

## 7. Start Training (This will take 6-8 hours)

In [None]:
import time

# Banner shown before the long-running training call.
print("="*60)
print("STARTING TRAINING")
print("="*60)
print("This will take approximately 6-8 hours.")
# The earlier wording told users they could close the tab; Colab can
# disconnect idle runtimes, which would stop training part-way.
print("Keep this tab open - Colab may disconnect idle runtimes.")
print("="*60)

start_time = time.time()

# Train
trainer.train()

# Wall-clock duration of the training call, reported in hours.
total_time = time.time() - start_time
print(f"\n✓ Training complete! Total time: {total_time/3600:.2f} hours")

## 8. Save Final Model

In [None]:
import shutil

# Save to local Colab storage
output_dir = "./eebo_bert_finetuned"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"✓ Model saved to {output_dir}")

# Also save to Google Drive.
# copytree(..., dirs_exist_ok=True) merges into an existing Drive folder,
# whereas `cp -r` would nest a second copy inside it on a re-run. It also
# raises on failure, so the success message below is only printed when the
# copy actually happened.
drive_output = "/content/drive/MyDrive/eebo_bert_finetuned"
shutil.copytree(output_dir, drive_output, dirs_exist_ok=True)
print(f"✓ Model also saved to Google Drive: {drive_output}")

## 9. Test the Model

In [None]:
from transformers import pipeline

# Reuse the in-memory fine-tuned model (nothing is reloaded from disk).
# Training leaves the model in training mode; switch to eval mode so dropout
# is disabled and the predictions below are deterministic.
model.eval()
fill_mask = pipeline("fill-mask", model=model, tokenizer=tokenizer)

# Test
test_text = "Shall I compare thee to a [MASK] day?"
print(f"Test: '{test_text}'\n")

# Print the first five candidate fillers for the [MASK] token with their scores.
predictions = fill_mask(test_text)
for i, pred in enumerate(predictions[:5], 1):
    print(f"{i}. {pred['token_str']}: {pred['score']:.4f}")

## 10. Download Model (Optional)

To download the trained model to your computer:

In [None]:
# Zip the model for easier download
!zip -r eebo_bert_finetuned.zip eebo_bert_finetuned/

# Download via Colab files panel (left sidebar)
from google.colab import files
files.download('eebo_bert_finetuned.zip')

print("✓ Model zip file ready for download!")
print("Check your Downloads folder or use the Files panel on the left.")