# Training BERT Model for Final Project

In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm
2025-05-05 00:23:14.721824: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-05 00:23:14.725249: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-05 00:23:14.756356: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-05-05 00:23:14.756460: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-05-05 00:23:14.757432: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515

In [2]:
def generate_samples(df, label, window=20, overlap=0):
    all_words = []
    samples = []

    # Flatten all words from all lines into one list
    for line in df['text']:
        if isinstance(line, str):  # skip NaNs
            all_words.extend(line.strip().split())

    i = 0
    # Create overlapping samples
    while i + window <= len(all_words):
        sample = all_words[i:i + window]
        samples.append(' '.join(sample))
        i += window - overlap

    return pd.DataFrame({
        'text': samples,
        'label': [label] * len(samples)
    })

In [3]:
def load_data(shake_file, nonshake_file):
    shakespeare_df = pd.read_csv(shake_file)
    non_shakespeare_df = pd.read_csv(nonshake_file)

    shakespeare_df = generate_samples(shakespeare_df, 1)
    non_shakespeare_df = generate_samples(non_shakespeare_df, 0)

    min_len = min(len(shakespeare_df), len(non_shakespeare_df))
    shakespeare_df = shakespeare_df.sample(n=min_len, random_state=42).reset_index(drop=True)
    non_shakespeare_df = non_shakespeare_df.sample(n=min_len, random_state=42).reset_index(drop=True)

    combined_df = pd.concat([shakespeare_df, non_shakespeare_df], ignore_index=True)
    combined_df = combined_df.sample(frac=1.0, random_state=42).reset_index(drop=True)

    return combined_df['text'].tolist(), combined_df['label'].tolist()

In [4]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, return_tensors='pt', max_length=self.max_length, padding='max_length', truncation=True)
        return {'input_ids': encoding['input_ids'].flatten(), 'attention_mask': encoding['attention_mask'].flatten(), 'label': torch.tensor(label)}

In [5]:
class BERTClassifier(nn.Module):
    def __init__(self, bert_model_name, num_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        logits = self.fc(x)
        return logits

def train(model, data_loader, optimizer, scheduler, device):
    model.train()
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = nn.CrossEntropyLoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
    
def evaluate(model, data_loader, device):
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())
    return accuracy_score(actual_labels, predictions), classification_report(actual_labels, predictions)

def predict_sentiment(text, model, tokenizer, device, max_length=128):
    model.eval()
    encoding = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
    return "Shakespeare" if preds.item() == 1 else "Non-Shakespeare"

In [6]:
bert_model_name = 'bert-base-uncased'
num_classes = 2
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5

texts, labels = load_data('shakespeare.csv', 'nonShakespeare.csv')
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_dataset = TextClassificationDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

device = "cpu"
model = BERTClassifier(bert_model_name, num_classes).to(device)

optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Downloading vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 41.9MB/s]
Downloading tokenizer_config.json: 100%|██████████| 48.0/48.0 [00:00<00:00, 353kB/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<00:00, 4.40MB/s]
Downloading pytorch_model.bin: 100%|██████████| 420M/420M [00:04<00:00, 108MB/s]
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the 

In [7]:
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, val_dataloader, device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

torch.save(model.state_dict(), "bert_classifier.pth")

Epoch 1/4
Validation Accuracy: 0.8636
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        37
           1       1.00      0.69      0.82        29

    accuracy                           0.86        66
   macro avg       0.90      0.84      0.85        66
weighted avg       0.89      0.86      0.86        66

Epoch 2/4
Validation Accuracy: 0.9545
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        37
           1       0.91      1.00      0.95        29

    accuracy                           0.95        66
   macro avg       0.95      0.96      0.95        66
weighted avg       0.96      0.95      0.95        66

Epoch 3/4
Validation Accuracy: 0.9545
              precision    recall  f1-score   support

           0       1.00      0.92      0.96        37
           1       0.91      1.00      0.95        29

    accuracy                           0.95        66
   macro avg  

In [14]:
test_text = "life maintains Theridamas. TAMBURLAINE. Theridamas, my friend, take here my hand, Which is as much as if I swore by"
sentiment = predict_sentiment(test_text, model, tokenizer, device)
print("life maintains Theridamas. TAMBURLAINE. Theridamas, my friend, take here my hand, Which is as much as if I swore by")
print(f"Predicted sentiment: {sentiment}")

life maintains Theridamas. TAMBURLAINE. Theridamas, my friend, take here my hand, Which is as much as if I swore by
Predicted sentiment: Non-Shakespeare


In [23]:
for i in range(10, 20):
    sentiment = predict_sentiment(val_texts[i], model, tokenizer, device)
    print(val_texts[i], val_labels[i])
    print(f"Predicted sentiment: {sentiment}")

half serve me. FACE. No, sir! buy The covering off o' churches. MAM. That's true. FACE. Yes. Let them stand 0
Predicted sentiment: Non-Shakespeare
Go in and see, you traitor. Go! [EXIT FACE.] MAM. Who is it, sir? SUB. Nothing, sir; nothing. MAM. What's 0
Predicted sentiment: Non-Shakespeare
was not this nigh shore? ARIEL: Close by PROSPERO: But are they ARIEL: Not a hair perish'd; On their sustaining 1
Predicted sentiment: Shakespeare
Doth it not then our eyelids sink? I find not Myself disposed to sleep. ANTONIO: Nor I; my spirits are 1
Predicted sentiment: Shakespeare
works: Beside, we should give somewhat to man's nature, The place he lives in, still about the fire, And fume 0
Predicted sentiment: Shakespeare
and carbuncle. My foot-boy shall eat pheasants, calver'd salmons, Knots, godwits, lampreys: I myself will have The beards of barbels 0
Predicted sentiment: Non-Shakespeare
majesty complain Of Tamburlaine, that sturdy Scythian thief, That robs your merchants of Persepolis Tradin

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4cd1bb92-5c33-4c71-9500-71d9821d29e2' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>