In [106]:
import torch
import json

from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup

In [107]:
# LOAD DATA FROM DISK
#
# Reads the three cached tensors (inputs, attention masks, labels) and the
# author list that defines the label set used later for `num_labels`.

# weights_only=True restricts unpickling to tensor data.
x_inputs = torch.load('./torch-cache/x_inputs_256.pt', weights_only=True)
x_masks = torch.load('./torch-cache/x_masks_256.pt', weights_only=True)
y_labels = torch.load('./torch-cache/y_labels_256.pt', weights_only=True)

print(x_inputs.shape)

# Pin the encoding so the file is decoded identically on every platform
# instead of depending on the locale's default codec.
with open('./torch-cache/authors.json', 'r', encoding='utf-8') as file:
    authors = json.load(file)

torch.Size([10058, 256])


In [108]:
# PUT DATA INTO TORCH DATALOADER
#
# Wraps the cached tensors in a TensorDataset, splits it 90/10 into
# training and validation subsets, and builds one DataLoader per subset.

print("INPUT + LABEL SHAPES: ", x_inputs.shape, x_masks.shape, y_labels.shape)

dataset = TensorDataset(x_inputs, x_masks, y_labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated, seeded generator for the split. The global seeds are only
# set later (in the training cell), so without this the train/validation
# membership would change on every kernel restart and validation scores
# from different runs would not be comparable.
split_seed = 42
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(split_seed),
)
print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

batch_size = 16
# Training batches are drawn in random order each epoch.
train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=batch_size
)
# Validation batches are read in a fixed order.
validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=batch_size
)

INPUT + LABEL SHAPES:  torch.Size([10058, 256]) torch.Size([10058, 256]) torch.Size([10058])
9,052 training samples
1,006 validation samples


In [109]:
# Sanity check: pull a single batch from the training loader and print the
# first row of its input tensor together with that row's length.
for step, batch in enumerate(train_dataloader):
    b_input_ids, b_input_mask, b_labels = batch[0], batch[1], batch[2]

    first_row = b_input_ids[0]
    print(first_row)
    print(len(first_row))

    # Only the first batch is inspected.
    break

tensor([  100,  2002,  2003,  1996,  2062,  5506,  6178,  1997,  1996,  2048,
         2017,  2079,  2025, 10439,  2890, 22342,  2033,  2017,  2031,  1037,
         2980,  5317,  1999,  2115,  2677,  2085,  2017,  3685,  2907,  2292,
         2033,  2041,  2007,  2009,  6203,  2663,  1045,  1005,  2222,  2425,
         2032,  2870,  2079,  1998,  2202,  2035,  1996,  4283,  1998,  2172,
         2204,  2079, 15177,  3492,  2540,  2663,  2909,  2026,  2388,  2038,
         2018,  2014, 14085,  7730,  2300,  3459,  9906,  2011,  1996, 23626,
         2273,  1999, 11190,  4644,  1998,  2027,  2031,  2409,  2014,  2014,
         7280,  1998,  2079,  5676,  2014,  2016,  4618,  2196,  2031,  3407,
         3178,  4983,  2016,  5914,  2306,  2023, 12411,  1005,  2305,  1998,
         2043,  2009,  2003,  2009,  2442,  2022,  1037, 28441,  2027,  2360,
         1037,  2100,  2021,  2009,  2442,  2022,  1037, 10170, 28441,  2748,
         2061,  1996,  1056,  1005,  2060,  2158,  1997, 16808, 

In [110]:
# USE PRE-TRAINED BERT MODEL
#
# Loads the pretrained checkpoint with a classification head sized to the
# number of authors, then moves it to the GPU when one is available.

target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    num_labels=len(authors),      # one output logit per author
    output_attentions=False,      # attention weights are not returned
    output_hidden_states=False,   # hidden states are not returned
)
model = model.to(target_device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [111]:
# SET UP HYPERPARAMETERS
#
# Builds the optimizer and a linear learning-rate schedule that decays over
# every optimizer step of the whole run.

# Use PyTorch's AdamW rather than the deprecated `transformers.AdamW`.
# weight_decay=0.0 is passed explicitly because torch defaults to 0.01,
# whereas the transformers implementation defaulted to no decay.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
epochs = 4  # should be 2-4
# One scheduler step is taken per training batch, so the schedule length is
# batches-per-epoch times the number of epochs.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [112]:
# HELPER FUNCTIONS

import numpy as np
import time
import datetime
import random

def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores; the arg-max is taken along axis 1.
        labels: array of class indices; it is flattened before comparison.

    Returns:
        Number of matching positions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

def format_time(elapsed):
    """Format a duration in seconds as the string form of a timedelta.

    Args:
        elapsed: duration in seconds; rounded to the nearest whole second.

    Returns:
        `str(datetime.timedelta(...))` for the rounded duration, e.g. '0:01:00'.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [113]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Balanced class weights for the loss, computed from the training split only.
#
# Read the labels straight from the label tensor using the subset's indices.
# Iterating the DataLoader would batch every input and mask tensor just to
# collect labels. 'balanced' weights depend only on per-class counts, so the
# order of the labels does not matter.
all_labels = y_labels[train_dataset.indices].numpy()
class_weights = compute_class_weight(
    class_weight='balanced',  # weight = n_samples / (n_classes * class_count)
    classes=np.unique(all_labels),
    y=all_labels
)

# The model emits len(authors) logits. If a class is absent from the training
# split the weight vector would be too short for the loss, so fail here with
# a clear message rather than on the first training batch.
if len(class_weights) != len(authors):
    raise ValueError(
        "Training split contains {} classes but the model expects {}; "
        "every class needs at least one training sample.".format(
            len(class_weights), len(authors)
        )
    )

class_weights = torch.tensor(class_weights, dtype=torch.float).to('cuda' if torch.cuda.is_available() else 'cpu')
print("Class weights:", class_weights)

Class weights: tensor([2.3016, 0.8333, 3.1929, 0.7938, 1.2103, 0.8292, 4.5510, 0.3050, 3.8243],
       device='cuda:0')


In [114]:
# MAIN TRAINING LOOP
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
#
# Each epoch makes one pass over `train_dataloader` using a class-weighted
# cross-entropy loss, then evaluates on `validation_dataloader`. Per-epoch
# metrics are appended to `training_stats`.

seed_val = 42
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Seed the Python, NumPy and torch (CPU + CUDA) RNGs used from here on.
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Build the criterion once; it holds only the fixed class weights, so there
# is no reason to re-create it on every step.
loss_fn = torch.nn.CrossEntropyLoss(weight=class_weights)

training_stats = []
total_t0 = time.time()

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    t0 = time.time()
    total_train_loss = 0
    model.train()

    for step, batch in enumerate(train_dataloader):
        # Progress report every 40 batches (skipping batch 0).
        if step % 40 == 0 and not step == 0:
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        model.zero_grad()

        # Labels are deliberately not passed to the model: its built-in loss
        # is unweighted and would be computed only to be thrown away.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask)
        logits = outputs.logits
        loss = loss_fn(logits, b_labels)

        total_train_loss += loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The schedule was sized in optimizer steps, so advance it per batch.
        scheduler.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    training_time = format_time(time.time() - t0)

    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))

    # ========================================
    #               Validation
    # ========================================
    print("")
    print("Running Validation...")

    t0 = time.time()
    model.eval()

    # Accumulate sample-weighted sums so that a smaller final batch does not
    # count as much as a full one when averaging.
    total_eval_correct = 0.0
    total_eval_loss = 0.0
    total_eval_samples = 0

    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)
            # NOTE: this is the model's built-in (unweighted) loss, so it is
            # not on the same scale as the class-weighted training loss.
            loss = outputs.loss
            logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        n_batch = len(label_ids)
        total_eval_samples += n_batch
        total_eval_loss += loss.item() * n_batch
        total_eval_correct += flat_accuracy(logits, label_ids) * n_batch

    avg_val_accuracy = total_eval_correct / total_eval_samples
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

    avg_val_loss = total_eval_loss / total_eval_samples
    validation_time = format_time(time.time() - t0)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation took: {:}".format(validation_time))

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

cuda

Training...
  Batch    40  of    566.    Elapsed: 0:00:04.
  Batch    80  of    566.    Elapsed: 0:00:07.
  Batch   120  of    566.    Elapsed: 0:00:11.
  Batch   160  of    566.    Elapsed: 0:00:14.
  Batch   200  of    566.    Elapsed: 0:00:18.
  Batch   240  of    566.    Elapsed: 0:00:22.
  Batch   280  of    566.    Elapsed: 0:00:25.
  Batch   320  of    566.    Elapsed: 0:00:29.
  Batch   360  of    566.    Elapsed: 0:00:32.
  Batch   400  of    566.    Elapsed: 0:00:36.
  Batch   440  of    566.    Elapsed: 0:00:40.
  Batch   480  of    566.    Elapsed: 0:00:43.
  Batch   520  of    566.    Elapsed: 0:00:47.
  Batch   560  of    566.    Elapsed: 0:00:50.

  Average training loss: 1.49
  Training epcoh took: 0:00:51

Running Validation...
  Accuracy: 0.72
  Validation Loss: 0.91
  Validation took: 0:00:02

Training...
  Batch    40  of    566.    Elapsed: 0:00:04.
  Batch    80  of    566.    Elapsed: 0:00:07.
  Batch   120  of    566.    Elapsed: 0:00:11.
  Batch   160  of

In [115]:
# SAVE FINE-TUNED MODEL TO DISK
#
# The whole model object is serialized, not just its state dict.
# NOTE(review): loading such a file requires the model's class to be
# importable — confirm downstream loaders expect this format.

checkpoint_path = './torch-cache/test11_base_256.ckpt'
torch.save(model, checkpoint_path)

# model_e2_b_cw1_s128 -> 0.80
# model_e3_b_cw1_s128 -> 0.81
# model_e4_b_cw1_s128 -> 0.85

# model_e2_b_cw1_s256 -> 0.87
# model_e3_b_cw1_s256 -> 0.91
# model_e4_b_cw1_s256 -> 0.92

# model_e2_b_cw1_s512 -> 0.80
# model_e3_b_cw1_s512 -> 0.90
# model_e4_b_cw1_s512 -> 0.96

# model_e4_l_cw1_s256 -> 0.93
# model_e4_l_cw1_s128 -> ???