# Prepare for Training

### Tokenizer

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")

### Load the tokenized_dataset

In [1]:
from datasets import load_from_disk

tokenized_dataset= load_from_disk("esg_tokenized")

### Convert to PyTorch tensors

In [2]:
tokenized_dataset.set_format("torch")

## Data Loader

Batch size can be increased if you have enough vram

In [6]:
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# For dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Prepare the data in batch size of 2 with dynamic padding, shuffles at each epoch
train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=2, collate_fn=data_collator
)

# Prepare validation data in batch size of 2
val_dataloader = DataLoader(
    tokenized_dataset["validation"], batch_size=2, collate_fn=data_collator
)

# Prepare validation data in batch size of 2
test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=2, collate_fn=data_collator
)

## Loading and modifying the model for regression

In [7]:
from transformers import LongformerModel
import torch.nn as nn

class LongformerForRegression(nn.Module):
    def __init__(self):
        super().__init__()
        # Load the pretrained Longformer model (handles up to 4096 tokens)
        self.longformer = LongformerModel.from_pretrained("allenai/longformer-base-4096")
        
        # Dropout for regularization (prevents overfitting)
        self.dropout = nn.Dropout(0.1)
        
        # Final regression layer to predict 4 scores: e_score, s_score, g_score, total_score
        self.regressor = nn.Linear(self.longformer.config.hidden_size, 4)

    def forward(self, input_ids, attention_mask):
        # Forward pass through Longformer
        outputs = self.longformer(input_ids=input_ids, attention_mask=attention_mask)
        
        # Try to use pooled output (if available)
        pooled_output = outputs.pooler_output
        
        # If pooled output is None (some models don't provide it), use mean pooling over all tokens
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state.mean(dim=1)  # shape: (batch_size, hidden_size)
        
        # Apply dropout
        pooled_output = self.dropout(pooled_output)
        
        # Pass through regression head to get 4 continuous outputs
        return self.regressor(pooled_output)

### Test to see if model works

In [8]:
import torch

model = LongformerForRegression()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
model.eval()

batch = next(iter(train_dataloader))  # get one batch
batch = {k: v.to(device) for k, v in batch.items()}  # move to device

inputs = {k: batch[k] for k in ["input_ids", "attention_mask"]}  # only model inputs
outputs = model(**inputs)

print(outputs.shape)  # should be [batch_size, 4]
print(outputs)        # raw predictions for the 4 scores

torch.Size([2, 4])
tensor([[ 0.2830, -0.0491,  0.1723,  0.2213],
        [ 0.2815, -0.0459,  0.1947,  0.2289]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


As expected 2 batches of scores were outputted, each batch contains 4 scores corresponding to e_score, s_score, g_score, combined_score

## Optimizer

In [9]:
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

## Scheduler

In [10]:
# Define scheduler to change learning rate
from transformers import get_scheduler

num_epochs = 3
# num epochs * num of batches
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear", # Type of scheduler (linear decay)
    optimizer=optimizer,
    num_warmup_steps=0, # No warm-up period, meaning the learning rate starts at the maximum value right away and decreases linearly
    num_training_steps=num_training_steps,
)
print(num_training_steps)

16083


## Move Model to GPU

In [None]:
# Move model to GPU if avail or CPU
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

## Set Model to Train

In [11]:
# Set the model to training mode
model.train()  # This ensures layers like dropout are active during training

LongformerForRegression(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0-11): 12 x LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_g

# Training Loop

In [13]:
from tqdm.auto import tqdm
import torch.nn as nn
import copy

loss_fn = nn.MSELoss()

best_val_loss = float('inf')
patience = 3         # how many epochs to wait before stopping
patience_counter = 0
best_model_state = None

for epoch in range(num_epochs):
    # === TRAINING ===
    model.train()
    train_loss = 0
    for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}

        inputs = {
            "input_ids": batch["input_ids"],
            "attention_mask": batch["attention_mask"]
        }
        labels = torch.stack([
            batch['e_score'],
            batch['s_score'],
            batch['g_score'],
            batch['total_score']
        ], dim=1).float()

        outputs = model(**inputs)
        loss = loss_fn(outputs, labels)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    avg_train_loss = train_loss / len(train_dataloader)
    print(f"\nEpoch {epoch+1} Training Loss: {avg_train_loss:.4f}")

    # === VALIDATION ===
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc=f"Validation Epoch {epoch+1}"):
            batch = {k: v.to(device) for k, v in batch.items()}

            inputs = {
                "input_ids": batch["input_ids"],
                "attention_mask": batch["attention_mask"]
            }
            labels = torch.stack([
                batch['e_score'],
                batch['s_score'],
                batch['g_score'],
                batch['total_score']
            ], dim=1).float()

            outputs = model(**inputs)
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1} Validation Loss: {avg_val_loss:.4f}")

    # === EARLY STOPPING CHECK ===
    if avg_val_loss < best_val_loss:
        print("Validation loss improved — saving model.\n")
        best_val_loss = avg_val_loss
        patience_counter = 0
        best_model_state = copy.deepcopy(model.state_dict())
    else:
        patience_counter += 1
        print(f"No improvement. Patience: {patience_counter}/{patience}\n")
        if patience_counter >= patience:
            print("Early stopping triggered.")
            break

# === Load best model ===
if best_model_state:
    model.load_state_dict(best_model_state)
    print("Loaded best model from saved state.")


Training Epoch 1:   0%|          | 0/5361 [00:00<?, ?it/s]

Input ids are automatically padded to be a multiple of `config.attention_window`: 512



Epoch 1 Training Loss: 30.6857


Validation Epoch 1:   0%|          | 0/594 [00:00<?, ?it/s]

Epoch 1 Validation Loss: 34.4633
Validation loss improved — saving model.



Training Epoch 2:   0%|          | 0/5361 [00:00<?, ?it/s]


Epoch 2 Training Loss: 29.0902


Validation Epoch 2:   0%|          | 0/594 [00:00<?, ?it/s]

Epoch 2 Validation Loss: 34.4729
No improvement. Patience: 1/3



Training Epoch 3:   0%|          | 0/5361 [00:00<?, ?it/s]


Epoch 3 Training Loss: 29.0468


Validation Epoch 3:   0%|          | 0/594 [00:00<?, ?it/s]

Epoch 3 Validation Loss: 34.6825
No improvement. Patience: 2/3

Loaded best model from saved state.


# Save the model weights

In [16]:
import torch

# Save model weights
output_dir = "C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score"
torch.save(model.state_dict(), f"{output_dir}/pytorch_model.bin")

# Save tokenizer as usual (since it's from Hugging Face)
tokenizer.save_pretrained(output_dir)

('C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\tokenizer_config.json',
 'C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\special_tokens_map.json',
 'C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\vocab.json',
 'C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\merges.txt',
 'C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\added_tokens.json',
 'C:/Users/steve/HuggingFace Models/LongFormer_ESG_Score\\tokenizer.json')