# Alejandro Paredes, Parameter tuning of BERT

https://arunm8489.medium.com/understanding-distil-bert-in-depth-5f2ca92cf1ed

In [1]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
else:
    print("CUDA is not available.")

CUDA is available!


In [3]:
#!pip install transformers datasets peft evaluate datasets contractions tweet-preprocessor

In [4]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    DistilBertModel,
    DistilBertTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

from tqdm import tqdm

import re
import contractions
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import preprocessor as p

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [5]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

# Load all CSV files in the ./data directory
#data_files = "./data/*.csv"

# Load and combine the datasets
dataset = load_dataset("csv", data_files="./data/2017_1.csv")#data_files)

# Filter and split the dataset
df  = dataset['train'].filter(
    lambda example: example['headline'] is not None and example['headline'].strip() != ''
).train_test_split(test_size=0.1)

# Display the resulting dataset
df 

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 14672
    })
})

In [6]:
model_checkpoint = 'distilbert-base-uncased'

#Define label maps
id2label = {0:"UNDEFINED" ,1:"LEFT",2:"RIGHT",3:"CENTER"}
label2id = {"UNDEFINED": 0, "LEFT": 1, "RIGHT": 2, "CENTER": 3}

tokenizer =  DistilBertTokenizer.from_pretrained(model_checkpoint, add_prefix=True)

In [7]:
#lemmatization and removing stopwords
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

#lemmatizer = WordNetLemmatizer()
#stop_words = set(stopwords.words("english"))

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)

def preprocess(text):
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = p.clean(text)
    return text

In [8]:
for i in range(5):
    print('Original Text: ', df['train']['headline'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['headline'][i])), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['headline'][i])))

#for i in range(2):
    #print('Original Text: ', df['train']['body'][i], '\n')
    #print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['body'][i])), '\n')
    #print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['body'][i])))


Original Text:  Scott Walker’s School Bonus 

Tokenized Text:  ['scott', 'walker', 's', 'school', 'bonus'] 

Token IDs:  [3660, 5232, 1521, 1055, 2082, 6781]
Original Text:  BRIEF-Deutsche Bank hires William White as head of US Life Sciences 

Tokenized Text:  ['brief', '-', 'deutsche', 'bank', 'hires', 'william', 'white', 'as', 'head', 'of', 'us', 'life', 'sciences'] 

Token IDs:  [4766, 1011, 11605, 2924, 28208, 2520, 2317, 2004, 2132, 1997, 2149, 2166, 4163]
Original Text:  State police: 3 dead, including deputy, in rural Arkansas 

Tokenized Text:  ['state', 'police', ':', '3', 'dead', ',', 'including', 'deputy', ',', 'in', 'rural', 'arkansas'] 

Token IDs:  [2110, 2610, 1024, 1017, 2757, 1010, 2164, 4112, 1010, 1999, 3541, 6751]
Original Text:  Republicans are incapable of crafting a humane health care bill 

Tokenized Text:  ['republicans', 'are', 'incapable', 'of', 'craft', '##ing', 'a', 'humane', 'health', 'care', 'bill'] 

Token IDs:  [10643, 2024, 19907, 1997, 7477, 2075, 103

In [9]:
texts = df['train']['headline']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split(' ')) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))

# Repeat for the 'body' column
texts = df['train']['body']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split()) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))


1
40
0
15
17700
87445


# **Creating a custom model**

In [None]:
import torch
from transformers import DistilBertModel

class DistillBERTClass(torch.nn.Module):
    def __init__(self, model_checkpoint='distilbert-base-uncased'):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(model_checkpoint, num_labels=8)

        # Freeze DistilBERT parameters (except for the new layers)
        for param in self.l1.parameters():
            param.requires_grad = False
        
        # Add custom query layers for each transformer layer
        # Custom query low-rank matrices for the attention mechanism (inspired by LoRA)
        self.DocClass_A = torch.nn.Parameter(torch.randn(768, 768))  # Matrix A (query transformation)
        self.DocClass_B = torch.nn.Parameter(torch.randn(768, 768))  # Matrix B (query transformation)

        # Additional layers for classification
        self.dropout = torch.nn.Dropout(0.3)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.fc1 = torch.nn.Linear(768, 1024)  
        self.classifier = torch.nn.Linear(1024, 5)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)

    def add_query_layers(self):
        # Access the attention layers of DistilBERT
        for i, layer in enumerate(self.l1.transformer.layer):
            # Access query (q_lin) and create new layers
            # These layers will be trainable while keeping the base layers frozen
            q_lin = layer.attention.q_lin
            
            # Freezing the original query layer weights, and adding LoRA layers
            q_lin.base_layer.weight.requires_grad = False
            q_lin.base_layer.bias.requires_grad = False
            
            # Add new LoRA layers (A and B matrices)
            q_DocClass_w = torch.nn.Parameter(torch.randn_like(q_lin.base_layer.weight))
            q_DocClass_b = torch.nn.Parameter(torch.randn_like(q_lin.base_layer.bias))
            
            # Register new parameters to the model
            self.register_parameter(f'layer_{i}_lora_A', q_DocClass_w)
            self.register_parameter(f'layer_{i}_lora_B', q_DocClass_b)
            
            # Re-define q_lin as a combination of the base layer and LoRA layers
            q_lin.DocClass_w = q_DocClass_w
            q_lin.DocClass_b = q_DocClass_b

    def forward(self, input_ids, attention_mask):
        output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = self.dropout(pooler)
        pooler = self.fc1(pooler)
        pooler = self.relu(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output


In [11]:
from torch.optim.lr_scheduler import StepLR

# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 10
EPOCHS = 10
LEARNING_RATE = 1e-05



model = DistillBERTClass()
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
scheduler = StepLR(optimizer, step_size=2, gamma=0.35)


model

AttributeError: 'Linear' object has no attribute 'base_layer'

In [None]:
for name, param in model.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

l1.embeddings.word_embeddings.weight: requires_grad=False
l1.embeddings.position_embeddings.weight: requires_grad=False
l1.embeddings.LayerNorm.weight: requires_grad=False
l1.embeddings.LayerNorm.bias: requires_grad=False
l1.transformer.layer.0.attention.q_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.q_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.k_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.k_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.v_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.v_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.out_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.out_lin.bias: requires_grad=False
l1.transformer.layer.0.sa_layer_norm.weight: requires_grad=False
l1.transformer.layer.0.sa_layer_norm.bias: requires_grad=False
l1.transformer.layer.0.ffn.lin1.weight: requires_grad=False
l1.transformer.layer.0.ffn.lin1.bias: requires_grad=False


In [None]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [None]:
def tokenize_function(examples):
    #text = examples["body"]
    text = examples["body"]
    labels = examples["political_leaning"]

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,#[preprocess(t) for t in text] ,
        return_tensors = "np",
        padding = True,
        truncation = True,
        max_length = 512
        )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs

#tokenized_dataset = df.map(tokenize_function, batched=True)
#tokenized_dataset

In [None]:
# Define split ratio for validation
train_test_split = df["train"].train_test_split(test_size=0.1)  # 10% for validation
datasets = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"],  # This is your validation set
    "test": df["test"],       # Keep the original test set
})

In [None]:
import re
import contractions
from torch.utils.data import Dataset

# Define the mapping for political leaning categories to numeric values
category_mapping = {
    'LEFT': 0,
    'CENTER': 1,
    'RIGHT': 2,
    'UNDEFINED': 3
}

# Preprocessing function
def preprocess(text):
    """ Preprocess the text to clean it for tokenization """
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))

    text = text.lower()  # Convert to lowercase
    text = contractions.fix(text)  # Expand contractions (e.g., "don't" -> "do not")
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = p.clean(text)  # Clean text using the clean-text library
    return text

class Triage(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.texts = dataset['body']  # Assuming 'text' column contains the raw text
        self.labels = dataset['political_leaning']
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        # Get raw text and label for the current index
        text = self.texts[index]
        label = self.labels[index]
        tokenizer.truncation_side = "left"
        #tokenized_inputs = self.tokenizer(
        tokenized_inputs = self.tokenizer.encode_plus(
            preprocess(text),
            None,
            #return_tensors="pt",
            #padding=True,
            #truncation=True,
            #max_length=self.max_length
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True
        )

        #encoding = tokenize_function({"text": [text], "labels": [label]}, self.tokenizer, self.max_length)
        input_ids = tokenized_inputs['input_ids']  # Remove the batch dimension
        attention_mask = tokenized_inputs['attention_mask']  # Remove the batch dimension

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(category_mapping[self.labels[index]], dtype=torch.float)
        }

    def __len__(self):
        return len(self.texts)


In [None]:
train_dataset = Triage(datasets['train'], tokenizer, max_length=512)
val_dataset = Triage(datasets['validation'], tokenizer, max_length=512)
test_dataset = Triage(datasets['test'], tokenizer, max_length=512)

In [None]:
# Training DataLoader
training_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation DataLoader
val_loader = DataLoader(
    val_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

# Test DataLoader
test_loader = DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

### Training the model

In [None]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
def calculate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.long)

        outputs = model(ids, mask)#, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu


In [None]:
best_val_loss = float("inf")

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 30)

    train_loss = train(epoch)
    val_loss, val_accuracy = valid(model, val_loader)
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "./models/local_run_BERT_body/best_model.pt")
        print("Saved Best Model!")
    scheduler.step()


Epoch 1/10
------------------------------


0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
1it [00:01,  1.45s/it]

Training Loss per 500 steps: 1.6161530017852783
Training Accuracy per 500 steps: 0.0


501it [02:36,  3.52it/s]

Training Loss per 500 steps: 1.5428457041224557
Training Accuracy per 500 steps: 32.375249500998


1001it [05:10,  3.26it/s]

Training Loss per 500 steps: 1.5175689638673246
Training Accuracy per 500 steps: 36.4035964035964


1501it [07:50,  3.21it/s]

Training Loss per 500 steps: 1.4942098008561817
Training Accuracy per 500 steps: 39.693537641572284


2001it [10:28,  3.26it/s]

Training Loss per 500 steps: 1.4723683567061416
Training Accuracy per 500 steps: 42.31384307846077


2501it [13:09,  3.11it/s]

Training Loss per 500 steps: 1.4573322988328625
Training Accuracy per 500 steps: 44.05037984806078


3001it [15:56,  2.95it/s]

Training Loss per 500 steps: 1.4449079675858754
Training Accuracy per 500 steps: 45.481506164611794


3501it [18:41,  3.08it/s]

Training Loss per 500 steps: 1.4355127770571803
Training Accuracy per 500 steps: 46.441016852327905


4001it [21:25,  3.12it/s]

Training Loss per 500 steps: 1.4271307935478745
Training Accuracy per 500 steps: 47.32066983254187


4501it [24:11,  3.17it/s]

Training Loss per 500 steps: 1.4203222286592614
Training Accuracy per 500 steps: 48.02710508775828


5001it [26:59,  3.17it/s]

Training Loss per 500 steps: 1.4139577963428005
Training Accuracy per 500 steps: 48.61427714457108


5501it [29:47,  3.11it/s]

Training Loss per 500 steps: 1.40909990642834
Training Accuracy per 500 steps: 49.07289583712053


6001it [32:36,  2.85it/s]

Training Loss per 500 steps: 1.403704816590665
Training Accuracy per 500 steps: 49.61673054490918


6501it [35:25,  2.96it/s]

Training Loss per 500 steps: 1.3986092389060834
Training Accuracy per 500 steps: 50.10459929241655


7001it [38:09,  3.30it/s]

Training Loss per 500 steps: 1.3939991015193292
Training Accuracy per 500 steps: 50.59277246107699


7501it [40:53,  3.11it/s]

Training Loss per 500 steps: 1.3898318587994674
Training Accuracy per 500 steps: 51.03586188508199


8001it [43:37,  3.19it/s]

Training Loss per 500 steps: 1.387112283472746
Training Accuracy per 500 steps: 51.31233595800525


8501it [46:19,  3.41it/s]

Training Loss per 500 steps: 1.3838841073962607
Training Accuracy per 500 steps: 51.66098106105164


9001it [48:58,  3.41it/s]

Training Loss per 500 steps: 1.3810516423933268
Training Accuracy per 500 steps: 51.953116320408846


9501it [51:36,  3.01it/s]

Training Loss per 500 steps: 1.37823401204085
Training Accuracy per 500 steps: 52.257657088727505


10001it [54:15,  3.26it/s]

Training Loss per 500 steps: 1.3758129998274224
Training Accuracy per 500 steps: 52.504749525047494


10501it [57:00,  3.10it/s]

Training Loss per 500 steps: 1.3730635547249694
Training Accuracy per 500 steps: 52.775926102275974


11001it [59:44,  2.87it/s]

Training Loss per 500 steps: 1.3705200910210642
Training Accuracy per 500 steps: 53.00609035542224


11501it [1:02:31,  2.82it/s]

Training Loss per 500 steps: 1.368373766094485
Training Accuracy per 500 steps: 53.21450308668811


11885it [1:04:38,  3.06it/s]


The Total Accuracy for Epoch 0: 53.394872140086335
Training Loss Epoch: 1.3666074358168199
Training Accuracy Epoch: 53.394872140086335


1it [00:00,  2.46it/s]

Validation Loss per 100 steps: 1.277788758277893
Validation Accuracy per 100 steps: 60.0


1321it [06:59,  3.15it/s]


Validation Loss Epoch: 1.3072979011651154
Validation Accuracy Epoch: 58.93979553199546
Saved Best Model!

Epoch 2/10
------------------------------


1it [00:00,  2.31it/s]

Training Loss per 500 steps: 1.2032194137573242
Training Accuracy per 500 steps: 90.0


501it [02:49,  3.10it/s]

Training Loss per 500 steps: 1.315928783483372
Training Accuracy per 500 steps: 58.08383233532934


1001it [05:40,  3.17it/s]

Training Loss per 500 steps: 1.3130218085947332
Training Accuracy per 500 steps: 58.63136863136863


1501it [08:27,  2.96it/s]

Training Loss per 500 steps: 1.3091357048553756
Training Accuracy per 500 steps: 59.09393737508328


2001it [11:12,  3.20it/s]

Training Loss per 500 steps: 1.3062417942961713
Training Accuracy per 500 steps: 59.44527736131934


2501it [14:00,  2.94it/s]

Training Loss per 500 steps: 1.3057544640615815
Training Accuracy per 500 steps: 59.42423030787685


3001it [16:48,  2.72it/s]

Training Loss per 500 steps: 1.305373816718979
Training Accuracy per 500 steps: 59.480173275574806


3501it [19:37,  3.00it/s]

Training Loss per 500 steps: 1.3059892589042814
Training Accuracy per 500 steps: 59.41159668666096


4001it [22:25,  3.06it/s]

Training Loss per 500 steps: 1.3053471856014753
Training Accuracy per 500 steps: 59.45013746563359


4501it [25:17,  2.54it/s]

Training Loss per 500 steps: 1.305383093977896
Training Accuracy per 500 steps: 59.446789602310595


5001it [28:35,  1.86it/s]

Training Loss per 500 steps: 1.3041611463635998
Training Accuracy per 500 steps: 59.542091581683664


5501it [31:42,  2.95it/s]

Training Loss per 500 steps: 1.304425466539556
Training Accuracy per 500 steps: 59.538265769860026


6001it [34:31,  3.06it/s]

Training Loss per 500 steps: 1.304336841623697
Training Accuracy per 500 steps: 59.55507415430762


6501it [37:19,  3.00it/s]

Training Loss per 500 steps: 1.3029152771598136
Training Accuracy per 500 steps: 59.709275496077524


7001it [40:09,  2.91it/s]

Training Loss per 500 steps: 1.302708188843138
Training Accuracy per 500 steps: 59.73860877017569


7501it [43:03,  3.10it/s]

Training Loss per 500 steps: 1.302828518585624
Training Accuracy per 500 steps: 59.72137048393547


8001it [45:50,  3.15it/s]

Training Loss per 500 steps: 1.3027352384888609
Training Accuracy per 500 steps: 59.722534683164604


8501it [48:21,  3.32it/s]

Training Loss per 500 steps: 1.302699649197931
Training Accuracy per 500 steps: 59.73297259145983


9001it [50:55,  3.41it/s]

Training Loss per 500 steps: 1.3025540443820802
Training Accuracy per 500 steps: 59.735584935007225


9501it [53:26,  2.98it/s]

Training Loss per 500 steps: 1.3017417118745331
Training Accuracy per 500 steps: 59.82738659088517


10001it [55:56,  3.13it/s]

Training Loss per 500 steps: 1.301117408157599
Training Accuracy per 500 steps: 59.9020097990201


10501it [58:26,  3.36it/s]

Training Loss per 500 steps: 1.3008907526439173
Training Accuracy per 500 steps: 59.92000761832207


11001it [1:00:58,  3.19it/s]

Training Loss per 500 steps: 1.300824186995012
Training Accuracy per 500 steps: 59.92364330515408


11501it [1:03:27,  3.60it/s]

Training Loss per 500 steps: 1.3003459801550337
Training Accuracy per 500 steps: 59.98521867663681


11885it [1:05:22,  3.03it/s]


The Total Accuracy for Epoch 1: 60.049982750061005
Training Loss Epoch: 1.2997019212337215
Training Accuracy Epoch: 60.049982750061005


1it [00:00,  2.80it/s]

Validation Loss per 100 steps: 1.2994295358657837
Validation Accuracy per 100 steps: 70.0


1321it [07:11,  3.06it/s]


Validation Loss Epoch: 1.2805591528384999
Validation Accuracy Epoch: 61.893222264293826
Saved Best Model!

Epoch 3/10
------------------------------


1it [00:00,  3.26it/s]

Training Loss per 500 steps: 1.5001012086868286
Training Accuracy per 500 steps: 40.0


501it [02:27,  3.32it/s]

Training Loss per 500 steps: 1.2819078206540107
Training Accuracy per 500 steps: 62.29540918163673


1001it [04:53,  3.63it/s]

Training Loss per 500 steps: 1.2862963100056073
Training Accuracy per 500 steps: 61.75824175824176


1501it [07:20,  3.29it/s]

Training Loss per 500 steps: 1.2870408196515992
Training Accuracy per 500 steps: 61.652231845436376


2001it [09:47,  3.44it/s]

Training Loss per 500 steps: 1.2873815163739142
Training Accuracy per 500 steps: 61.58420789605197


2501it [12:13,  3.30it/s]

Training Loss per 500 steps: 1.287135664533015
Training Accuracy per 500 steps: 61.607357057177126


3001it [14:39,  3.31it/s]

Training Loss per 500 steps: 1.2862713568729387
Training Accuracy per 500 steps: 61.67277574141953


3501it [17:07,  3.60it/s]

Training Loss per 500 steps: 1.2857452420737532
Training Accuracy per 500 steps: 61.65381319622965


4001it [19:34,  3.45it/s]

Training Loss per 500 steps: 1.2853905905994347
Training Accuracy per 500 steps: 61.624593851537114


4501it [22:01,  3.66it/s]

Training Loss per 500 steps: 1.286024919723675
Training Accuracy per 500 steps: 61.506331926238616


5001it [24:29,  3.34it/s]

Training Loss per 500 steps: 1.2857016411763957
Training Accuracy per 500 steps: 61.54169166166766


5501it [26:55,  3.48it/s]

Training Loss per 500 steps: 1.2848771393266598
Training Accuracy per 500 steps: 61.6015269950918


6001it [29:23,  3.36it/s]

Training Loss per 500 steps: 1.284538881557422
Training Accuracy per 500 steps: 61.64305949008499


6501it [31:50,  3.21it/s]

Training Loss per 500 steps: 1.2845531863499524
Training Accuracy per 500 steps: 61.644362405783724


7001it [34:18,  3.49it/s]

Training Loss per 500 steps: 1.285039492027025
Training Accuracy per 500 steps: 61.568347378945866


7501it [36:46,  3.26it/s]

Training Loss per 500 steps: 1.2848063242417973
Training Accuracy per 500 steps: 61.59578722836955


8001it [39:13,  3.61it/s]

Training Loss per 500 steps: 1.285065951689737
Training Accuracy per 500 steps: 61.54230721159855


8501it [41:43,  3.33it/s]

Training Loss per 500 steps: 1.2856692822733118
Training Accuracy per 500 steps: 61.471591577461474


9001it [44:09,  3.46it/s]

Training Loss per 500 steps: 1.2857345832664295
Training Accuracy per 500 steps: 61.45983779580047


9501it [46:37,  3.32it/s]

Training Loss per 500 steps: 1.2856327539970342
Training Accuracy per 500 steps: 61.460898852752344


10001it [49:05,  3.42it/s]

Training Loss per 500 steps: 1.2856004309587485
Training Accuracy per 500 steps: 61.455854414558544


10501it [52:06,  1.68it/s]

Training Loss per 500 steps: 1.2848961067510758
Training Accuracy per 500 steps: 61.52366441291306


11001it [54:53,  3.62it/s]

Training Loss per 500 steps: 1.284922920241268
Training Accuracy per 500 steps: 61.528951913462414


11501it [57:48,  2.22it/s]

Training Loss per 500 steps: 1.2850420068589596
Training Accuracy per 500 steps: 61.52769324406573


11885it [1:00:43,  3.26it/s]


The Total Accuracy for Epoch 2: 61.5814407485632
Training Loss Epoch: 1.2845688478706963
Training Accuracy Epoch: 61.5814407485632


1it [00:00,  1.77it/s]

Validation Loss per 100 steps: 1.2670718431472778
Validation Accuracy per 100 steps: 70.0


1321it [09:30,  2.32it/s]


Validation Loss Epoch: 1.2745207325695682
Validation Accuracy Epoch: 62.529344945096554
Saved Best Model!

Epoch 4/10
------------------------------


1it [00:00,  1.79it/s]

Training Loss per 500 steps: 1.4189057350158691
Training Accuracy per 500 steps: 50.0


501it [03:42,  3.39it/s]

Training Loss per 500 steps: 1.2849637805344816
Training Accuracy per 500 steps: 61.477045908183634


1001it [06:09,  3.41it/s]

Training Loss per 500 steps: 1.2870539551252846
Training Accuracy per 500 steps: 61.408591408591406


1501it [08:37,  3.32it/s]

Training Loss per 500 steps: 1.287765516153103
Training Accuracy per 500 steps: 61.3057961359094


2001it [12:30,  1.95it/s]

Training Loss per 500 steps: 1.283299863785282
Training Accuracy per 500 steps: 61.75912043978011


2501it [16:45,  3.46it/s]

Training Loss per 500 steps: 1.2827683824484275
Training Accuracy per 500 steps: 61.819272291083564


3001it [19:14,  3.36it/s]

Training Loss per 500 steps: 1.282546733169308
Training Accuracy per 500 steps: 61.86604465178274


3501it [21:43,  3.55it/s]

Training Loss per 500 steps: 1.2811984007135864
Training Accuracy per 500 steps: 62.01656669522993


4001it [24:39,  1.61it/s]

Training Loss per 500 steps: 1.2795644298668833
Training Accuracy per 500 steps: 62.15696075981005


4501it [29:26,  1.99it/s]

Training Loss per 500 steps: 1.280028994137434
Training Accuracy per 500 steps: 62.095089980004445


5001it [33:54,  1.88it/s]

Training Loss per 500 steps: 1.2796291888081963
Training Accuracy per 500 steps: 62.13557288542292


5501it [38:19,  1.98it/s]

Training Loss per 500 steps: 1.2793267525514458
Training Accuracy per 500 steps: 62.16869660061807


6001it [42:46,  2.01it/s]

Training Loss per 500 steps: 1.2787563534324078
Training Accuracy per 500 steps: 62.22796200633228


6501it [47:12,  2.22it/s]

Training Loss per 500 steps: 1.2789686372914217
Training Accuracy per 500 steps: 62.19812336563606


7001it [51:40,  1.84it/s]

Training Loss per 500 steps: 1.280237092186495
Training Accuracy per 500 steps: 62.01542636766176


7501it [56:08,  1.49it/s]

Training Loss per 500 steps: 1.2801373528013418
Training Accuracy per 500 steps: 62.01839754699373


8001it [1:00:36,  1.98it/s]

Training Loss per 500 steps: 1.2799388371159353
Training Accuracy per 500 steps: 62.03599550056243


8501it [1:05:02,  1.99it/s]

Training Loss per 500 steps: 1.279969088197694
Training Accuracy per 500 steps: 62.05034701799788


9001it [1:09:32,  1.78it/s]

Training Loss per 500 steps: 1.2800869122238612
Training Accuracy per 500 steps: 62.041995333851794


9501it [1:14:00,  2.13it/s]

Training Loss per 500 steps: 1.2799595127280368
Training Accuracy per 500 steps: 62.04504788969582


10001it [1:18:29,  2.14it/s]

Training Loss per 500 steps: 1.280078497067438
Training Accuracy per 500 steps: 62.02379762023798


10501it [1:22:52,  1.67it/s]

Training Loss per 500 steps: 1.2797841061729327
Training Accuracy per 500 steps: 62.04456718407771


11001it [1:27:10,  1.70it/s]

Training Loss per 500 steps: 1.2794494666237037
Training Accuracy per 500 steps: 62.07072084355968


11501it [1:31:32,  2.43it/s]

Training Loss per 500 steps: 1.2792476451393833
Training Accuracy per 500 steps: 62.0841665942092


11885it [1:34:50,  2.09it/s]


The Total Accuracy for Epoch 3: 62.071170723908416
Training Loss Epoch: 1.2792797826948314
Training Accuracy Epoch: 62.071170723908416


1it [00:00,  1.59it/s]

Validation Loss per 100 steps: 1.2978764772415161
Validation Accuracy per 100 steps: 60.0


1321it [10:54,  2.02it/s]


Validation Loss Epoch: 1.2680422133969864
Validation Accuracy Epoch: 63.25634229458539
Saved Best Model!

Epoch 5/10
------------------------------


1it [00:00,  1.27it/s]

Training Loss per 500 steps: 1.3714884519577026
Training Accuracy per 500 steps: 50.0


501it [04:23,  2.23it/s]

Training Loss per 500 steps: 1.277561011190662
Training Accuracy per 500 steps: 61.996007984031934


1001it [08:47,  1.92it/s]

Training Loss per 500 steps: 1.270370101357078
Training Accuracy per 500 steps: 62.82717282717283


1501it [13:10,  1.88it/s]

Training Loss per 500 steps: 1.276824814053395
Training Accuracy per 500 steps: 62.22518321119254


2001it [17:31,  1.97it/s]

Training Loss per 500 steps: 1.2788916100328531
Training Accuracy per 500 steps: 61.95402298850575


2501it [22:01,  1.93it/s]

Training Loss per 500 steps: 1.2791457629260994
Training Accuracy per 500 steps: 61.92323070771691


3001it [26:30,  1.93it/s]

Training Loss per 500 steps: 1.2783489058868602
Training Accuracy per 500 steps: 61.97934021992669


3501it [31:02,  1.95it/s]

Training Loss per 500 steps: 1.2782581099712043
Training Accuracy per 500 steps: 62.050842616395315


4001it [35:33,  2.38it/s]

Training Loss per 500 steps: 1.2764977226105967
Training Accuracy per 500 steps: 62.25693576605848


4501it [40:08,  1.79it/s]

Training Loss per 500 steps: 1.276311634395208
Training Accuracy per 500 steps: 62.31948455898689


5001it [44:38,  1.81it/s]

Training Loss per 500 steps: 1.2758566223604872
Training Accuracy per 500 steps: 62.39152169566087


5501it [49:08,  2.09it/s]

Training Loss per 500 steps: 1.2766397576227206
Training Accuracy per 500 steps: 62.314124704599166


6001it [53:36,  1.83it/s]

Training Loss per 500 steps: 1.2780841012494482
Training Accuracy per 500 steps: 62.18463589401767


6501it [58:09,  1.96it/s]

Training Loss per 500 steps: 1.277408679198016
Training Accuracy per 500 steps: 62.261190586063684


7001it [1:13:06,  1.36s/it]

Training Loss per 500 steps: 1.2772841739273126
Training Accuracy per 500 steps: 62.291101271246966


7501it [1:24:51,  1.53s/it]

Training Loss per 500 steps: 1.2773994006024822
Training Accuracy per 500 steps: 62.27969604052793


8001it [1:37:04,  1.35s/it]

Training Loss per 500 steps: 1.2771544552135432
Training Accuracy per 500 steps: 62.3034620672416


8501it [1:48:50,  1.49s/it]

Training Loss per 500 steps: 1.277109654499999
Training Accuracy per 500 steps: 62.315021762145626


8825it [1:56:51,  1.35s/it]

In [None]:
#!cp best_model.pt '/content/gdrive/MyDrive/ColabNotebooks/NLP Project/distilBERT/'

In [None]:
model.load_state_dict(torch.load("best_model.pt"))
model.to(device)

  model.load_state_dict(torch.load("best_model.pt"))


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Test function
def test_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Testing"):
            # Move batch to GPU/CPU
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Collect predictions and true labels
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="weighted")

    print("\nTest Results")
    print("-" * 30)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return accuracy, precision, recall, f1

# After training and validation, evaluate on the test set
print("\nEvaluating on Test Set")
test_accuracy, test_precision, test_recall, test_f1 = test_model(model, test_loader, device)


Evaluating on Test Set


Testing: 100%|██████████| 1796/1796 [04:12<00:00,  7.12it/s]



Test Results
------------------------------
Accuracy: 0.5100
Precision: 0.5181
Recall: 0.5100
F1-score: 0.4948


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot function for metrics
def plot_metrics(metrics, metric_names, title):
    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(metric_names, metrics, color=['skyblue', 'orange', 'green', 'red'])

    # Add value annotations on bars
    for bar in bars:
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                f"{bar.get_height():.4f}", ha='center', fontsize=10)

    ax.set_ylim(0, 1)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel("Score", fontsize=14)
    ax.set_xlabel("Metrics", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

# After testing, plot the metrics
print("\nEvaluating on Test Set")
test_accuracy, test_precision, test_recall, test_f1 = test_model(model, test_loader, device)

# Metrics and their names
metrics = [test_accuracy, test_precision, test_recall, test_f1]
metric_names = ["Accuracy", "Precision", "Recall", "F1-Score"]

# Plot the test results
plot_metrics(metrics, metric_names, title="Test Metrics Overview")



Evaluating on Test Set


NameError: name 'test_model' is not defined

### Other form of training

In [None]:
'''
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions
                                       , references=labels)}
'''

In [None]:
'''
lr = 1e-3
batch_size = 10
num_epochs = 10

training_args = TrainingArguments(
    output_dir=""+model_checkpoint+"lora-txt",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)
'''

In [None]:
#trainer.train()

### Load pretrained model

In [None]:
"""
# Load model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("trained_model_gral_imbd.pth", map_location=device)

text_list = ['''President-elect Trump announced on Tuesday night that he intends to appoint Linda McMahon, former CEO of World Wrestling Entertainment (WWE), to lead the Department of Education. His announcement, which was posted on Truth Social, came hours after two sources told Fox News that McMahon was likely to be picked. "It is my great honor to announce that Linda McMahon, former Administrator of the Small Business Administration, will be the United States Secretary of Education," Trump's statement read.
"As Secretary of Education, Linda will fight tirelessly to expand Choice to every State in America, and empower parents to make the best Education decisions for their families," the press release added. "Linda served for two years on the Connecticut Board of Education, where she was one of fifteen members overseeing all Public Education in the State, including its Technical High School system."''',
             '''Donald Trump believes presidents have almost absolute power. In his second term, there will be few political or legal restraints to check him. The president-elects sweeping victory over Vice President Kamala Harris suddenly turned the theoretical notion that he will indulge his autocratic instincts into a genuine possibility.When Trump returns to the White House in January as one of the most powerful presidents in history, hell be able to take advantage of his own filleting of guardrails during his first presidency, which he continued through legal maneuverings out of office.''',
             '''Nearly 100 Democrats, including Salud Carbajal, requested the Ethics Committee release its report on former Congressman Matt Gaetz's misconduct allegations. The letter, led by Rep. Sean Casten, emphasized that the Senate needs information for Gaetz's attorney general nomination. House Speaker Mike Johnson opposed releasing the report, stating Gaetz is now a "private citizen" and outside the panel's jurisdiction.'''
             , ''' A South Dakota judge dismissed a lawsuit from the anti-abortion group Life Defense targeting an abortion rights measure that voters later rejected.
Judge John Pekas dismissed the lawsuit at the request of Life Defense, which had challenged the ballot measure's petitions.
Voters in nine states, including South Dakota, rejected abortion rights measures during the November election. '''
             ]
model.to('cuda')
print('Trained model predictions')
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')

  logits = model(inputs).logits
  predictions = torch.max(logits,1).indices

  #print(f'{text} - {id2label[predictions.tolist()[0]]}')
  print(f'{id2label[predictions.tolist()[0]]}')
"""