# Alejandro Paredes, Parameter tuning of BERT

https://arunm8489.medium.com/understanding-distil-bert-in-depth-5f2ca92cf1ed

In [1]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available!")
else:
    print("CUDA is not available.")

CUDA is available!


In [3]:
#!pip install transformers datasets peft evaluate datasets contractions tweet-preprocessor

In [4]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    DistilBertModel,
    DistilBertTokenizer,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

from tqdm import tqdm

import re
import contractions
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import preprocessor as p

# Setting up the device for GPU usage
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

device

  from .autonotebook import tqdm as notebook_tqdm


'cuda'

In [5]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

# Load all CSV files in the ./data directory
#data_files = "./data/*.csv"

# Load and combine the datasets
dataset = load_dataset("csv", data_files="./data/2017_1.csv")#data_files)

# Filter and split the dataset
df  = dataset['train'].filter(
    lambda example: example['headline'] is not None and example['headline'].strip() != ''
).train_test_split(test_size=0.1)

# Display the resulting dataset
df 

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 14672
    })
})

In [6]:
model_checkpoint = 'distilbert-base-uncased'

#Define label maps
id2label = {0:"UNDEFINED" ,1:"LEFT",2:"RIGHT",3:"CENTER"}
label2id = {"UNDEFINED": 0, "LEFT": 1, "RIGHT": 2, "CENTER": 3}

tokenizer =  DistilBertTokenizer.from_pretrained(model_checkpoint, add_prefix=True)

In [7]:
#lemmatization and removing stopwords
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('stopwords')

#lemmatizer = WordNetLemmatizer()
#stop_words = set(stopwords.words("english"))

p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)

def preprocess(text):
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = p.clean(text)
    return text

In [8]:
for i in range(5):
    print('Original Text: ', df['train']['headline'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['headline'][i])), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['headline'][i])))

#for i in range(2):
    #print('Original Text: ', df['train']['body'][i], '\n')
    #print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['body'][i])), '\n')
    #print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['body'][i])))


Original Text:  The Merriam-Webster Dictionary Has Been Trolling Trump On Twitter For Months 

Tokenized Text:  ['the', 'mer', '##riam', '-', 'webster', 'dictionary', 'has', 'been', 'troll', '##ing', 'trump', 'on', 'twitter', 'for', 'months'] 

Token IDs:  [1996, 21442, 25557, 1011, 11635, 9206, 2038, 2042, 18792, 2075, 8398, 2006, 10474, 2005, 2706]
Original Text:  White House Takes Flak for Letting Russian State Media Into Oval Office 

Tokenized Text:  ['white', 'house', 'takes', 'fl', '##ak', 'for', 'letting', 'russian', 'state', 'media', 'into', 'oval', 'office'] 

Token IDs:  [2317, 2160, 3138, 13109, 4817, 2005, 5599, 2845, 2110, 2865, 2046, 9242, 2436]
Original Text:  Selle: Timing is everything for protesting scientists 

Tokenized Text:  ['sell', '##e', ':', 'timing', 'is', 'everything', 'for', 'protesting', 'scientists'] 

Token IDs:  [5271, 2063, 1024, 10984, 2003, 2673, 2005, 21248, 6529]
Original Text:  Richard Collins III's death a grim reminder that hate thrives - even 

In [9]:
texts = df['train']['headline']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split(' ')) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))

# Repeat for the 'body' column
texts = df['train']['body']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split()) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))


1
40
0
15
17700
87447


# **Creating a custom model**

In [31]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model.

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained(model_checkpoint, num_labels=8)

        # Freeze DistilBERT parameters
        for param in self.l1.parameters():
            param.requires_grad = False

        self.dropout = torch.nn.Dropout(0.3)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.fc1 = torch.nn.Linear(768, 1024)  # Input dimension is 768 for BERT
        #self.fc2 = torch.nn.Linear(1024, 512)
        self.classifier = torch.nn.Linear(1024, 5)
        self.relu = torch.nn.ReLU()
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)


    def forward(self, input_ids, attention_mask):
        output = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = self.dropout(pooler)
        pooler = self.fc1(pooler)
        pooler = self.relu(pooler)
        pooler = self.dropout(pooler)
        #pooler = self.fc2(pooler)
        #pooler = self.relu(pooler)
        #pooler = self.dropout(pooler)
        #pooler = self.fc3(pooler)
        #pooler = self.softmax(pooler)
        output = self.classifier(pooler)
        output = self.softmax(output)
        return output

In [38]:
from torch.optim.lr_scheduler import StepLR

# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 10
VALID_BATCH_SIZE = 10
EPOCHS = 10
LEARNING_RATE = 1e-05



model = DistillBERTClass()
model.to(device)

# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)
scheduler = StepLR(optimizer, step_size=2, gamma=0.15)


model

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [22]:
for name, param in model.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

l1.embeddings.word_embeddings.weight: requires_grad=False
l1.embeddings.position_embeddings.weight: requires_grad=False
l1.embeddings.LayerNorm.weight: requires_grad=False
l1.embeddings.LayerNorm.bias: requires_grad=False
l1.transformer.layer.0.attention.q_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.q_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.k_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.k_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.v_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.v_lin.bias: requires_grad=False
l1.transformer.layer.0.attention.out_lin.weight: requires_grad=False
l1.transformer.layer.0.attention.out_lin.bias: requires_grad=False
l1.transformer.layer.0.sa_layer_norm.weight: requires_grad=False
l1.transformer.layer.0.sa_layer_norm.bias: requires_grad=False
l1.transformer.layer.0.ffn.lin1.weight: requires_grad=False
l1.transformer.layer.0.ffn.lin1.bias: requires_grad=False


In [23]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [24]:
def tokenize_function(examples):
    #text = examples["body"]
    text = examples["body"]
    labels = examples["political_leaning"]

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,#[preprocess(t) for t in text] ,
        return_tensors = "np",
        padding = True,
        truncation = True,
        max_length = 512
        )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs

#tokenized_dataset = df.map(tokenize_function, batched=True)
#tokenized_dataset

In [25]:
# Define split ratio for validation
train_test_split = df["train"].train_test_split(test_size=0.1)  # 10% for validation
datasets = DatasetDict({
    "train": train_test_split["train"],
    "validation": train_test_split["test"],  # This is your validation set
    "test": df["test"],       # Keep the original test set
})

In [26]:
import re
import contractions
from torch.utils.data import Dataset

# Define the mapping for political leaning categories to numeric values
category_mapping = {
    'LEFT': 0,
    'CENTER': 1,
    'RIGHT': 2,
    'UNDEFINED': 3
}

# Preprocessing function
def preprocess(text):
    """ Preprocess the text to clean it for tokenization """
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))

    text = text.lower()  # Convert to lowercase
    text = contractions.fix(text)  # Expand contractions (e.g., "don't" -> "do not")
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Remove non-ASCII characters
    text = p.clean(text)  # Clean text using the clean-text library
    return text

class Triage(Dataset):
    def __init__(self, dataset, tokenizer, max_length):
        self.texts = dataset['body']  # Assuming 'text' column contains the raw text
        self.labels = dataset['political_leaning']
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, index):
        # Get raw text and label for the current index
        text = self.texts[index]
        label = self.labels[index]
        tokenizer.truncation_side = "left"
        #tokenized_inputs = self.tokenizer(
        tokenized_inputs = self.tokenizer.encode_plus(
            preprocess(text),
            None,
            #return_tensors="pt",
            #padding=True,
            #truncation=True,
            #max_length=self.max_length
            add_special_tokens=True,
            max_length=self.max_length,
            pad_to_max_length=True
        )

        #encoding = tokenize_function({"text": [text], "labels": [label]}, self.tokenizer, self.max_length)
        input_ids = tokenized_inputs['input_ids']  # Remove the batch dimension
        attention_mask = tokenized_inputs['attention_mask']  # Remove the batch dimension

        return {
            'input_ids': torch.tensor(input_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels': torch.tensor(category_mapping[self.labels[index]], dtype=torch.float)
        }

    def __len__(self):
        return len(self.texts)


In [27]:
train_dataset = Triage(datasets['train'], tokenizer, max_length=512)
val_dataset = Triage(datasets['validation'], tokenizer, max_length=512)
test_dataset = Triage(datasets['test'], tokenizer, max_length=512)

In [28]:
# Training DataLoader
training_loader = DataLoader(
    train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    collate_fn=data_collator,
)

# Validation DataLoader
val_loader = DataLoader(
    val_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

# Test DataLoader
test_loader = DataLoader(
    test_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    collate_fn=data_collator,
)

### Training the model

In [29]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model
def calculate_accuracy(preds, targets):
    n_correct = (preds==targets).sum().item()
    return n_correct

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['input_ids'].to(device, dtype = torch.long)
        mask = data['attention_mask'].to(device, dtype = torch.long)
        #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['labels'].to(device, dtype = torch.long)

        outputs = model(ids, mask)#, token_type_ids)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calculate_accuracy(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)

        if _%500==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples
            print(f"Training Loss per 500 steps: {loss_step}")
            print(f"Training Accuracy per 500 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return

def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0; tr_loss=0; nb_tr_steps=0; nb_tr_examples=0
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['input_ids'].to(device, dtype = torch.long)
            mask = data['attention_mask'].to(device, dtype = torch.long)
            #token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['labels'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calculate_accuracy(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)

            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")

    return epoch_loss, epoch_accu


In [None]:
best_val_loss = float("inf")

# Load model checkpoint
checkpoint = torch.load("./models/local_run_BERT_body/best_model.pt")
#print(checkpoint.keys())
model.load_state_dict(checkpoint)


for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 30)

    train_loss = train(epoch)
    val_loss, val_accuracy = valid(model, val_loader)
    for param_group in optimizer.param_groups:
        print("Learning rate:", param_group['lr'])
    # Save the best model
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "./models/local_run_BERT_body_v2/best_model.pt")
        print("Saved Best Model!")
    if val_accuracy > 83:
        break
    scheduler.step()

  checkpoint = torch.load("./models/local_run_BERT_body/best_model.pt")



Epoch 1/10
------------------------------


1it [00:01,  1.00s/it]

Training Loss per 500 steps: 1.2654756307601929
Training Accuracy per 500 steps: 60.0


501it [02:33,  3.12it/s]

Training Loss per 500 steps: 1.283077555740189
Training Accuracy per 500 steps: 61.73652694610779


1001it [05:01,  3.20it/s]

Training Loss per 500 steps: 1.2763246492548779
Training Accuracy per 500 steps: 62.42757242757243


1501it [07:32,  3.50it/s]

Training Loss per 500 steps: 1.2800937439504263
Training Accuracy per 500 steps: 62.03197868087941


2001it [10:00,  3.29it/s]

Training Loss per 500 steps: 1.280411941328387
Training Accuracy per 500 steps: 61.94902548725637


2501it [12:31,  3.34it/s]

Training Loss per 500 steps: 1.2778922111356035
Training Accuracy per 500 steps: 62.19512195121951


3001it [15:01,  3.17it/s]

Training Loss per 500 steps: 1.2773628322254296
Training Accuracy per 500 steps: 62.239253582139284


3501it [17:31,  3.26it/s]

Training Loss per 500 steps: 1.2780004236671592
Training Accuracy per 500 steps: 62.19365895458441


4001it [20:01,  3.30it/s]

Training Loss per 500 steps: 1.2767492117538537
Training Accuracy per 500 steps: 62.30192451887028


4501it [22:30,  3.30it/s]

Training Loss per 500 steps: 1.2763685491370562
Training Accuracy per 500 steps: 62.34836702954899


5001it [25:02,  3.30it/s]

Training Loss per 500 steps: 1.2763624122275803
Training Accuracy per 500 steps: 62.31353729254149


5501it [27:31,  3.53it/s]

Training Loss per 500 steps: 1.275782074100038
Training Accuracy per 500 steps: 62.35411743319396


6001it [30:00,  3.32it/s]

Training Loss per 500 steps: 1.2755733370741216
Training Accuracy per 500 steps: 62.37627062156307


6501it [32:31,  3.51it/s]

Training Loss per 500 steps: 1.275855200975753
Training Accuracy per 500 steps: 62.339640055376094


7001it [35:01,  3.37it/s]

Training Loss per 500 steps: 1.275908750528064
Training Accuracy per 500 steps: 62.30967004713612


7501it [37:30,  3.59it/s]

Training Loss per 500 steps: 1.2752732423413136
Training Accuracy per 500 steps: 62.38901479802693


8001it [40:00,  3.39it/s]

Training Loss per 500 steps: 1.2746468851334183
Training Accuracy per 500 steps: 62.45844269466317


8501it [42:31,  3.32it/s]

Training Loss per 500 steps: 1.274301224674229
Training Accuracy per 500 steps: 62.505587577932005


9001it [44:59,  3.25it/s]

Training Loss per 500 steps: 1.2735860075451058
Training Accuracy per 500 steps: 62.597489167870236


9466it [47:20,  2.68it/s]

In [None]:
#!cp best_model.pt '/content/gdrive/MyDrive/ColabNotebooks/NLP Project/distilBERT/'

In [None]:
model.load_state_dict(torch.load("best_model.pt"))
model.to(device)

  model.load_state_dict(torch.load("best_model.pt"))


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Test function
def test_model(model, data_loader, device):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Testing"):
            # Move batch to GPU/CPU
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Forward pass
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # Collect predictions and true labels
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # Calculate metrics
    accuracy = accuracy_score(all_labels, all_preds)
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="weighted")

    print("\nTest Results")
    print("-" * 30)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-score: {f1:.4f}")

    return accuracy, precision, recall, f1

# After training and validation, evaluate on the test set
print("\nEvaluating on Test Set")
test_accuracy, test_precision, test_recall, test_f1 = test_model(model, test_loader, device)


Evaluating on Test Set


Testing: 100%|██████████| 1796/1796 [04:12<00:00,  7.12it/s]



Test Results
------------------------------
Accuracy: 0.5100
Precision: 0.5181
Recall: 0.5100
F1-score: 0.4948


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Plot function for metrics
def plot_metrics(metrics, metric_names, title):
    fig, ax = plt.subplots(figsize=(8, 6))
    bars = ax.bar(metric_names, metrics, color=['skyblue', 'orange', 'green', 'red'])

    # Add value annotations on bars
    for bar in bars:
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.02,
                f"{bar.get_height():.4f}", ha='center', fontsize=10)

    ax.set_ylim(0, 1)
    ax.set_title(title, fontsize=16)
    ax.set_ylabel("Score", fontsize=14)
    ax.set_xlabel("Metrics", fontsize=14)
    plt.xticks(fontsize=12)
    plt.yticks(fontsize=12)
    plt.show()

# After testing, plot the metrics
print("\nEvaluating on Test Set")
test_accuracy, test_precision, test_recall, test_f1 = test_model(model, test_loader, device)

# Metrics and their names
metrics = [test_accuracy, test_precision, test_recall, test_f1]
metric_names = ["Accuracy", "Precision", "Recall", "F1-Score"]

# Plot the test results
plot_metrics(metrics, metric_names, title="Test Metrics Overview")



Evaluating on Test Set


NameError: name 'test_model' is not defined

### Other form of training

In [None]:
'''
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions
                                       , references=labels)}
'''

In [None]:
'''
lr = 1e-3
batch_size = 10
num_epochs = 10

training_args = TrainingArguments(
    output_dir=""+model_checkpoint+"lora-txt",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)
'''

In [None]:
#trainer.train()

### Load pretrained model

In [None]:
"""
# Load model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("trained_model_gral_imbd.pth", map_location=device)

text_list = ['''President-elect Trump announced on Tuesday night that he intends to appoint Linda McMahon, former CEO of World Wrestling Entertainment (WWE), to lead the Department of Education. His announcement, which was posted on Truth Social, came hours after two sources told Fox News that McMahon was likely to be picked. "It is my great honor to announce that Linda McMahon, former Administrator of the Small Business Administration, will be the United States Secretary of Education," Trump's statement read.
"As Secretary of Education, Linda will fight tirelessly to expand Choice to every State in America, and empower parents to make the best Education decisions for their families," the press release added. "Linda served for two years on the Connecticut Board of Education, where she was one of fifteen members overseeing all Public Education in the State, including its Technical High School system."''',
             '''Donald Trump believes presidents have almost absolute power. In his second term, there will be few political or legal restraints to check him. The president-elects sweeping victory over Vice President Kamala Harris suddenly turned the theoretical notion that he will indulge his autocratic instincts into a genuine possibility.When Trump returns to the White House in January as one of the most powerful presidents in history, hell be able to take advantage of his own filleting of guardrails during his first presidency, which he continued through legal maneuverings out of office.''',
             '''Nearly 100 Democrats, including Salud Carbajal, requested the Ethics Committee release its report on former Congressman Matt Gaetz's misconduct allegations. The letter, led by Rep. Sean Casten, emphasized that the Senate needs information for Gaetz's attorney general nomination. House Speaker Mike Johnson opposed releasing the report, stating Gaetz is now a "private citizen" and outside the panel's jurisdiction.'''
             , ''' A South Dakota judge dismissed a lawsuit from the anti-abortion group Life Defense targeting an abortion rights measure that voters later rejected.
Judge John Pekas dismissed the lawsuit at the request of Life Defense, which had challenged the ballot measure's petitions.
Voters in nine states, including South Dakota, rejected abortion rights measures during the November election. '''
             ]
model.to('cuda')
print('Trained model predictions')
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')

  logits = model(inputs).logits
  predictions = torch.max(logits,1).indices

  #print(f'{text} - {id2label[predictions.tolist()[0]]}')
  print(f'{id2label[predictions.tolist()[0]]}')
"""

In [None]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

# List of classes
classes = ["LEFT", "RIGHT", "UNDECIDED", "CENTER"]

# Initialize metrics dictionary
metrics_dict = {}

for label in classes:
    # Calculate TP, FP, FN for the current label
    df_metrics = df.withColumn("TP", F.when((F.col("pred_class") == label) & (F.col("political_leaning") == label), 1).otherwise(0)) \
                   .withColumn("FP", F.when((F.col("pred_class") == label) & (F.col("political_leaning") != label), 1).otherwise(0)) \
                   .withColumn("FN", F.when((F.col("pred_class") != label) & (F.col("political_leaning") == label), 1).otherwise(0)) \
                   .withColumn("TN", F.when((F.col("pred_class") != label) & (F.col("political_leaning") != label), 1).otherwise(0))

    # Aggregate metrics
    aggregated = df_metrics.agg(
        F.sum("TP").alias("TP"),
        F.sum("FP").alias("FP"),
        F.sum("FN").alias("FN"),
        F.sum("TN").alias("TN")
    ).collect()[0]

    TP, FP, FN, TN = aggregated["TP"], aggregated["FP"], aggregated["FN"], aggregated["TN"]

    # Calculate precision, recall, and F1-score
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    # Store results for the current label
    metrics_dict[label] = {
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1_score
    }

# Calculate overall accuracy
correct_predictions = df.filter(F.col("pred_class") == F.col("political_leaning")).count()
total_predictions = df.count()
accuracy = correct_predictions / total_predictions

# Display results
print(f"For Dataset 2017_1 Overall Accuracy: {accuracy:.2f}")
for label, metrics in metrics_dict.items():
    print(f"\nMetrics for {label}:")
    print(f"  Precision: {metrics['Precision']:.2f}")
    print(f"  Recall: {metrics['Recall']:.2f}")
    print(f"  F1 Score: {metrics['F1 Score']:.2f}")