# Alejandro Paredes: Parameter-efficient fine-tuning (LoRA) of DistilBERT for political-leaning classification

In [1]:
# %pip install -q transformers datasets peft evaluate

In [11]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [12]:
model_checkpoint = "distilbert-base-uncased"

# Label maps. id2label is the single source of truth; label2id is its inverse
# and the number of classes is taken from its length.
id2label = {0: "UNDEFINED", 1: "LEFT", 2: "RIGHT", 3: "CENTER"}
label2id = {name: idx for idx, name in id2label.items()}

# Sequence-classification model built from the pretrained checkpoint with one
# output per label.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

# Read the articles CSV through the generic "csv" builder; the result is a
# DatasetDict holding a single "train" split.
df = load_dataset(path="csv", data_files="./2017_1.csv")
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 146718
    })
})

In [175]:
# Hold out 10% of the articles for evaluation. The fixed seed makes the split
# reproducible across kernel restarts, so evaluation numbers are comparable
# between runs.
# NOTE: this cell rebinds `df`; re-running it without reloading the CSV would
# split the already-reduced train split again.
df = df['train'].train_test_split(test_size=0.1, seed=42)

In [4]:
# Tokenizer matching the model checkpoint. The former `add_prefix=True`
# argument was dropped: it is not a recognised tokenizer option.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
def tokenize_function(examples):
    """Tokenize a batch of articles and attach integer class labels.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a ``"body"`` column (article texts) and a
            ``"political_leaning"`` column (label names that are keys of the
            module-level ``label2id``).

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry
        holding one integer id per example.

    Raises:
        KeyError: If a label name is not present in ``label2id``.
    """
    text = examples["body"]
    labels = examples["political_leaning"]

    # When an article exceeds max_length, drop tokens from the start and keep
    # the end of the text.
    tokenizer.truncation_side = "left"

    # No padding and no tensor conversion here: sequences are stored at their
    # own length and padded per training batch by the data collator, instead
    # of being padded to the longest example of each map batch.
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
    )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs

In [6]:
# Ensure the tokenizer can pad. When it has no pad token, register "[PAD]"
# and grow the model's embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [211]:
# Apply the tokenizer to every split, processing examples in batches.
tokenized_dataset = df.map(function=tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/132046 [00:00<?, ? examples/s]

Map:   0%|          | 0/14672 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14672
    })
})

In [212]:
# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer)

In [213]:
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        p: Pair ``(predictions, labels)`` where ``predictions`` holds the
            per-class scores (classes along axis 1) and ``labels`` the
            reference class ids.

    Returns:
        The dict produced by ``accuracy.compute``, i.e. ``{"accuracy": value}``.
    """
    logits, labels = p
    predicted_ids = np.argmax(logits, axis=1)
    # accuracy.compute already returns {"accuracy": value}; wrapping it in
    # another {"accuracy": ...} produced a nested dict instead of a scalar
    # metric.
    return accuracy.compute(predictions=predicted_ids, references=labels)

In [9]:
# Sanity check: predictions of the freshly initialised (untrained) classifier
# on a few hand-written statements.
text_list = [
    "Abortions gay weddings, free healthcare, public intervention",
    "Privatization, free market, deregulation, tax cuts",
    "Women have the right to choose and abortion should be allowed.",
]

import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
model.eval()  # inference mode: disables dropout

print("Untrained model")
# no_grad: this is inference only, so skip building the autograd graph.
with torch.no_grad():
    for text in text_list:
        # Truncate so an over-long input cannot exceed the model's 512-token limit;
        # move the encoded inputs to the same device as the model.
        inputs = tokenizer(
            text, return_tensors="pt", truncation=True, max_length=512
        ).to(device)
        logits = model(**inputs).logits  # Forward pass
        predictions = torch.argmax(logits, dim=-1)
        print(f'{text} - {id2label[predictions.item()]}')

Untrained model
Abortions gay weddings, free healthcare, public intervention - UNDEFINED
Privatization, free market, deregulation, tax cuts - CENTER
Women have the right to choose and abortion should be allowed. - UNDEFINED


In [215]:
# LoRA settings for sequence classification: rank-4 adapters (alpha 32,
# dropout 0.01) injected into the modules named "q_lin".
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
    target_modules=["q_lin"],
)

In [221]:
# Wrap the base model with the LoRA adapters and report how many parameters
# remain trainable.
model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329


In [223]:
# Hyperparameters
lr = 1e-3
batch_size = 10
num_epochs = 5

# Evaluate and checkpoint once per epoch; the best checkpoint is restored when
# training finishes.
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}lora-txt",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [225]:
# Assemble the Trainer: LoRA-wrapped model, train/test splits, per-batch
# padding collator and the accuracy metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning with the arguments configured on `trainer`.
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
# Load model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): this checkpoint is read but never applied to `model` (there is
# no load_state_dict call), so the predictions below come from whatever weights
# `model` currently holds. Confirm whether it should be loaded and that its keys
# match the LoRA-wrapped model. torch.load unpickles the file, so only use it on
# trusted checkpoints.
state_dict = torch.load("trained_model_gral_imbd.pth", map_location=device)

text_list = ['''President-elect Trump announced on Tuesday night that he intends to appoint Linda McMahon, former CEO of World Wrestling Entertainment (WWE), to lead the Department of Education. His announcement, which was posted on Truth Social, came hours after two sources told Fox News that McMahon was likely to be picked. "It is my great honor to announce that Linda McMahon, former Administrator of the Small Business Administration, will be the United States Secretary of Education," Trump's statement read.
"As Secretary of Education, Linda will fight tirelessly to expand Choice to every State in America, and empower parents to make the best Education decisions for their families," the press release added. "Linda served for two years on the Connecticut Board of Education, where she was one of fifteen members overseeing all Public Education in the State, including its Technical High School system."''', 
             '''Donald Trump believes presidents have almost absolute power. In his second term, there will be few political or legal restraints to check him. The president-elects sweeping victory over Vice President Kamala Harris suddenly turned the theoretical notion that he will indulge his autocratic instincts into a genuine possibility.When Trump returns to the White House in January as one of the most powerful presidents in history, hell be able to take advantage of his own filleting of guardrails during his first presidency, which he continued through legal maneuverings out of office.''',
             '''Nearly 100 Democrats, including Salud Carbajal, requested the Ethics Committee release its report on former Congressman Matt Gaetz's misconduct allegations. The letter, led by Rep. Sean Casten, emphasized that the Senate needs information for Gaetz's attorney general nomination. House Speaker Mike Johnson opposed releasing the report, stating Gaetz is now a "private citizen" and outside the panel's jurisdiction.'''
             , ''' A South Dakota judge dismissed a lawsuit from the anti-abortion group Life Defense targeting an abortion rights measure that voters later rejected.
Judge John Pekas dismissed the lawsuit at the request of Life Defense, which had challenged the ballot measure's petitions.
Voters in nine states, including South Dakota, rejected abortion rights measures during the November election. '''
             ]

# Use the detected device rather than a hard-coded 'cuda' so the cell also
# runs on CPU-only machines.
model.to(device)
model.eval()  # disable dropout so predictions are deterministic

print('Trained model predictions')
# Inference only: no gradients needed.
with torch.no_grad():
    for text in text_list:
        # Truncate to the model's 512-token limit; long articles would
        # otherwise exceed it.
        inputs = tokenizer.encode(
            text, return_tensors='pt', truncation=True, max_length=512
        ).to(device)

        logits = model(inputs).logits
        predictions = torch.max(logits, 1).indices

        print(f'{id2label[predictions.tolist()[0]]}')


Trained model predictions
RIGHT
RIGHT
RIGHT
RIGHT
