# BERT with LoRA for analysis on political leaning of news; SUMMARY 

In [1]:
!pip install transformers datasets peft evaluate



In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
model_checkpoint = "distilbert-base-uncased"

# Define label maps
id2label = {0: "UNDEFINED", 1: "LEFT", 2: "RIGHT", 3: "CENTER"}
label2id = {"UNDEFINED": 0, "LEFT": 1, "RIGHT": 2, "CENTER": 3}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=4, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

df = load_dataset("csv", data_files="/Users/ilseoplee/NLPizza_final_project/Filing/LORA_Summary/training_data_body_summary_19K.csv")
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'summary'],
        num_rows: 19328
    })
})

In [5]:
# train_testvalid =
df = df["train"].train_test_split(test_size=0.1)

In [6]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix=True)

In [7]:
def tokenize_function(examples):
    text = examples["summary"]
    labels = examples["political_leaning"]

    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text, return_tensors="np", padding=True, truncation=True, max_length=512
    )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs

In [8]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({"pad_token": "[PAD]"})
    model.resize_token_embeddings(len(tokenizer))

In [9]:
tokenized_dataset = df.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/17395 [00:00<?, ? examples/s]

Map:   0%|          | 0/1933 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17395
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1933
    })
})

In [10]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": accuracy.compute(predictions=predictions, references=labels)}

In [12]:
text_list = [
    "It was good.",
    "Not a fan, don't recommended",
    "Better than the first one.",
    "Women have the right to choose and abortion should be allowed.",
]

import torch

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

print("Untrained model")
for text in text_list:
    inputs = tokenizer(text, return_tensors="pt").to(
        device
    )  # Move inputs to the correct device
    logits = model(**inputs).logits  # Forward pass
    predictions = torch.argmax(logits, dim=-1)
    print(f"{text} - {id2label[predictions.item()]}")

# print("Untrained model")
# for text in text_list:
#   inputs = tokenizer.encode(text, return_tensors="pt")
#   logits = model(inputs).logits
#   predictions = torch.argmax(logits)
#   print(f'{text} - {id2label[predictions.tolist()]}')

Untrained model
It was good. - UNDEFINED
Not a fan, don't recommended - UNDEFINED
Better than the first one. - CENTER
Women have the right to choose and abortion should be allowed. - UNDEFINED


In [23]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", r=12, lora_alpha=32, lora_dropout=0.01, target_modules=["q_lin"]
)

In [26]:
lr = 1e-3
batch_size = 10
num_epochs = 5

training_args = TrainingArguments(
    output_dir="" + model_checkpoint + "lora-txt",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [28]:
def compute_metrics(eval_pred):  #Training
    """
    Computes accuracy, precision, recall, and F1 score.
    eval_pred: A tuple of (predictions, labels) provided by the Trainer.
    """
    predictions, labels = eval_pred
    # Convert predictions to the predicted class indices (argmax for softmax outputs)
    predictions = predictions.argmax(axis=-1)

    # Compute metrics
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')  # Weighted for class imbalance
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [29]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [30]:
trainer.train()

  0%|          | 0/8700 [00:00<?, ?it/s]

{'loss': 1.1493, 'grad_norm': 2.0976951122283936, 'learning_rate': 0.0009425287356321838, 'epoch': 0.29}
{'loss': 1.0608, 'grad_norm': 2.5578582286834717, 'learning_rate': 0.0008850574712643679, 'epoch': 0.57}
{'loss': 1.0549, 'grad_norm': 1.7384960651397705, 'learning_rate': 0.0008275862068965517, 'epoch': 0.86}


  0%|          | 0/194 [00:00<?, ?it/s]

{'eval_loss': 1.0149575471878052, 'eval_accuracy': 0.5576823590274185, 'eval_precision': 0.5576650132188341, 'eval_recall': 0.5576823590274185, 'eval_f1': 0.5432822625324656, 'eval_runtime': 11.4717, 'eval_samples_per_second': 168.501, 'eval_steps_per_second': 16.911, 'epoch': 1.0}
{'loss': 1.0053, 'grad_norm': 2.834160327911377, 'learning_rate': 0.0007701149425287356, 'epoch': 1.15}
{'loss': 0.989, 'grad_norm': 2.729121446609497, 'learning_rate': 0.0007126436781609196, 'epoch': 1.44}
{'loss': 0.981, 'grad_norm': 2.7624571323394775, 'learning_rate': 0.0006551724137931034, 'epoch': 1.72}


  0%|          | 0/194 [00:00<?, ?it/s]

{'eval_loss': 0.9866294860839844, 'eval_accuracy': 0.5804449042938438, 'eval_precision': 0.5765379576458176, 'eval_recall': 0.5804449042938438, 'eval_f1': 0.574008690898699, 'eval_runtime': 11.035, 'eval_samples_per_second': 175.169, 'eval_steps_per_second': 17.58, 'epoch': 2.0}
{'loss': 0.9656, 'grad_norm': 2.1639926433563232, 'learning_rate': 0.0005977011494252874, 'epoch': 2.01}
{'loss': 0.913, 'grad_norm': 2.016406536102295, 'learning_rate': 0.0005402298850574712, 'epoch': 2.3}
{'loss': 0.9056, 'grad_norm': 2.2516777515411377, 'learning_rate': 0.0004827586206896552, 'epoch': 2.59}
{'loss': 0.9015, 'grad_norm': 2.457991361618042, 'learning_rate': 0.00042528735632183906, 'epoch': 2.87}


  0%|          | 0/194 [00:00<?, ?it/s]

{'eval_loss': 0.9759300947189331, 'eval_accuracy': 0.5840662183135024, 'eval_precision': 0.5944183217431899, 'eval_recall': 0.5840662183135024, 'eval_f1': 0.5783113770444677, 'eval_runtime': 10.7888, 'eval_samples_per_second': 179.167, 'eval_steps_per_second': 17.982, 'epoch': 3.0}
{'loss': 0.8622, 'grad_norm': 4.048056125640869, 'learning_rate': 0.000367816091954023, 'epoch': 3.16}
{'loss': 0.8299, 'grad_norm': 2.778053045272827, 'learning_rate': 0.0003103448275862069, 'epoch': 3.45}
{'loss': 0.8419, 'grad_norm': 3.2340686321258545, 'learning_rate': 0.00025287356321839085, 'epoch': 3.74}


  0%|          | 0/194 [00:00<?, ?it/s]

{'eval_loss': 0.9835162162780762, 'eval_accuracy': 0.5959648215209519, 'eval_precision': 0.5919171076115526, 'eval_recall': 0.5959648215209519, 'eval_f1': 0.5889405909288957, 'eval_runtime': 11.6488, 'eval_samples_per_second': 165.94, 'eval_steps_per_second': 16.654, 'epoch': 4.0}
{'loss': 0.8267, 'grad_norm': 3.102609872817993, 'learning_rate': 0.00019540229885057472, 'epoch': 4.02}
{'loss': 0.7574, 'grad_norm': 2.841707706451416, 'learning_rate': 0.00013793103448275863, 'epoch': 4.31}
{'loss': 0.7639, 'grad_norm': 3.844043493270874, 'learning_rate': 8.045977011494253e-05, 'epoch': 4.6}
{'loss': 0.7647, 'grad_norm': 2.497410535812378, 'learning_rate': 2.2988505747126437e-05, 'epoch': 4.89}


  0%|          | 0/194 [00:00<?, ?it/s]

{'eval_loss': 1.0004345178604126, 'eval_accuracy': 0.5923435075012933, 'eval_precision': 0.5883787820307593, 'eval_recall': 0.5923435075012933, 'eval_f1': 0.587215559250738, 'eval_runtime': 10.9506, 'eval_samples_per_second': 176.52, 'eval_steps_per_second': 17.716, 'epoch': 5.0}
{'train_runtime': 1117.7161, 'train_samples_per_second': 77.815, 'train_steps_per_second': 7.784, 'train_loss': 0.91250649813948, 'epoch': 5.0}


TrainOutput(global_step=8700, training_loss=0.91250649813948, metrics={'train_runtime': 1117.7161, 'train_samples_per_second': 77.815, 'train_steps_per_second': 7.784, 'total_flos': 2439377683482480.0, 'train_loss': 0.91250649813948, 'epoch': 5.0})

<!--  -->

In [21]:
import torch

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

model = model.to(device)
print("Trained model predictions")

for text in text_list:
    inputs = tokenizer.encode(text, return_tensors="pt").to(device)

    logits = model(inputs).logits

    predictions = torch.argmax(logits, dim=-1)

    print(f"{text} - {id2label[predictions.item()]}")


# INITIAL CODE
# model.to('cuda')
# print('Trained model predictions')
# for text in text_list:
#   inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')

#   logits = model(inputs).logits
#   predictions = torch.max(logits,1).indices

#   print(f'{text} - {id2label[predictions.tolist()[0]]}')

Using device: mps
Trained model predictions
It was good. - LEFT
Not a fan, don't recommended - LEFT
Better than the first one. - LEFT
Women have the right to choose and abortion should be allowed. - LEFT


In [22]:
output_model_file = "pytorch_distilbert_imbd.bin"
output_vocab_file = "vocab_distilbert_imbd.bin"

# Save model
model_to_save = model
torch.save(model_to_save, output_model_file)

# Save tokenizer vocabulary in the current directory
tokenizer.save_vocabulary(".")  # Current directory

# Save model state dictionary
torch.save(model.state_dict(), "LORA_distilBERT_SUMMARY_2017_2_T5_Base.pth")

print("All files saved")

All files saved
