# DistilBERT with LoRA for analysis of the political leaning of news: HEADLINE

In [1]:
!pip install transformers datasets peft evaluate



In [2]:
from datasets import load_dataset, DatasetDict, Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [3]:
# Base checkpoint that gets a fresh 4-way classification head.
model_checkpoint = "distilbert-base-uncased"

# Label maps: class index -> name, and the inverse derived from it so the two
# can never drift apart.
id2label = {0: "UNDEFINED", 1: "LEFT", 2: "RIGHT", 3: "CENTER"}
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

# Load the raw CSV with the `datasets` CSV builder. As the cell output shows,
# the result is a DatasetDict with a single "train" split.
# NOTE(review): despite its name, `df` is a DatasetDict, not a pandas DataFrame.
df = load_dataset("csv", data_files="2017_1.csv")
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 146718
    })
})

In [5]:
# Hold out 10% of the rows as the evaluation split. The seed is fixed so the
# same rows land in train/test on every run and metrics are comparable.
# NOTE: this rebinds `df`; re-running the cell splits the already-reduced
# train split again, so restart from the load cell when re-executing.
df = df["train"].train_test_split(test_size=0.1, seed=42)

In [6]:
# Tokenizer matching the model checkpoint. The former `add_prefix=True`
# argument was dropped: it is not a recognised tokenizer option (the similarly
# named `add_prefix_space` only applies to byte-level BPE tokenizers).
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
def tokenize_function(examples):
    """Tokenize a batch of headlines and attach integer class labels.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping that provides a "headline" list (texts) and a
            "political_leaning" list (label names that are keys of
            ``label2id``).

    Returns:
        The tokenizer output for the headlines with an extra "labels" entry
        holding one integer class id per example.

    Raises:
        KeyError: If a "political_leaning" value is not a key of ``label2id``.
    """
    text = examples["headline"]
    labels = examples["political_leaning"]

    # When a text exceeds max_length, drop tokens from the start, not the end.
    tokenizer.truncation_side = "left"

    # No padding and no tensor conversion here: padding every example to the
    # longest one in the map batch only inflates the stored sequences, and the
    # data collator pads each training batch dynamically anyway.
    tokenized_inputs = tokenizer(text, truncation=True, max_length=512)

    tokenized_inputs["labels"] = [label2id[label] for label in labels]
    return tokenized_inputs

In [8]:
# Safety net for tokenizers that ship without a padding token: register one
# and grow the embedding matrix so the new token id has a row.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens(dict(pad_token="[PAD]"))
    new_vocab_size = len(tokenizer)
    model.resize_token_embeddings(new_vocab_size)

In [9]:
# Apply tokenize_function batch-wise to both splits. As the output shows, the
# original columns are kept and 'input_ids', 'attention_mask' and 'labels'
# are added next to them.
tokenized_dataset = df.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/132046 [00:00<?, ? examples/s]

Map:   0%|          | 0/14672 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14672
    })
})

In [10]:
# Collator handed to the Trainer; it uses the tokenizer's padding settings to
# pad the examples of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [11]:
# Accuracy metric from the `evaluate` hub.
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Compute accuracy from a ``(predictions, labels)`` pair.

    NOTE: a later cell defines another ``compute_metrics`` which shadows this
    one, so the Trainer uses that later definition.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` holds per-class
            scores with the class axis at position 1.

    Returns:
        dict: ``{"accuracy": <float>}``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # accuracy.compute() already returns {"accuracy": value}; wrapping it in
    # another dict produced the nested {"accuracy": {"accuracy": value}}.
    return accuracy.compute(predictions=predictions, references=labels)

In [12]:
# Smoke-test sentences used to eyeball predictions before and after training.
text_list = [
    "It was good.",
    "Not a fan, don't recommended",
    "Better than the first one.",
    "Women have the right to choose and abortion should be allowed.",
]

# Set device: prefer CUDA, then Apple-silicon MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

# Inference only: make sure dropout is off and skip gradient bookkeeping.
model.eval()

print("Untrained model")
with torch.no_grad():
    for text in text_list:
        # Move the encoded inputs to the same device as the model.
        inputs = tokenizer(text, return_tensors="pt").to(device)
        logits = model(**inputs).logits  # Forward pass
        predictions = torch.argmax(logits, dim=-1)
        print(f"{text} - {id2label[predictions.item()]}")

Untrained model
It was good. - CENTER
Not a fan, don't recommended - CENTER
Better than the first one. - CENTER
Women have the right to choose and abortion should be allowed. - CENTER


In [13]:
# LoRA adapter configuration for sequence classification.
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    r=4,  # rank of the low-rank update matrices
    lora_alpha=32,  # LoRA scaling numerator
    lora_dropout=0.01,  # dropout applied on the LoRA path
    target_modules=["q_lin"],  # modules whose name matches get adapters
)

In [14]:
# Wrap the base model according to peft_config and report how many parameters
# remain trainable.
# NOTE: this rebinds `model`, so the cell is not idempotent; re-running it
# wraps the already-wrapped model again.
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329


In [15]:
# Hyperparameters
lr = 1e-3
batch_size = 10
num_epochs = 5

training_args = TrainingArguments(
    # Same path as before ("<checkpoint>lora-txt").
    # NOTE(review): there is no separator between the checkpoint name and the
    # suffix; confirm whether "<checkpoint>-lora-txt" was intended.
    output_dir=model_checkpoint + "lora-txt",
    learning_rate=lr,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    # `evaluation_strategy` is the deprecated spelling of this argument;
    # `eval_strategy` is the current name.
    eval_strategy="epoch",
    # Must match the evaluation schedule for load_best_model_at_end to work.
    save_strategy="epoch",
    load_best_model_at_end=True,
)



In [16]:
def compute_metrics(eval_pred):  #Training
    """
    Computes accuracy, precision, recall, and F1 score.

    Args:
        eval_pred: A tuple of (predictions, labels) provided by the Trainer.
            ``predictions`` holds per-class logits with the class axis last
            (or a tuple whose first element does).

    Returns:
        dict with keys 'accuracy', 'precision', 'recall' and 'f1'. Precision,
        recall and F1 are support-weighted averages over the classes; note
        that weighted recall is numerically equal to accuracy.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of arrays; the logits come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Convert logits to predicted class indices.
    predicted_ids = predictions.argmax(axis=-1)

    # zero_division=0 keeps the usual 0.0 score for classes that are never
    # predicted (or absent) but avoids an undefined-metric warning on every
    # evaluation pass.
    return {
        'accuracy': accuracy_score(labels, predicted_ids),
        'precision': precision_score(
            labels, predicted_ids, average='weighted', zero_division=0
        ),
        'recall': recall_score(
            labels, predicted_ids, average='weighted', zero_division=0
        ),
        'f1': f1_score(
            labels, predicted_ids, average='weighted', zero_division=0
        ),
    }


In [17]:
# Assemble the Trainer from the LoRA-wrapped model, the tokenized splits and
# the metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    # `tokenizer=` is deprecated for Trainer.__init__; `processing_class=` is
    # its replacement and avoids the deprecation warning this cell emitted.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [18]:
# Run fine-tuning. With the TrainingArguments above, the model is evaluated
# and checkpointed once per epoch; the log below shows five epochs.
trainer.train()

  0%|          | 0/66025 [00:00<?, ?it/s]

{'loss': 1.1826, 'grad_norm': 2.1559102535247803, 'learning_rate': 0.0009924271109428247, 'epoch': 0.04}
{'loss': 1.1276, 'grad_norm': 2.546229600906372, 'learning_rate': 0.0009848542218856495, 'epoch': 0.08}
{'loss': 1.1081, 'grad_norm': 2.4174036979675293, 'learning_rate': 0.0009772813328284742, 'epoch': 0.11}
{'loss': 1.0726, 'grad_norm': 3.5102779865264893, 'learning_rate': 0.0009697084437712988, 'epoch': 0.15}
{'loss': 1.0772, 'grad_norm': 2.8828771114349365, 'learning_rate': 0.0009621355547141235, 'epoch': 0.19}
{'loss': 1.0928, 'grad_norm': 2.792771100997925, 'learning_rate': 0.0009545626656569481, 'epoch': 0.23}
{'loss': 1.07, 'grad_norm': 4.14138650894165, 'learning_rate': 0.0009469897765997728, 'epoch': 0.27}
{'loss': 1.0634, 'grad_norm': 2.927995443344116, 'learning_rate': 0.0009394168875425976, 'epoch': 0.3}
{'loss': 1.0389, 'grad_norm': 5.25220251083374, 'learning_rate': 0.0009318439984854222, 'epoch': 0.34}
{'loss': 1.0532, 'grad_norm': 4.338897228240967, 'learning_rate':

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 1.0168834924697876, 'eval_accuracy': 0.5697928026172301, 'eval_precision': 0.5741508299340383, 'eval_recall': 0.5697928026172301, 'eval_f1': 0.5530911712858528, 'eval_runtime': 31.4472, 'eval_samples_per_second': 466.56, 'eval_steps_per_second': 46.681, 'epoch': 1.0}
{'loss': 1.0201, 'grad_norm': 6.383045196533203, 'learning_rate': 0.0007955319954562665, 'epoch': 1.02}
{'loss': 1.0194, 'grad_norm': 6.24865198135376, 'learning_rate': 0.0007879591063990913, 'epoch': 1.06}
{'loss': 1.0264, 'grad_norm': 3.3156142234802246, 'learning_rate': 0.0007803862173419159, 'epoch': 1.1}
{'loss': 1.0159, 'grad_norm': 5.132165908813477, 'learning_rate': 0.0007728133282847406, 'epoch': 1.14}
{'loss': 1.032, 'grad_norm': 6.944118022918701, 'learning_rate': 0.0007652404392275654, 'epoch': 1.17}
{'loss': 1.0383, 'grad_norm': 6.1990509033203125, 'learning_rate': 0.00075766755017039, 'epoch': 1.21}
{'loss': 1.0392, 'grad_norm': 4.666685581207275, 'learning_rate': 0.0007500946611132147, 'epoch':

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.9951158761978149, 'eval_accuracy': 0.5804934569247546, 'eval_precision': 0.5827199270338542, 'eval_recall': 0.5804934569247546, 'eval_f1': 0.572097262717907, 'eval_runtime': 31.0295, 'eval_samples_per_second': 472.841, 'eval_steps_per_second': 47.31, 'epoch': 2.0}
{'loss': 1.0114, 'grad_norm': 8.391899108886719, 'learning_rate': 0.0005986368799697085, 'epoch': 2.01}
{'loss': 1.0016, 'grad_norm': 4.278904914855957, 'learning_rate': 0.0005910639909125332, 'epoch': 2.04}
{'loss': 0.9964, 'grad_norm': 6.008801460266113, 'learning_rate': 0.0005834911018553578, 'epoch': 2.08}
{'loss': 0.9869, 'grad_norm': 7.183769226074219, 'learning_rate': 0.0005759182127981825, 'epoch': 2.12}
{'loss': 0.978, 'grad_norm': 7.1634907722473145, 'learning_rate': 0.0005683453237410072, 'epoch': 2.16}
{'loss': 0.9592, 'grad_norm': 4.88427734375, 'learning_rate': 0.0005607724346838319, 'epoch': 2.2}
{'loss': 1.0186, 'grad_norm': 5.267115592956543, 'learning_rate': 0.0005531995456266566, 'epoch': 2.

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.932221531867981, 'eval_accuracy': 0.6040076335877863, 'eval_precision': 0.6039375909284557, 'eval_recall': 0.6040076335877863, 'eval_f1': 0.5994331072176401, 'eval_runtime': 30.499, 'eval_samples_per_second': 481.064, 'eval_steps_per_second': 48.133, 'epoch': 3.0}
{'loss': 0.9686, 'grad_norm': 4.214250087738037, 'learning_rate': 0.000394168875425975, 'epoch': 3.03}
{'loss': 0.9536, 'grad_norm': 5.479396343231201, 'learning_rate': 0.00038659598636879973, 'epoch': 3.07}
{'loss': 0.94, 'grad_norm': 3.582141399383545, 'learning_rate': 0.0003790230973116244, 'epoch': 3.1}
{'loss': 0.9495, 'grad_norm': 7.096805572509766, 'learning_rate': 0.00037145020825444906, 'epoch': 3.14}
{'loss': 0.9477, 'grad_norm': 5.46033239364624, 'learning_rate': 0.0003638773191972738, 'epoch': 3.18}
{'loss': 0.9364, 'grad_norm': 3.569082021713257, 'learning_rate': 0.00035630443014009845, 'epoch': 3.22}
{'loss': 0.9522, 'grad_norm': 4.834888935089111, 'learning_rate': 0.00034873154108292317, 'epoch'

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.9141486287117004, 'eval_accuracy': 0.6138222464558343, 'eval_precision': 0.6140510228721306, 'eval_recall': 0.6138222464558343, 'eval_f1': 0.6060873913924246, 'eval_runtime': 29.7338, 'eval_samples_per_second': 493.446, 'eval_steps_per_second': 49.372, 'epoch': 4.0}
{'loss': 0.9061, 'grad_norm': 4.245834827423096, 'learning_rate': 0.0001972737599394169, 'epoch': 4.01}
{'loss': 0.9036, 'grad_norm': 5.381833076477051, 'learning_rate': 0.0001897008708822416, 'epoch': 4.05}
{'loss': 0.8732, 'grad_norm': 3.8634862899780273, 'learning_rate': 0.00018212798182506626, 'epoch': 4.09}
{'loss': 0.8932, 'grad_norm': 4.223770618438721, 'learning_rate': 0.00017455509276789096, 'epoch': 4.13}
{'loss': 0.9099, 'grad_norm': 2.003406286239624, 'learning_rate': 0.00016698220371071565, 'epoch': 4.17}
{'loss': 0.9164, 'grad_norm': 3.672020673751831, 'learning_rate': 0.00015940931465354034, 'epoch': 4.2}
{'loss': 0.8881, 'grad_norm': 7.169710159301758, 'learning_rate': 0.000151836425596365, '

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.8931793570518494, 'eval_accuracy': 0.6250681570338059, 'eval_precision': 0.6236362289471225, 'eval_recall': 0.6250681570338059, 'eval_f1': 0.6187832168640847, 'eval_runtime': 30.368, 'eval_samples_per_second': 483.14, 'eval_steps_per_second': 48.34, 'epoch': 5.0}
{'train_runtime': 2374.8099, 'train_samples_per_second': 278.014, 'train_steps_per_second': 27.802, 'train_loss': 0.9798310379438534, 'epoch': 5.0}


TrainOutput(global_step=66025, training_loss=0.9798310379438534, metrics={'train_runtime': 2374.8099, 'train_samples_per_second': 278.014, 'train_steps_per_second': 27.802, 'total_flos': 1.0172928909681888e+16, 'train_loss': 0.9798310379438534, 'epoch': 5.0})

<!--  -->

In [19]:
# Pick the device: prefer CUDA, then Apple-silicon MPS, and fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = model.to(device)
# Inference only: switch off dropout (including the LoRA dropout) so the
# predictions are deterministic, and skip gradient bookkeeping.
model.eval()

print("Trained model predictions")

with torch.no_grad():
    for text in text_list:
        # Encode with the full tokenizer call so the attention mask is passed
        # along with the input ids, as in the untrained-model check.
        inputs = tokenizer(text, return_tensors="pt").to(device)

        logits = model(**inputs).logits

        predictions = torch.argmax(logits, dim=-1)

        print(f"{text} - {id2label[predictions.item()]}")

Using device: mps
Trained model predictions
It was good. - LEFT
Not a fan, don't recommended - LEFT
Better than the first one. - LEFT
Women have the right to choose and abortion should be allowed. - LEFT


In [20]:
output_model_file = "pytorch_distilbert_imbd.bin"
# NOTE(review): not used by any cell visible in this notebook.
output_vocab_file = "vocab_distilbert_imbd.bin"
# Directory for the portable adapter + tokenizer export.
adapter_dir = "LORA_distilBERT_HEAD_2017_1_adapter"

# Save model
# NOTE(review): this pickles the whole wrapped model object, which ties the
# file to the exact class definitions and library versions used here. It is
# kept for backward compatibility; prefer the save_pretrained export below.
model_to_save = model
torch.save(model_to_save, output_model_file)

# Save tokenizer vocabulary in the current directory
tokenizer.save_vocabulary(".")  # Current directory

# Save model state dictionary
torch.save(model.state_dict(), "LORA_distilBERT_HEAD_2017_1.pth")

# Portable export: adapter weights + adapter config via PEFT, and the complete
# tokenizer files, so the fine-tuned model can be reloaded with
# from_pretrained on top of the base checkpoint.
model.save_pretrained(adapter_dir)
tokenizer.save_pretrained(adapter_dir)

print("All files saved")

All files saved
