# DistilBERT with LoRA for classifying the political leaning of news articles (input: article BODY)

In [63]:
!pip install transformers datasets peft evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [65]:
import os

import evaluate
import numpy as np
import torch
from datasets import load_dataset, DatasetDict, Dataset
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)

In [66]:
# Backbone checkpoint; the tokenizer cell further down loads from the same name.
model_checkpoint = "distilbert-base-uncased"

# Class-index <-> class-name lookup tables. `label2id` is built as the exact
# inverse of `id2label` so the two can never drift apart.
id2label = dict(enumerate(["UNDEFINED", "LEFT", "RIGHT", "CENTER"]))
label2id = {name: idx for idx, name in id2label.items()}

# Sequence-classification model on top of the checkpoint, configured with one
# label per entry of `id2label` (four in total).
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

# Location of the raw article CSV. The original author's path stays the default
# so existing runs behave exactly as before; set the NEWS_CSV_PATH environment
# variable to point at another copy instead of editing the notebook.
DATA_FILE = os.environ.get(
    "NEWS_CSV_PATH",
    "/Users/ilseoplee/NLPizza_final_project/Filing/LORA_Head/2017_1.csv",
)

# The "csv" loader returns a DatasetDict that holds a single "train" split.
df = load_dataset("csv", data_files=DATA_FILE)
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 146718
    })
})

In [55]:
# Hold out 10% of the articles for evaluation. A fixed seed makes the split, and
# therefore the reported metrics, reproducible across kernel restarts.
# NOTE: `df` is rebound from the loaded DatasetDict to the new train/test
# DatasetDict, so re-running this cell on its own would split the train portion
# a second time. Re-run the loading cell first.
SPLIT_SEED = 42
df = df["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

In [56]:
# Tokenizer that matches the backbone checkpoint. The former `add_prefix=True`
# argument was dropped: it is not a tokenizer option (the similarly named
# `add_prefix_space` belongs to byte-level BPE tokenizers such as GPT-2 and
# RoBERTa), so it had no effect on this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [57]:
def tokenize_function(examples):
    """Tokenize a batch of article bodies and attach integer class labels.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping that provides the ``"body"`` column (article
            text) and the ``"political_leaning"`` column (label names that are
            keys of ``label2id``).

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry
        holding ``label2id[name]`` for every row.

    Raises:
        KeyError: If a label name is not present in ``label2id``.
    """
    # Truncate from the left so that over-long articles keep their ending.
    # This sets an attribute on the shared tokenizer object, so it also applies
    # to every later use of `tokenizer`.
    tokenizer.truncation_side = "left"

    # Deliberately no padding and no `return_tensors` here. Padding inside a
    # batched map pads every row to the longest article of the whole map batch
    # and stores those pad tokens in the dataset. Leaving rows at their natural
    # length lets DataCollatorWithPadding pad each training batch only as far as
    # that batch needs.
    tokenized_inputs = tokenizer(examples["body"], truncation=True, max_length=512)

    tokenized_inputs["labels"] = [
        label2id[label] for label in examples["political_leaning"]
    ]
    return tokenized_inputs

In [58]:
# Guarantee that a padding token exists before any batch is padded. Adding one
# grows the vocabulary, so the model's embedding table is resized to the new
# tokenizer length in the same step.
if tokenizer.pad_token is None:
    pad_spec = {"pad_token": "[PAD]"}
    tokenizer.add_special_tokens(pad_spec)
    vocab_size = len(tokenizer)
    model.resize_token_embeddings(vocab_size)

In [59]:
# Apply the tokenizer to every split in batches. The original columns are kept
# and `input_ids`, `attention_mask` and `labels` are added next to them.
tokenized_dataset = df.map(function=tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/14672 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14672
    })
})

In [60]:
# Collator that pads the examples of a batch with this tokenizer's pad token
# when the Trainer assembles the batch.
data_collator = DataCollatorWithPadding(tokenizer)

In [61]:
# Accuracy metric object loaded through the `evaluate` library.
accuracy = evaluate.load("accuracy")


def compute_metrics(p):
    """Return the accuracy of a batch of predictions.

    Note: a later cell defines another ``compute_metrics`` (accuracy,
    precision, recall, F1). When the cells run top to bottom, that later
    definition replaces this one before the Trainer is built.

    Args:
        p: ``(predictions, labels)`` pair; ``predictions`` holds per-class
            scores with the class dimension on axis 1.

    Returns:
        ``{"accuracy": <float>}``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    # `accuracy.compute` already returns {"accuracy": value}. Wrapping it in
    # another {"accuracy": ...} produced a nested dict instead of a scalar
    # metric, so the result is returned as-is.
    return accuracy.compute(predictions=predictions, references=labels)

In [62]:
# Small hand-written probe set used to eyeball the model's predictions.
text_list = [
    "It was good.",
    "Not a fan, don't recommended",
    "Better than the first one.",
    "Women have the right to choose and abortion should be allowed.",
]

# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
model = model.to(device)

print("Untrained model")
# Inference only: switch off dropout and skip building the autograd graph.
model.eval()
with torch.no_grad():
    for text in text_list:
        # The tokenizer output (ids + attention mask) must live on the model's device.
        inputs = tokenizer(text, return_tensors="pt").to(device)
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1)
        print(f"{text} - {id2label[predictions.item()]}")

Untrained model
It was good. - UNDEFINED
Not a fan, don't recommended - UNDEFINED
Better than the first one. - UNDEFINED
Women have the right to choose and abortion should be allowed. - UNDEFINED


In [43]:
# LoRA adapter settings for sequence classification: rank-4 adapters with
# lora_alpha=32 and 1% adapter dropout, attached only to the modules named
# "q_lin".
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["q_lin"],
    r=4,
    lora_alpha=32,
    lora_dropout=0.01,
)

In [44]:
# Wrap the classifier with the LoRA adapters and report how many parameters
# remain trainable. `model` is rebound to the wrapped model, so this cell should
# run only once per freshly loaded base model.
model = get_peft_model(model=model, peft_config=peft_config)
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329


In [45]:
# Optimisation hyper-parameters: learning rate, per-device batch size, epochs.
lr, batch_size, num_epochs = 1e-3, 10, 5

# Evaluate and checkpoint once per epoch, then restore the best checkpoint when
# training finishes. The output directory is the checkpoint name immediately
# followed by "lora-txt" (no separator).
training_args = TrainingArguments(
    output_dir=f"{model_checkpoint}lora-txt",
    num_train_epochs=num_epochs,
    learning_rate=lr,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

In [46]:
def compute_metrics(eval_pred):  # Training
    """Compute accuracy and weighted precision, recall and F1 for the Trainer.

    Args:
        eval_pred: ``(predictions, labels)`` pair supplied by the Trainer.
            ``predictions`` holds per-class scores with the class dimension
            last; if it arrives as a tuple of model outputs, the first element
            is used.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    predictions, labels = eval_pred
    # A tuple has no `.argmax`, so unwrap it and take the first element as the
    # scores.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Highest-scoring class index per example.
    predicted_ids = predictions.argmax(axis=-1)

    # Weighted averaging accounts for class imbalance. `zero_division=0` yields
    # the same value as the default when a class is never predicted, but without
    # emitting a warning at every evaluation.
    weighted = {"average": "weighted", "zero_division": 0}

    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "precision": precision_score(labels, predicted_ids, **weighted),
        "recall": recall_score(labels, predicted_ids, **weighted),
        "f1": f1_score(labels, predicted_ids, **weighted),
    }


In [47]:
# Gather everything the Trainer needs in one place: the LoRA-wrapped model, the
# run configuration, both tokenized splits, the padding collator and the metric
# function.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

  trainer = Trainer(


In [48]:
# Run fine-tuning. The returned training summary is left as the cell's last
# expression so that it is displayed below the progress log.
train_output = trainer.train()
train_output

  0%|          | 0/66025 [00:00<?, ?it/s]

{'loss': 0.8287, 'grad_norm': 4.748694896697998, 'learning_rate': 0.0009924271109428247, 'epoch': 0.04}
{'loss': 0.6316, 'grad_norm': 9.43201732635498, 'learning_rate': 0.0009848542218856495, 'epoch': 0.08}
{'loss': 0.6149, 'grad_norm': 3.449251413345337, 'learning_rate': 0.0009772813328284742, 'epoch': 0.11}
{'loss': 0.5521, 'grad_norm': 6.571860313415527, 'learning_rate': 0.0009697084437712988, 'epoch': 0.15}
{'loss': 0.5324, 'grad_norm': 6.062027931213379, 'learning_rate': 0.0009621355547141235, 'epoch': 0.19}
{'loss': 0.5253, 'grad_norm': 9.735512733459473, 'learning_rate': 0.0009545626656569481, 'epoch': 0.23}
{'loss': 0.5188, 'grad_norm': 21.42173194885254, 'learning_rate': 0.0009469897765997728, 'epoch': 0.27}
{'loss': 0.5011, 'grad_norm': 10.18978500366211, 'learning_rate': 0.0009394168875425976, 'epoch': 0.3}
{'loss': 0.4707, 'grad_norm': 6.81619119644165, 'learning_rate': 0.0009318439984854222, 'epoch': 0.34}
{'loss': 0.4687, 'grad_norm': 13.150165557861328, 'learning_rate': 

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3901447653770447, 'eval_accuracy': 0.8636859323882224, 'eval_precision': 0.8651431915030486, 'eval_recall': 0.8636859323882224, 'eval_f1': 0.862021879021791, 'eval_runtime': 206.6558, 'eval_samples_per_second': 70.997, 'eval_steps_per_second': 7.104, 'epoch': 1.0}
{'loss': 0.4254, 'grad_norm': 14.58061408996582, 'learning_rate': 0.0007955319954562665, 'epoch': 1.02}
{'loss': 0.4492, 'grad_norm': 14.51124382019043, 'learning_rate': 0.0007879591063990913, 'epoch': 1.06}
{'loss': 0.4416, 'grad_norm': 11.60539436340332, 'learning_rate': 0.0007803862173419159, 'epoch': 1.1}
{'loss': 0.4482, 'grad_norm': 2.033473014831543, 'learning_rate': 0.0007728133282847406, 'epoch': 1.14}
{'loss': 0.4298, 'grad_norm': 4.922005653381348, 'learning_rate': 0.0007652404392275654, 'epoch': 1.17}
{'loss': 0.4118, 'grad_norm': 44.417884826660156, 'learning_rate': 0.00075766755017039, 'epoch': 1.21}
{'loss': 0.4164, 'grad_norm': 21.770553588867188, 'learning_rate': 0.0007500946611132147, 'epoch'

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3659074902534485, 'eval_accuracy': 0.8736368593238822, 'eval_precision': 0.8736656151536688, 'eval_recall': 0.8736368593238822, 'eval_f1': 0.8734697771736488, 'eval_runtime': 183.3657, 'eval_samples_per_second': 80.015, 'eval_steps_per_second': 8.006, 'epoch': 2.0}
{'loss': 0.3962, 'grad_norm': 2.1413371562957764, 'learning_rate': 0.0005986368799697085, 'epoch': 2.01}
{'loss': 0.3686, 'grad_norm': 6.581758499145508, 'learning_rate': 0.0005910639909125332, 'epoch': 2.04}
{'loss': 0.3896, 'grad_norm': 12.2778902053833, 'learning_rate': 0.0005834911018553578, 'epoch': 2.08}
{'loss': 0.3777, 'grad_norm': 37.872291564941406, 'learning_rate': 0.0005759182127981825, 'epoch': 2.12}
{'loss': 0.3708, 'grad_norm': 3.4691784381866455, 'learning_rate': 0.0005683453237410072, 'epoch': 2.16}
{'loss': 0.3976, 'grad_norm': 5.366386413574219, 'learning_rate': 0.0005607724346838319, 'epoch': 2.2}
{'loss': 0.4009, 'grad_norm': 3.442211389541626, 'learning_rate': 0.0005531995456266566, 'epo

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3323524594306946, 'eval_accuracy': 0.8880179934569248, 'eval_precision': 0.8884314537818313, 'eval_recall': 0.8880179934569248, 'eval_f1': 0.8871161729000467, 'eval_runtime': 181.3339, 'eval_samples_per_second': 80.912, 'eval_steps_per_second': 8.096, 'epoch': 3.0}
{'loss': 0.3462, 'grad_norm': 11.210787773132324, 'learning_rate': 0.000394168875425975, 'epoch': 3.03}
{'loss': 0.3438, 'grad_norm': 17.527494430541992, 'learning_rate': 0.00038659598636879973, 'epoch': 3.07}
{'loss': 0.3471, 'grad_norm': 16.668025970458984, 'learning_rate': 0.0003790230973116244, 'epoch': 3.1}
{'loss': 0.3485, 'grad_norm': 7.274332046508789, 'learning_rate': 0.00037145020825444906, 'epoch': 3.14}
{'loss': 0.3388, 'grad_norm': 1.66489577293396, 'learning_rate': 0.0003638773191972738, 'epoch': 3.18}
{'loss': 0.359, 'grad_norm': 1.5970563888549805, 'learning_rate': 0.00035630443014009845, 'epoch': 3.22}
{'loss': 0.3263, 'grad_norm': 7.26780891418457, 'learning_rate': 0.00034873154108292317, 'e

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3035416007041931, 'eval_accuracy': 0.9017175572519084, 'eval_precision': 0.9022964681872638, 'eval_recall': 0.9017175572519084, 'eval_f1': 0.9005797837820249, 'eval_runtime': 180.471, 'eval_samples_per_second': 81.298, 'eval_steps_per_second': 8.134, 'epoch': 4.0}
{'loss': 0.3046, 'grad_norm': 2.8429315090179443, 'learning_rate': 0.0001972737599394169, 'epoch': 4.01}
{'loss': 0.2904, 'grad_norm': 1.144376516342163, 'learning_rate': 0.0001897008708822416, 'epoch': 4.05}
{'loss': 0.3082, 'grad_norm': 3.105647087097168, 'learning_rate': 0.00018212798182506626, 'epoch': 4.09}
{'loss': 0.283, 'grad_norm': 110.03023529052734, 'learning_rate': 0.00017455509276789096, 'epoch': 4.13}
{'loss': 0.2631, 'grad_norm': 2.8784265518188477, 'learning_rate': 0.00016698220371071565, 'epoch': 4.17}
{'loss': 0.2825, 'grad_norm': 11.242555618286133, 'learning_rate': 0.00015940931465354034, 'epoch': 4.2}
{'loss': 0.2695, 'grad_norm': 4.8011016845703125, 'learning_rate': 0.000151836425596365, 

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.28180262446403503, 'eval_accuracy': 0.9074427480916031, 'eval_precision': 0.908375419554778, 'eval_recall': 0.9074427480916031, 'eval_f1': 0.9066165564820392, 'eval_runtime': 182.4428, 'eval_samples_per_second': 80.42, 'eval_steps_per_second': 8.046, 'epoch': 5.0}
{'train_runtime': 18599.8932, 'train_samples_per_second': 35.496, 'train_steps_per_second': 3.55, 'train_loss': 0.3784656818018678, 'epoch': 5.0}


TrainOutput(global_step=66025, training_loss=0.3784656818018678, metrics={'train_runtime': 18599.8932, 'train_samples_per_second': 35.496, 'train_steps_per_second': 3.55, 'total_flos': 8.874093177643008e+16, 'train_loss': 0.3784656818018678, 'epoch': 5.0})

<!--  -->

In [49]:
# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

model = model.to(device)
# Put the model in evaluation mode explicitly. It has dropout layers (the LoRA
# config sets lora_dropout), so predictions should not depend on whatever mode
# the training loop happened to leave it in.
model.eval()
print("Trained model predictions")

# No gradients are needed for inference.
with torch.no_grad():
    for text in text_list:
        # Call the tokenizer directly (rather than `encode`) so the attention
        # mask is passed along with the ids, as in the pre-training probe cell.
        inputs = tokenizer(text, return_tensors="pt").to(device)

        logits = model(**inputs).logits

        predictions = torch.argmax(logits, dim=-1)

        print(f"{text} - {id2label[predictions.item()]}")

Using device: mps
Trained model predictions
It was good. - UNDEFINED
Not a fan, don't recommended - UNDEFINED
Better than the first one. - UNDEFINED
Women have the right to choose and abortion should be allowed. - LEFT


In [None]:
# Target file names for the saved artifacts.
# NOTE(review): `output_vocab_file` is assigned but not used in this cell.
output_model_file = "pytorch_distilbert_imbd.bin"
output_vocab_file = "vocab_distilbert_imbd.bin"

# 1) Whole model object, pickled in one piece.
# NOTE(review): loading this file later requires the same class definitions to
# be importable; consider `model.save_pretrained(...)` for a portable artifact.
model_to_save = model
torch.save(obj=model_to_save, f=output_model_file)

# 2) Tokenizer vocabulary, written into the current working directory.
tokenizer.save_vocabulary(save_directory=".")

# 3) Weights only (state dict), under an extension-less file name.
torch.save(obj=model.state_dict(), f="trained_model_gral_imbd_body_2017_1_shawn")

print("All files saved")

All files saved
