# Alejandro Paredes, Parameter tuning of BERT

In [1]:
#!pip install transformers datasets peft evaluate #

In [1]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np


import re
import contractions
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


In [23]:
model_checkpoint = 'distilbert-base-uncased'

#Define label maps
id2label = {0:"UNDEFINED" ,1:"LEFT",2:"RIGHT",3:"CENTER"}
label2id = {"UNDEFINED": 0, "LEFT": 1, "RIGHT": 2, "CENTER": 3}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=4, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

df = load_dataset("csv", data_files="./data/2019_2.csv", split="train")
df

Dataset({
    features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
    num_rows: 50269
})

In [8]:
# Load and combine the datasets
dataset = load_dataset("csv", data_files="./data/2019_2.csv")#data_files)

# Filter and split the dataset
df  = dataset['train'].filter(
    lambda example: example['headline'] is not None and example['headline'].strip() != ''
).train_test_split(test_size=0.1)

# Display the resulting dataset
df 

Filter: 100%|██████████| 50269/50269 [00:01<00:00, 27238.80 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 45242
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 5027
    })
})

In [9]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix=True)

for i in range(5):
    print('Original Text: ', df['train']['headline'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(df['train']['headline'][i]), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['headline'][i])))

for i in range(2):
    print('Original Text: ', df['train']['body'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(df['train']['body'][i]), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['body'][i])))

Original Text:  For many, it's clear why El Paso, the "ground zero" of the border debate, was the shooting target 

Tokenized Text:  ['for', 'many', ',', 'it', "'", 's', 'clear', 'why', 'el', 'paso', ',', 'the', '"', 'ground', 'zero', '"', 'of', 'the', 'border', 'debate', ',', 'was', 'the', 'shooting', 'target'] 

Token IDs:  [2005, 2116, 1010, 2009, 1005, 1055, 3154, 2339, 3449, 17161, 1010, 1996, 1000, 2598, 5717, 1000, 1997, 1996, 3675, 5981, 1010, 2001, 1996, 5008, 4539]
Original Text:  Politics of automation: Factory workers and robots 

Tokenized Text:  ['politics', 'of', 'automation', ':', 'factory', 'workers', 'and', 'robots'] 

Token IDs:  [4331, 1997, 19309, 1024, 4713, 3667, 1998, 13507]
Original Text:  'Straight Pride Parade' in Boston draws counterprotesters and heavy police presence 

Tokenized Text:  ["'", 'straight', 'pride', 'parade', "'", 'in', 'boston', 'draws', 'counter', '##pro', '##test', '##ers', 'and', 'heavy', 'police', 'presence'] 

Token IDs:  [1005, 3442, 66

Token indices sequence length is longer than the specified maximum sequence length for this model (1374 > 512). Running this sequence through the model will result in indexing errors


Tokenized Text:  ['when', 'a', 'gun', '##man', 'stormed', 'a', 'crowded', 'wal', '##mart', 'in', 'el', 'paso', 'on', 'saturday', ',', 'killing', 'at', 'least', '20', 'people', 'and', 'injuring', 'more', 'than', 'two', 'dozen', 'others', ',', 'the', 'texas', 'border', 'city', 'was', 'hit', 'with', 'an', 'unprecedented', 'level', 'of', 'blood', '##shed', 'and', 'grief', '.', 'along', 'with', 'another', 'mass', 'shooting', 'in', 'dayton', ',', 'ohio', 'some', '13', 'hours', 'later', ',', 'the', 'massacre', 'in', 'el', 'paso', 'reign', '##ited', 'the', 'highly', 'contentious', 'national', 'debate', 'around', 'proposals', 'to', 'regulate', 'guns', '.', 'el', 'paso', 'is', 'a', 'border', 'community', 'una', '##ccus', '##tom', '##ed', 'to', 'such', 'large', '-', 'scale', 'acts', 'of', 'violence', '.', 'and', 'for', 'many', 'residents', 'of', 'el', 'paso', '—', 'an', 'epic', '##enter', 'of', 'another', 'of', 'the', 'nation', "'", 's', 'most', 'di', '##vis', '##ive', 'issues', ',', 'immigration

In [10]:
tokenizer.tokenize('EQT, KKR Among Potential Bidders for Long Beach Container Terminal')

['e',
 '##q',
 '##t',
 ',',
 'k',
 '##kr',
 'among',
 'potential',
 'bid',
 '##ders',
 'for',
 'long',
 'beach',
 'container',
 'terminal']

In [28]:

#lemmatization and removing stopwords
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

#lemmatizer = WordNetLemmatizer()
#stop_words = set(stopwords.words("english"))

def preprocess(text):
    def is_english_word(word):
        """Function to filter out non-English words."""
        return bool(re.match(r'^[a-zA-Z]+$', word))
    text = text.lower()
    text = contractions.fix(text)
    # Remove non-English characters and punctuation
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)  # Removes non-ASCII characters
    return text



for i in range(5):
    print('Original Text: ', df['train']['headline'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['headline'][i])), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['headline'][i])))

for i in range(2):
    print('Original Text: ', df['train']['body'][i], '\n')
    print('Tokenized Text: ', tokenizer.tokenize(preprocess(df['train']['body'][i])), '\n')
    print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(df['train']['body'][i])))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ALEJANDRO\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Original Text:  Portage schools to use Sheriff's Department for resource officers in city 

Tokenized Text:  ['portage', 'schools', 'to', 'use', 'sheriff', "'", 's', 'department', 'for', 'resource', 'officers', 'in', 'city'] 

Token IDs:  [25140, 2816, 2000, 2224, 6458, 1005, 1055, 2533, 2005, 7692, 3738, 1999, 2103]
Original Text:  EQT, KKR Among Potential Bidders for Long Beach Container Terminal 

Tokenized Text:  ['e', '##q', '##t', ',', 'k', '##kr', 'among', 'potential', 'bid', '##ders', 'for', 'long', 'beach', 'container', 'terminal'] 

Token IDs:  [1041, 4160, 2102, 1010, 1047, 21638, 2426, 4022, 7226, 13375, 2005, 2146, 3509, 11661, 5536]
Original Text:  Kano: Meet Christians wey like carry dia case go Islamic police, Sharia court 

Tokenized Text:  ['kan', '##o', ':', 'meet', 'christians', 'we', '##y', 'like', 'carry', 'dia', 'case', 'go', 'islamic', 'police', ',', 'sha', '##ria', 'court'] 

Token IDs:  [22827, 2080, 1024, 3113, 8135, 2057, 2100, 2066, 4287, 22939, 2553, 2175,

In [11]:
def tokenize_function(examples):
    text = examples["body"]
    labels = examples["political_leaning"]  
    
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors = "np",
        padding = True,
        truncation = True,
        max_length = 512
        )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]  
    return tokenized_inputs

In [12]:
texts = df['train']['headline']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split(' ')) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))

# Repeat for the 'body' column
texts = df['train']['body']

# Handle None or missing values by filtering out None entries
text_lengths = [len(text.split()) if text is not None else 0 for text in texts]

print(min(text_lengths))
print(max(text_lengths))

# Count how many texts have 300 or more words
print(sum([1 for length in text_lengths if length >= 300]))


1
37
0
15
12036
29515


In [50]:
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [51]:
tokenized_dataset = df.map(tokenize_function, batched=True)
tokenized_dataset

Map: 100%|██████████| 136735/136735 [01:55<00:00, 1179.22 examples/s]
Map: 100%|██████████| 15193/15193 [00:14<00:00, 1015.18 examples/s]


DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 136735
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 15193
    })
})

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions
                                       , references=labels)}

In [15]:
peft_config = LoraConfig(task_type='SEQ_CLS',
                         r = 4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules = ['q_lin'])

In [16]:
model = get_peft_model(model, peft_config)
print(model.print_trainable_parameters())

model

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329
None


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): DistilBertForSequenceClassification(
      (distilbert): DistilBertModel(
        (embeddings): Embeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (transformer): Transformer(
          (layer): ModuleList(
            (0-5): 6 x TransformerBlock(
              (attention): DistilBertSdpaAttention(
                (dropout): Dropout(p=0.1, inplace=False)
                (q_lin): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.01, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=7

In [7]:
for name, param in model.named_parameters():
    print(f"{name}: requires_grad={param.requires_grad}")

base_model.model.distilbert.embeddings.word_embeddings.weight: requires_grad=False
base_model.model.distilbert.embeddings.position_embeddings.weight: requires_grad=False
base_model.model.distilbert.embeddings.LayerNorm.weight: requires_grad=False
base_model.model.distilbert.embeddings.LayerNorm.bias: requires_grad=False
base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.weight: requires_grad=False
base_model.model.distilbert.transformer.layer.0.attention.q_lin.base_layer.bias: requires_grad=False
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_A.default.weight: requires_grad=True
base_model.model.distilbert.transformer.layer.0.attention.q_lin.lora_B.default.weight: requires_grad=True
base_model.model.distilbert.transformer.layer.0.attention.k_lin.weight: requires_grad=False
base_model.model.distilbert.transformer.layer.0.attention.k_lin.bias: requires_grad=False
base_model.model.distilbert.transformer.layer.0.attention.v_lin.weight: requires

In [None]:
lr = 1e-3
batch_size = 10
num_epochs = 10

training_args = TrainingArguments(
    output_dir=""+model_checkpoint+"lora-txt",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)

In [225]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


### **Make predictions**

In [22]:
from peft import LoraConfig, get_peft_model

# Recreate LoRA configuration
peft_config = LoraConfig(
    task_type="SEQ_CLS", r=4, lora_alpha=32, lora_dropout=0.01, target_modules=["q_lin"]
)

# Apply LoRA configuration
base_model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=4, id2label=id2label, label2id=label2id)
model = get_peft_model(base_model, peft_config)

# Load LoRA adapter weights
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.load_state_dict(torch.load(r"C:\Users\ALEJANDRO\Documents\7. DUKE\1. ECE 684 - NLP\Assignments\Final Project\models\LORA_distilBERT_BODY_2017_1.pth", map_location=device), strict=False)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(r"C:\Users\ALEJANDRO\Documents\7. DUKE\1. ECE 684 - NLP\Assignments\Final Project\models\LORA_distilBERT_BODY_2017_1.pth", map_location=device), strict=False)


<All keys matched successfully>

In [None]:
# Load model and tokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
state_dict = torch.load("trained_model_gral_imbd.pth", map_location=device)

text_list = ['''President-elect Trump announced on Tuesday night that he intends to appoint Linda McMahon, former CEO of World Wrestling Entertainment (WWE), to lead the Department of Education. His announcement, which was posted on Truth Social, came hours after two sources told Fox News that McMahon was likely to be picked. "It is my great honor to announce that Linda McMahon, former Administrator of the Small Business Administration, will be the United States Secretary of Education," Trump's statement read.
"As Secretary of Education, Linda will fight tirelessly to expand Choice to every State in America, and empower parents to make the best Education decisions for their families," the press release added. "Linda served for two years on the Connecticut Board of Education, where she was one of fifteen members overseeing all Public Education in the State, including its Technical High School system."''', 
             '''Donald Trump believes presidents have almost absolute power. In his second term, there will be few political or legal restraints to check him. The president-elects sweeping victory over Vice President Kamala Harris suddenly turned the theoretical notion that he will indulge his autocratic instincts into a genuine possibility.When Trump returns to the White House in January as one of the most powerful presidents in history, hell be able to take advantage of his own filleting of guardrails during his first presidency, which he continued through legal maneuverings out of office.''',
             '''Nearly 100 Democrats, including Salud Carbajal, requested the Ethics Committee release its report on former Congressman Matt Gaetz's misconduct allegations. The letter, led by Rep. Sean Casten, emphasized that the Senate needs information for Gaetz's attorney general nomination. House Speaker Mike Johnson opposed releasing the report, stating Gaetz is now a "private citizen" and outside the panel's jurisdiction.'''
             , ''' A South Dakota judge dismissed a lawsuit from the anti-abortion group Life Defense targeting an abortion rights measure that voters later rejected.
Judge John Pekas dismissed the lawsuit at the request of Life Defense, which had challenged the ballot measure's petitions.
Voters in nine states, including South Dakota, rejected abortion rights measures during the November election. '''
             ]
model.to('cuda')
print('Trained model predictions')
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')

  logits = model(inputs).logits
  predictions = torch.max(logits,1).indices

  #print(f'{text} - {id2label[predictions.tolist()[0]]}')
  print(f'{id2label[predictions.tolist()[0]]}')

  state_dict = torch.load("LORA_distilBERT_BODY_2017_1.pth", map_location=device)


FileNotFoundError: [Errno 2] No such file or directory: 'LORA_distilBERT_BODY_2017_1.pth'