# Alejandro Paredes, Parameter tuning of BERT

In [6]:
!pip install transformers datasets peft evaluate



In [7]:
from datasets import load_dataset, DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoConfig,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer
)
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
import torch
import numpy as np

In [8]:
model_checkpoint = 'distilbert-base-uncased'

#Define label maps
id2label = {0:"UNDEFINED" ,1:"LEFT",2:"RIGHT",3:"CENTER"}
label2id = {"UNDEFINED": 0, "LEFT": 1, "RIGHT": 2, "CENTER": 3}

model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint, num_labels=4, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from datasets import load_dataset

from transformers import BertTokenizerFast

from torch.utils.data import DataLoader

df = load_dataset("csv", data_files="/Users/ilseoplee/NLPizza_final_project/2017_1.csv") 
df

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning'],
        num_rows: 146718
    })
})

In [10]:
# train_testvalid = 
df = df['train'].train_test_split(test_size=0.1)

In [11]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix=True)

In [12]:
def tokenize_function(examples):
    text = examples["body"]
    labels = examples["political_leaning"]  
    
    tokenizer.truncation_side = "left"
    tokenized_inputs = tokenizer(
        text,
        return_tensors = "np",
        padding = True,
        truncation = True,
        max_length = 512
        )

    tokenized_inputs["labels"] = [label2id[label] for label in labels]  
    return tokenized_inputs

In [13]:
if tokenizer.pad_token is None:
  tokenizer.add_special_tokens({'pad_token': '[PAD]'})
  model.resize_token_embeddings(len(tokenizer))

In [14]:
tokenized_dataset = df.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/132046 [00:00<?, ? examples/s]

Map:   0%|          | 0/14672 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 132046
    })
    test: Dataset({
        features: ['id', 'date_publish', 'outlet', 'headline', 'lead', 'body', 'authors', 'domain', 'url', 'political_leaning', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14672
    })
})

In [15]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
accuracy = evaluate.load("accuracy")

def compute_metrics(p):
  predictions, labels = p
  predictions = np.argmax(predictions, axis=1)
  return {"accuracy": accuracy.compute(predictions=predictions
                                       , references=labels)}

In [17]:
text_list = ["It was good.", "Not a fan, don't recommended",
              "Better than the first one.", "Women have the right to choose and abortion should be allowed."]

import torch

# Set device
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

print("Untrained model")
for text in text_list:
    inputs = tokenizer(text, return_tensors="pt").to(device)  # Move inputs to the correct device
    logits = model(**inputs).logits  # Forward pass
    predictions = torch.argmax(logits, dim=-1)
    print(f'{text} - {id2label[predictions.item()]}')

# print("Untrained model")
# for text in text_list:
#   inputs = tokenizer.encode(text, return_tensors="pt")
#   logits = model(inputs).logits
#   predictions = torch.argmax(logits)
#   print(f'{text} - {id2label[predictions.tolist()]}')

Untrained model
It was good. - LEFT
Not a fan, don't recommended - LEFT
Better than the first one. - LEFT
Women have the right to choose and abortion should be allowed. - LEFT


In [18]:
peft_config = LoraConfig(task_type='SEQ_CLS',
                         r = 4,
                         lora_alpha=32,
                         lora_dropout=0.01,
                         target_modules = ['q_lin'])

In [19]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 630,532 || all params: 67,587,080 || trainable%: 0.9329


In [20]:
lr = 1e-3
batch_size = 10
num_epochs = 5

training_args = TrainingArguments(
    output_dir=""+model_checkpoint+"lora-txt",
    learning_rate = lr,
    per_device_train_batch_size = batch_size,
    per_device_eval_batch_size = batch_size,
    num_train_epochs = num_epochs,
    weight_decay = 0.01,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
)



In [21]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_dataset["train"],
    eval_dataset = tokenized_dataset["test"],
    tokenizer = tokenizer,
    data_collator = data_collator,
    compute_metrics = compute_metrics
)

  trainer = Trainer(


In [22]:
trainer.train()

  0%|          | 0/66025 [00:00<?, ?it/s]

{'loss': 0.8446, 'grad_norm': 2.7001771926879883, 'learning_rate': 0.0009924271109428247, 'epoch': 0.04}
{'loss': 0.6004, 'grad_norm': 9.18339729309082, 'learning_rate': 0.0009848542218856495, 'epoch': 0.08}
{'loss': 0.5425, 'grad_norm': 11.239620208740234, 'learning_rate': 0.0009772813328284742, 'epoch': 0.11}
{'loss': 0.5302, 'grad_norm': 3.765162944793701, 'learning_rate': 0.0009697084437712988, 'epoch': 0.15}
{'loss': 0.4978, 'grad_norm': 7.315803050994873, 'learning_rate': 0.0009621355547141235, 'epoch': 0.19}
{'loss': 0.5099, 'grad_norm': 2.364064931869507, 'learning_rate': 0.0009545626656569481, 'epoch': 0.23}
{'loss': 0.4565, 'grad_norm': 2.588992118835449, 'learning_rate': 0.0009469897765997728, 'epoch': 0.27}
{'loss': 0.469, 'grad_norm': 5.096480846405029, 'learning_rate': 0.0009394168875425976, 'epoch': 0.3}
{'loss': 0.4698, 'grad_norm': 7.594978332519531, 'learning_rate': 0.0009318439984854222, 'epoch': 0.34}
{'loss': 0.4569, 'grad_norm': 5.7825446128845215, 'learning_rate'

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3771231770515442, 'eval_accuracy': {'accuracy': 0.8710468920392584}, 'eval_runtime': 298.3098, 'eval_samples_per_second': 49.184, 'eval_steps_per_second': 4.921, 'epoch': 1.0}
{'loss': 0.4396, 'grad_norm': 8.75094985961914, 'learning_rate': 0.0007955319954562665, 'epoch': 1.02}
{'loss': 0.4191, 'grad_norm': 5.662125110626221, 'learning_rate': 0.0007879591063990913, 'epoch': 1.06}
{'loss': 0.4056, 'grad_norm': 3.3248071670532227, 'learning_rate': 0.0007803862173419159, 'epoch': 1.1}
{'loss': 0.4243, 'grad_norm': 20.49200439453125, 'learning_rate': 0.0007728133282847406, 'epoch': 1.14}
{'loss': 0.4113, 'grad_norm': 11.759016990661621, 'learning_rate': 0.0007652404392275654, 'epoch': 1.17}
{'loss': 0.4125, 'grad_norm': 9.839488983154297, 'learning_rate': 0.00075766755017039, 'epoch': 1.21}
{'loss': 0.4089, 'grad_norm': 5.333361625671387, 'learning_rate': 0.0007500946611132147, 'epoch': 1.25}
{'loss': 0.4004, 'grad_norm': 9.51548957824707, 'learning_rate': 0.000742521772056

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.3429100215435028, 'eval_accuracy': {'accuracy': 0.8820883315158125}, 'eval_runtime': 309.9586, 'eval_samples_per_second': 47.335, 'eval_steps_per_second': 4.736, 'epoch': 2.0}
{'loss': 0.3956, 'grad_norm': 1.027021050453186, 'learning_rate': 0.0005986368799697085, 'epoch': 2.01}
{'loss': 0.3551, 'grad_norm': 9.879945755004883, 'learning_rate': 0.0005910639909125332, 'epoch': 2.04}
{'loss': 0.3768, 'grad_norm': 3.439880132675171, 'learning_rate': 0.0005834911018553578, 'epoch': 2.08}
{'loss': 0.3653, 'grad_norm': 6.963294982910156, 'learning_rate': 0.0005759182127981825, 'epoch': 2.12}
{'loss': 0.3783, 'grad_norm': 5.618332386016846, 'learning_rate': 0.0005683453237410072, 'epoch': 2.16}
{'loss': 0.3658, 'grad_norm': 8.744894981384277, 'learning_rate': 0.0005607724346838319, 'epoch': 2.2}
{'loss': 0.3746, 'grad_norm': 3.110947370529175, 'learning_rate': 0.0005531995456266566, 'epoch': 2.23}
{'loss': 0.3796, 'grad_norm': 5.894505977630615, 'learning_rate': 0.0005456266565

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.29516446590423584, 'eval_accuracy': {'accuracy': 0.8969465648854962}, 'eval_runtime': 302.592, 'eval_samples_per_second': 48.488, 'eval_steps_per_second': 4.851, 'epoch': 3.0}
{'loss': 0.3311, 'grad_norm': 0.2548888921737671, 'learning_rate': 0.000394168875425975, 'epoch': 3.03}
{'loss': 0.3041, 'grad_norm': 1.0469456911087036, 'learning_rate': 0.00038659598636879973, 'epoch': 3.07}
{'loss': 0.3284, 'grad_norm': 2.9848968982696533, 'learning_rate': 0.0003790230973116244, 'epoch': 3.1}
{'loss': 0.3259, 'grad_norm': 0.7888076901435852, 'learning_rate': 0.00037145020825444906, 'epoch': 3.14}
{'loss': 0.3236, 'grad_norm': 6.392965316772461, 'learning_rate': 0.0003638773191972738, 'epoch': 3.18}
{'loss': 0.3173, 'grad_norm': 5.866499423980713, 'learning_rate': 0.00035630443014009845, 'epoch': 3.22}
{'loss': 0.3112, 'grad_norm': 11.672548294067383, 'learning_rate': 0.00034873154108292317, 'epoch': 3.26}
{'loss': 0.3206, 'grad_norm': 1.3947741985321045, 'learning_rate': 0.0003

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.28066781163215637, 'eval_accuracy': {'accuracy': 0.9083287895310797}, 'eval_runtime': 283.7647, 'eval_samples_per_second': 51.705, 'eval_steps_per_second': 5.173, 'epoch': 4.0}
{'loss': 0.2605, 'grad_norm': 3.2562201023101807, 'learning_rate': 0.0001972737599394169, 'epoch': 4.01}
{'loss': 0.2866, 'grad_norm': 6.792115688323975, 'learning_rate': 0.0001897008708822416, 'epoch': 4.05}
{'loss': 0.2637, 'grad_norm': 0.23758991062641144, 'learning_rate': 0.00018212798182506626, 'epoch': 4.09}
{'loss': 0.2786, 'grad_norm': 2.8434126377105713, 'learning_rate': 0.00017455509276789096, 'epoch': 4.13}
{'loss': 0.2667, 'grad_norm': 5.341547012329102, 'learning_rate': 0.00016698220371071565, 'epoch': 4.17}
{'loss': 0.2879, 'grad_norm': 4.266791343688965, 'learning_rate': 0.00015940931465354034, 'epoch': 4.2}
{'loss': 0.2701, 'grad_norm': 1.939521074295044, 'learning_rate': 0.000151836425596365, 'epoch': 4.24}
{'loss': 0.2493, 'grad_norm': 0.46833840012550354, 'learning_rate': 0.000

  0%|          | 0/1468 [00:00<?, ?it/s]

{'eval_loss': 0.25740399956703186, 'eval_accuracy': {'accuracy': 0.9148037077426391}, 'eval_runtime': 289.8877, 'eval_samples_per_second': 50.613, 'eval_steps_per_second': 5.064, 'epoch': 5.0}
{'train_runtime': 34774.366, 'train_samples_per_second': 18.986, 'train_steps_per_second': 1.899, 'train_loss': 0.36235147410475455, 'epoch': 5.0}


TrainOutput(global_step=66025, training_loss=0.36235147410475455, metrics={'train_runtime': 34774.366, 'train_samples_per_second': 18.986, 'train_steps_per_second': 1.899, 'total_flos': 8.874093177643008e+16, 'train_loss': 0.36235147410475455, 'epoch': 5.0})

In [24]:
model.to('cuda')
print('Trained model predictions')
for text in text_list:
  inputs = tokenizer.encode(text, return_tensors='pt').to('cuda')

  logits = model(inputs).logits
  predictions = torch.max(logits,1).indices

  print(f'{text} - {id2label[predictions.tolist()[0]]}')

AssertionError: Torch not compiled with CUDA enabled

In [None]:
output_model_file = 'pytorch_distilbert_imbd.bin'
output_vocab_file = 'vocab_distilbert_imbd.bin'

# Save model
model_to_save = model
torch.save(model_to_save, output_model_file)

# Save tokenizer vocabulary in the current directory
tokenizer.save_vocabulary(".")  # Current directory

# Save model state dictionary
torch.save(model.state_dict(), 'trained_model_gral_imbd.pth')

print('All files saved')



All files saved
