In [1]:
import os
import torch
import json
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig
from datasets import Dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# clear unused memory in GPU
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
# avoid fragmentation (out of memory error)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

In [3]:
# 1: load the opt model and tokenizer
# load with quantization (regular fine-tunning struggles on 12GB RTX 3060)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)


model_name = "facebook/opt-2.7b"

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, quantization_config=bnb_config, device_map="auto")


if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-2.7b and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# 2: prepare the model
model = prepare_model_for_kbit_training(model)

# Load and format news dataset
def load_custom_dataset(json_file):
    with open(json_file, 'r') as f:
        data = json.load(f)
        random.shuffle(data)
    return Dataset.from_dict({
        'text': [item['headline'] for item in data],
        'label': [item['label'] for item in data]
    })

dataset = load_custom_dataset('training_data.json')
#print(dataset[0])

In [5]:
# 3: QLoRA stuff
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLASSIFICATION",
    modules_to_save=["score"]
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 5,248,000 || all params: 2,656,849,920 || trainable%: 0.1975


In [6]:
# 4: Tokenize the dataset
# include labels for loss
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/235 [00:00<?, ? examples/s]

Map: 100%|██████████| 235/235 [00:00<00:00, 14649.49 examples/s]


In [7]:
# numero 6: set up training arguments
training_args = TrainingArguments(
    #output_dir="./opt-full-finetuned",
    per_device_train_batch_size=8,  # lower when OOM
    gradient_accumulation_steps=2,
    num_train_epochs=10,  # training epochs, lower for fine-tunning
    learning_rate=2e-4,
    fp16=True,
    logging_steps=10,
    optim="paged_adamw_8bit",
    save_strategy="epoch",
    report_to="none"               # no external logging services
)

In [8]:
# Step 5: init Trainer
# Trainer() automatically uses GPU if necessary 
# libraries are installed and GPU is available
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(
No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [9]:
# Step 6: Fine tune OPT
print('Training model...')
trainer.train()

Training model...


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,0.723
20,0.6998
30,0.7003
40,0.6449
50,0.6065
60,0.5419
70,0.5181
80,0.429
90,0.3659
100,0.272


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


TrainOutput(global_step=150, training_loss=0.4237530628840129, metrics={'train_runtime': 396.7096, 'train_samples_per_second': 5.924, 'train_steps_per_second': 0.378, 'total_flos': 4553340026880000.0, 'train_loss': 0.4237530628840129, 'epoch': 10.0})

In [10]:
# Step 7: Save the fine-tuned model\
print('Saving model...')
trainer.save_model("./opt-finetuned")
tokenizer.save_pretrained("./opt-finetuned")

Saving model...


('./opt-finetuned\\tokenizer_config.json',
 './opt-finetuned\\special_tokens_map.json',
 './opt-finetuned\\vocab.json',
 './opt-finetuned\\merges.txt',
 './opt-finetuned\\added_tokens.json',
 './opt-finetuned\\tokenizer.json')