# Fine-tune Llama 2 on the ELI5 Dataset

Task description: Causal language modelling (CLM) is text generation. Given a prompt (source sequence), a causal language model generates the tokens that continue it.

Original Tutorial: https://huggingface.co/docs/transformers/tasks/language_modeling

In [None]:
# Install the packages this notebook depends on (-q keeps pip's output quiet)
!pip install -q transformers datasets evaluate accelerate bitsandbytes loralib peft python-dotenv

In [None]:
from dotenv import load_dotenv
import os
from pprint import pprint

# Pull variables from a local .env file into the process environment
load_dotenv(override=True)

# Hugging Face access token used for the gated Llama 2 checkpoints; None when the variable is unset
hf_api_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")

# Load ELI5 dataset

In [None]:
from datasets import load_dataset

# Load the first 5,000 rows of the "train_asks" split
# NOTE(review): confirm the "eli5" dataset can still be downloaded from the Hub before running
eli5 = load_dataset("eli5", split = "train_asks[:5000]")

In [None]:
# Split the dataset into a train (80%) and test (20%) set.
# A fixed seed keeps the partition -- and therefore the perplexity reported later -- reproducible across runs.
eli5 = eli5.train_test_split(test_size=0.2, seed=42)

In [None]:
# Look at the data: pretty-print the first training example
pprint(eli5['train'][0])

# The text column is our model input


In [None]:
# Preprocessing
## Load the tokenizer that belongs to the checkpoint being fine-tuned
from transformers import AutoTokenizer

checkpoint = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hf_api_token)

In [None]:
# We can use Pytorch to check how the model expects input features
# from transformers import AutoTokenizer, AutoModelForCausalLM

# checkpoint = "meta-llama/Llama-2-70b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=hf_api_token)
# model = AutoModelForCausalLM.from_pretrained(checkpoint, token=hf_api_token)
# text = "Replace me by any text you'd like."

# print("raw_text:\n", text)
# encoded_input = tokenizer(text, return_tensors='pt')
# print("encoded_input:\n",encoded_input)
# output = model(**encoded_input)

# print("encoded_output:\n", output)

# # Our input has the structure
# '''
# {"input_ids": tensor([[]]), 'attention_mask': tensor([[]])}
# '''

# Make a PEFT Model

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType
import torch

# Hub identifiers of the base model and its tokenizer
model_name = "meta-llama/Llama-2-7b-chat-hf"
tokenizer_name = "meta-llama/Llama-2-7b-chat-hf"

# Configuration for the PEFT method (LoRA on the attention query/key/value projections)
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj"],
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    inference_mode=False,
)

# Quantization settings applied when the base model is loaded
nf4_config = BitsAndBytesConfig(
    # Load the model in 4-bit precision
    load_in_4bit=True,
    # Use normalized float 4 as the 4-bit data type
    bnb_4bit_quant_type="nf4",
    # Quantize a second time after the first pass to save an additional 0.4 bits per parameter
    bnb_4bit_use_double_quant=True,
    # Data type in which computations are carried out
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Load the 4-bit quantized base model, prepare it for training, then wrap it with the LoRA adapters
from peft import prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(model_name, quantization_config=nf4_config, device_map="auto", token=hf_api_token)
# PEFT's documented preparation step for training on top of a quantized (k-bit) base model;
# it has to run before the adapters are attached.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)


In [None]:
# Show the module structure of the PEFT-wrapped model
print(model)

In [None]:
# Report the model's trainable parameters
model.print_trainable_parameters()

# Preprocessing
We need to create a preprocess function that we will apply to every instance in the dataset. The preprocess function needs to:

1. Flatten the instance so that the text column is easily accessible
2. Join each list of answer strings into a single string
3. Tokenize the result

Some token sequences will be **longer** than the maximum input length for the model. Hence we use a second preprocessing function to:

1. Concatenate all token sequences
2. Split the concatenated sequence into shorter chunks whose length is set by the `block_size` parameter.

In [None]:
# The text field is nested, so flatten each instance; the answers then live in the "answers.text" column
eli5 = eli5.flatten()
pprint(eli5['train'][0])

In [None]:
# Operation to apply to every instance, shown on the first training example:
# join its answers with single spaces, then tokenize the joined text
first_answers = " ".join(eli5['train'][0]['answers.text'])
print(first_answers, "\n")
print(tokenizer(first_answers))

In [None]:
# Wrap in a preprocess function
def preprocess_function(examples):
    """Tokenize a batch after joining each example's answers with single spaces.

    Reads the ``"answers.text"`` column of ``examples`` and returns whatever the
    notebook-level ``tokenizer`` produces for the list of joined strings.
    """
    joined_answers = list(map(" ".join, examples["answers.text"]))
    return tokenizer(joined_answers)

In [None]:
# Tokenize the whole dataset. batched=True hands several examples to preprocess_function at once;
# the original columns are dropped so that only the tokenizer's output remains.
tokenized_eli5 = eli5.map(
    preprocess_function,
    batched=True,
    num_proc=4,
    remove_columns=eli5["train"].column_names,
)

In [None]:
def group_texts(examples, block_size: int = 128):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Args:
        examples: Mapping from column name to a list of per-example token lists.
            Must contain an ``"input_ids"`` column.
        block_size: Number of tokens in each output chunk.

    Returns:
        A dict with the same columns as ``examples``, each re-split into chunks of
        ``block_size`` tokens, plus a ``"labels"`` column copied from ``"input_ids"``.
        When the batch holds at least ``block_size`` tokens, the trailing remainder
        that does not fill a whole chunk is dropped; a smaller batch is returned as
        a single, shorter chunk.
    """
    from itertools import chain  # local import keeps this cell self-contained

    # Concatenate all texts. chain.from_iterable flattens in linear time, whereas
    # sum(lists, []) copies the growing accumulator on every addition (quadratic).
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    # Every column has the same total length, so measure the first one.
    total_length = len(concatenated_examples[list(examples.keys())[0]])

    # Round down to a multiple of block_size so that every chunk is full.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Split by chunks of block size
    result = {
        k: [t[i: i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are a copy of the inputs.
    result["labels"] = result["input_ids"].copy()
    return result


In [None]:
# Second preprocessing pass over the whole dataset: regroup the tokenized text
# into chunks (group_texts runs with its default block_size)
lm_dataset = tokenized_eli5.map(
    group_texts,
    batched=True,
    num_proc=4,
)

In [None]:
# Inspect the first grouped training example
pprint(lm_dataset['train'][0])

In [None]:
# Create a batch of examples, with dynamic padding. Use the appropriate collator function
# (mlm=False selects causal rather than masked language modelling)
from transformers import DataCollatorForLanguageModeling

# Use the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(tokenizer = tokenizer, mlm = False)

# Train using the Trainer API
The main training steps are:

1. Define the training hyperparameters with `TrainingArguments`. With `evaluation_strategy="epoch"`, the Trainer computes the evaluation loss at the end of every epoch; checkpoints are written to `output_dir`.

2. Pass the training arguments to `Trainer` together with the model, the train and evaluation datasets, and the data collator. The tokenizer is not passed here, so it is saved manually after training.

3. Call train() to finetune the model

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
# Hyperparameters for the run; the evaluation set is scored once per epoch.
# NOTE(review): fp16=True enables float16 mixed precision here, while nf4_config sets
# bnb_4bit_compute_dtype to bfloat16 -- confirm that this combination is intended.
training_args = TrainingArguments(
    output_dir="eli5_clm",
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    evaluation_strategy="epoch",
    save_total_limit=3,
    fp16=True,
    push_to_hub=False,
)

# The Trainer gets the model, both dataset splits and the collator (no tokenizer is passed)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
)

trainer.train()

In [None]:
# Score the fine-tuned model on the evaluation set; perplexity is exp(evaluation loss)
import math

eval_results = trainer.evaluate()
perplexity = math.exp(eval_results["eval_loss"])
print(f"Perplexity: {perplexity:.2f}")

In [None]:
# Save the fine-tuned model to a local directory
trainer.save_model("eli5_causal_modell")

In [None]:
# The Trainer above was created without the tokenizer, so the tokenizer was not saved automatically;
# save it manually in the model folder so that it is available for inference
tokenizer.save_pretrained("eli5_causal_modell", legacy_format=False)

# Inference

Load the fine-tuned adapter on top of the base model and generate text with `model.generate`.

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

In [None]:
from peft import PeftModel, PeftConfig

# Directory the fine-tuned adapter was saved to
peft_model_id = "eli5_causal_modell"
config = PeftConfig.from_pretrained(peft_model_id)

# Load the base checkpoint named in the adapter's own config so the adapter is applied to the
# model it was trained on; fall back to the original checkpoint if the config does not name one.
base_model_name = config.base_model_name_or_path or "meta-llama/Llama-2-7b-chat-hf"

model = AutoModelForCausalLM.from_pretrained(base_model_name, quantization_config=nf4_config, device_map="auto", token=hf_api_token)
# Attach the fine-tuned adapter weights to the quantized base model
model = PeftModel.from_pretrained(model, peft_model_id)

In [None]:
# Pass the access token here as well, as every other load of this checkpoint in the notebook does
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", token=hf_api_token)

In [None]:
# The quantized model was already placed on its device(s) by device_map="auto" when it was loaded,
# so it is not moved again: calling .to() on a 4-bit, dispatched model is at best redundant and can
# fail or undo that placement. Instead, look up where the model's first parameters live so that the
# inputs can be sent to the same device.
device = next(model.parameters()).device
model.eval()

In [None]:
# Text the model is asked to continue
prompt = "Sherlock Holmes burst into the apartment, out of breath and harried, he looked around anxiously and exclaimed"

In [None]:
# Inference with PyTorch: tokenize the prompt into a tensor of input ids (return_tensors="pt")
inputs = tokenizer(prompt, return_tensors="pt").input_ids

print(inputs)

In [None]:
import torch

# Generation needs no gradients, so run it with autograd disabled
with torch.no_grad():
    # Sample up to 100 new tokens that continue the prompt
    outputs = model.generate(
        input_ids=inputs.to(device),
        max_new_tokens=100,
        do_sample=True,
        top_k=50,
        top_p=0.95,
    )

In [None]:
# Decode the generated token ids back into text
# NOTE(review): `import pprint` rebinds the name `pprint`, which an earlier cell imported as the
# function (`from pprint import pprint`); re-running an earlier `pprint(...)` cell after this one
# would fail. Neither pprint nor numpy is referenced in this cell.
import pprint
import numpy as np

# Special tokens are dropped from the decoded text
decoded_output = tokenizer.batch_decode(outputs.detach().cpu().numpy(),
                                     skip_special_tokens=True,
                                     )
decoded_output

In [None]:
# Print the decoded sequences as one string (joined without a separator)
print("".join(decoded_output))