# This notebook fine-tunes a T5 model with LoRA on the openai/gsm8k dataset: it splits the data into training and validation sets, tokenizes the text, applies the LoRA configuration, and trains the model with the Hugging Face Trainer.

In [1]:
!pip install torch transformers datasets accelerate peft
!pip install faiss-cpu faiss-gpu transformers datasets

from IPython.display import clear_output
clear_output()

#### AutoTokenizer: Loads the tokenizer for the model.
#### AutoModelForSeq2SeqLM: Loads the sequence-to-sequence model.
#### Trainer: Handles training the sequence-to-sequence model.
#### TrainingArguments: Specifies training arguments.
#### DataCollatorForSeq2Seq: Pads and batches the tokenized examples.
#### LoraConfig and get_peft_model: Used for applying LoRA fine-tuning to the model.

#### Load the openai/gsm8k dataset.

In [2]:
# Step 2: Import Required Libraries
import torch
from datasets import load_dataset
from huggingface_hub import notebook_login
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq


# Authenticate with Hugging Face.
# SECURITY: never paste an access token into the notebook source -- anyone who
# can read the file can use it. notebook_login() does not take a token argument;
# it opens an interactive prompt where the token is entered at run time.
# Any token that was ever committed in this cell should be revoked at
# https://huggingface.co/settings/tokens.
# Authentication is optional for public datasets and models such as the ones
# used in this notebook.
notebook_login()

# Load the GSM8K dataset ("main" configuration).
dataset = load_dataset("openai/gsm8k", "main")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

#### If the dataset doesn't already have a validation split, create one by splitting the training data into 90% training and 10% validation.

In [5]:
from datasets import load_dataset, DatasetDict

# Check if the validation split exists; if not, carve one out of the training data.
if "validation" not in dataset.keys():
    # A fixed seed makes the 90/10 split identical on every run, so validation
    # losses are comparable between runs.
    train_val_split = dataset["train"].train_test_split(test_size=0.1, seed=42)
    # train_test_split names its held-out part "test"; expose it as "validation".
    dataset = DatasetDict(
        {"train": train_val_split["train"], "validation": train_val_split["test"]}
    )

#### The variable model_name specifies the name of the model you want to use for fine-tuning. In this case, "t5-small" refers to the small version of the T5 (Text-To-Text Transfer Transformer) model.
#### T5 is a versatile model that can be used for various tasks, including text generation, translation, summarization, and more.
#### Alternatives: You can use other versions like "t5-base" or "t5-large" depending on your resource availability and task complexity.

In [6]:
# Step 4: Load the Tokenizer and Model
# Checkpoint to fine-tune; t5-base or another seq2seq checkpoint can be substituted.
model_name = "t5-small"
# Tokenizer and model weights both come from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_name)

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Load the tokenizer for the T5 model.
#### Define a preprocessing function to tokenize the input questions and target answers.
#### Apply this function to the dataset

In [7]:
# Step 4: Preprocess the Data
def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: A batch exposing a 'question' entry (model inputs) and an
            'answer' entry (training targets).

    Returns:
        The tokenizer output for the questions, with the token ids of the
        answers stored under the "labels" key. Both sides are truncated to
        at most 256 tokens.
    """
    inputs = examples['question']
    targets = examples['answer']
    # `text_target` tokenizes the answers as labels in the same call. It replaces
    # the deprecated `tokenizer.as_target_tokenizer()` context manager and puts
    # the target ids directly under "labels".
    model_inputs = tokenizer(
        inputs,
        text_target=targets,
        max_length=256,
        truncation=True,
    )
    return model_inputs

#### Apply the preprocessing function to every split, converting the text into tokenized inputs and labels that the model can consume.

In [8]:
# Apply preprocessing to every split, one batch of examples at a time.
# The raw text columns are dropped so the tokenized splits contain only what
# preprocess_function returns, instead of relying on the Trainer to prune
# string columns before batching.
tokenized_datasets = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map:   0%|          | 0/5446 [00:00<?, ? examples/s]



Map:   0%|          | 0/606 [00:00<?, ? examples/s]

#### Set up the LoRA configuration (rank, scaling factor, dropout) for parameter-efficient fine-tuning.
#### Apply the LoRA configuration to the model.

In [9]:
# Step 6: Configure LoRA
# Adapter hyperparameters gathered in one place, then expanded into LoraConfig.
lora_settings = {
    "task_type": TaskType.SEQ_2_SEQ_LM,  # sequence-to-sequence language modelling
    "inference_mode": False,             # the adapter is going to be trained
    "r": 8,                              # rank
    "lora_alpha": 32,                    # scaling factor
    "lora_dropout": 0.1,                 # dropout rate for LoRA
}
lora_config = LoraConfig(**lora_settings)

# Wrap the model with the LoRA configuration.
# NOTE: this rebinds `model`, so re-running the cell wraps the wrapped model again.
model = get_peft_model(model=model, peft_config=lora_config)

#### Define the training arguments such as learning rate, batch size, number of epochs, and other parameters.

In [10]:
# Step 7: Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",          # checkpoints are written here
    # Evaluate at the end of every epoch.
    # NOTE(review): newer transformers releases name this argument `eval_strategy`.
    evaluation_strategy="epoch",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=3,              # keep at most three checkpoints on disk
    save_steps=500,
    # Mixed precision training needs a CUDA device; fall back to full precision
    # on CPU-only runtimes instead of failing.
    fp16=torch.cuda.is_available(),
)



#### Use DataCollatorForSeq2Seq to handle padding and batching of input sequences.

In [11]:
# Step 8: Initialize the Data Collator
# The collator receives both the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

#### Initialize the Trainer with the model, training arguments, datasets, and data collator. Then train the model and save the fine-tuned model together with the tokenizer.

In [12]:
# Step 9: Initialize the Trainer
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_datasets["train"],
    "eval_dataset": tokenized_datasets["validation"],  # split created earlier if it was missing
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

# Step 10: Train the Model
trainer.train()

# Step 11: Save the Model
# The model and the tokenizer are written to the same directory.
save_dir = "./lora-finetuned-gsm8k"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


Epoch,Training Loss,Validation Loss
1,3.1347,2.114411
2,2.4228,1.93896
3,2.2193,1.895703


('./lora-finetuned-gsm8k/tokenizer_config.json',
 './lora-finetuned-gsm8k/special_tokens_map.json',
 './lora-finetuned-gsm8k/spiece.model',
 './lora-finetuned-gsm8k/added_tokens.json',
 './lora-finetuned-gsm8k/tokenizer.json')