In [1]:
!pip install transformers[torch] tokenizers datasets evaluate rouge_score sentencepiece huggingface_hub --upgrade

Collecting transformers[torch]
  Downloading transformers-4.40.0-py3-none-any.whl (9.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers
  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m103.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting se

In [2]:
import nltk
from datasets import load_dataset
import evaluate
import numpy as np
from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [12]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [153]:
from datasets import load_dataset
import pandas as pd
dataset = pd.read_csv("gdrive/My Drive/alpaca.csv")

In [154]:
dataset.head()

Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Below is an instruction that describes a task....
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Below is an instruction that describes a task....
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Below is an instruction that describes a task....


In [155]:
dataset.shape

(52002, 4)

In [156]:
dataset.dtypes

instruction    object
input          object
output         object
text           object
dtype: object

In [157]:
dataset['input'].fillna('', inplace=True)

In [158]:
dataset['output'] = dataset['output'].astype(str)
dataset['instruction'] = dataset['instruction'].astype(str)
dataset['input'] = dataset['input'].astype(str)

In [159]:
dataset['prompt'] = dataset['instruction'] + dataset["input"]

In [160]:
dataset['prompt'] = dataset['prompt'].apply(lambda x: x.replace('\n', '').strip())

In [161]:
dataset['output'] = dataset['output'].apply(lambda x: x.replace('\n', '').strip())

In [162]:
dataset.head()

Unnamed: 0,instruction,input,output,text,prompt
0,Give three tips for staying healthy.,,1.Eat a balanced diet and make sure to include...,Below is an instruction that describes a task....,Give three tips for staying healthy.
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....,What are the three primary colors?
2,Describe the structure of an atom.,,"An atom is made up of a nucleus, which contain...",Below is an instruction that describes a task....,Describe the structure of an atom.
3,How can we reduce air pollution?,,There are a number of ways to reduce air pollu...,Below is an instruction that describes a task....,How can we reduce air pollution?
4,Describe a time when you had to make a difficu...,,I had to make a difficult decision when I was ...,Below is an instruction that describes a task....,Describe a time when you had to make a difficu...


In [163]:
from datasets import Dataset
dataset = Dataset.from_pandas(dataset)

In [165]:
dataset[0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 2. Exercise regularly to keep your body active and strong. 3. Get enough sleep and maintain a consistent sleep schedule.',
 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'prompt': 'Give three tips for staying healthy.'}

In [166]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "google/flan-t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [127]:
inputs = tokenizer("I have chicken breast, cabbage, and potato. What can I make with these ingredients?", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Make a sour cream']




In [169]:
def encode(examples):
    model_inputs = tokenizer(examples['prompt'], max_length=512, padding=True, truncation=True)
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples['output'], max_length=128, padding=True, truncation=True)

    model_inputs['labels'] = labels.input_ids
    return model_inputs

# Apply function to the dataset
train_dataset = dataset.map(encode, batched=True)

Map:   0%|          | 0/52002 [00:00<?, ? examples/s]



In [170]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=8,  # Adjust batch size based on GPU/TPU memory
    num_train_epochs=3,  # Number of training epochs
    save_steps=1000,
    save_total_limit=2,
    learning_rate=3e-4,
    weight_decay=0.01
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer
)

trainer.train()

Step,Training Loss
500,1.6485
1000,1.0092
1500,1.0111
2000,0.97
2500,0.9765
3000,0.9763
3500,0.9514
4000,0.9497
4500,0.9547
5000,0.9333


TrainOutput(global_step=19503, training_loss=0.8715970003133626, metrics={'train_runtime': 10053.8965, 'train_samples_per_second': 15.517, 'train_steps_per_second': 1.94, 'total_flos': 6.455709627757978e+16, 'train_loss': 0.8715970003133626, 'epoch': 3.0})

In [171]:
model.save_pretrained("gdrive/My Drive/finetuned_t5")
tokenizer.save_pretrained("gdrive/My Drive/finetuned_t5")

('gdrive/My Drive/finetuned_t5/tokenizer_config.json',
 'gdrive/My Drive/finetuned_t5/special_tokens_map.json',
 'gdrive/My Drive/finetuned_t5/spiece.model',
 'gdrive/My Drive/finetuned_t5/added_tokens.json')

In [202]:
model = T5ForConditionalGeneration.from_pretrained("gdrive/My Drive/finetuned_t5", local_files_only=True)
question = f"Give three tips for staying efficient."

# Prepare the input text for T5 format
input_text = question

input_ids = tokenizer.encode(input_text, return_tensors="pt")

#outputs = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)
outputs = model.generate(**inputs)
# Decode the generated output to text
#answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
#print(answer)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True, max_length=256))

['1. Set realistic goals and stick to them.2. Break large tasks into smaller, more manageable']


In [173]:
non_finetuned = T5ForConditionalGeneration.from_pretrained(model_name)

In [201]:
inputs = tokenizer(input_text, return_tensors="pt")
outputs = non_finetuned.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))

['Staying organized is key to a successful business.']
