In [1]:
# Mount Google Drive (to access dataset and save model)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import files

uploaded = files.upload()  # Upload .zip file

Saving unique_prompts_generated_recipes_v2.csv to unique_prompts_generated_recipes_v2.csv


In [3]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the dataset
data = pd.read_csv('/content/unique_prompts_generated_recipes_v2.csv')

In [4]:
data

Unnamed: 0,Prompt,Generated Recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,..."
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ..."
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre..."
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on..."
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b..."
...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,..."
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol..."
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra..."
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se..."


In [5]:
# Clean the text: Remove special characters and lowercasing
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove non-alphanumeric characters
    return text

data['cleaned_prompt'] = data['Prompt'].apply(clean_text)
data['cleaned_recipe'] = data['Generated Recipe'].apply(clean_text)

# Split the data into train and test sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)


data

Unnamed: 0,Prompt,Generated Recipe,cleaned_prompt,cleaned_recipe
0,Generate a dairy-free recipe for lunch with yo...,"Dish: Ingredients: yogurt, chickpeas, spinach,...",generate a dairyfree recipe for lunch with yog...,dish ingredients yogurt chickpeas spinach carr...
1,Generate a dairy-free recipe for dinner with g...,"Dish: Ingredients: ginger, olive oil, tomato, ...",generate a dairyfree recipe for dinner with gi...,dish ingredients ginger olive oil tomato spina...
2,Generate a vegetarian recipe for dinner with c...,"Dish: Ingredients: cucumber, potato, tofu, bre...",generate a vegetarian recipe for dinner with c...,dish ingredients cucumber potato tofu breadcru...
3,Generate a dairy-free recipe for lunch with le...,"Dish: Ingredients: lentils, basil, spinach, on...",generate a dairyfree recipe for lunch with len...,dish ingredients lentils basil spinach onion i...
4,Generate a vegetarian recipe for dinner with b...,"Dish: Ingredients: basil, lemongrass, pasta, b...",generate a vegetarian recipe for dinner with b...,dish ingredients basil lemongrass pasta breadc...
...,...,...,...,...
2995,Generate a vegetarian recipe for dinner with t...,"Dish: Ingredients: tomato, soy sauce, spinach,...",generate a vegetarian recipe for dinner with t...,dish ingredients tomato soy sauce spinach chic...
2996,Generate a dairy-free recipe for dinner with e...,"Dish: Ingredients: eggplant, rice, avocado, ol...",generate a dairyfree recipe for dinner with eg...,dish ingredients eggplant rice avocado olive o...
2997,Generate a dairy-free recipe for dinner with p...,"Dish: Ingredients: potato, olive oil, lemongra...",generate a dairyfree recipe for dinner with po...,dish ingredients potato olive oil lemongrass c...
2998,Generate a dairy-free recipe for dinner with t...,"Dish: Ingredients: tomato, eggplant, pasta, se...",generate a dairyfree recipe for dinner with to...,dish ingredients tomato eggplant pasta sesame ...


In [6]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [7]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset


# Initialize the tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')


# Tokenize the input prompts and generated recipes
train_encodings = tokenizer(list(train_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
train_labels = tokenizer(list(train_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

test_encodings = tokenizer(list(test_data['cleaned_prompt']), truncation=True, padding=True, max_length=128)
test_labels = tokenizer(list(test_data['cleaned_recipe']), truncation=True, padding=True, max_length=128)

# Create a custom dataset for use in the Trainer
train_dataset = Dataset.from_dict({
    'input_ids': train_encodings['input_ids'],
    'attention_mask': train_encodings['attention_mask'],
    'labels': train_labels['input_ids']
})

test_dataset = Dataset.from_dict({
    'input_ids': test_encodings['input_ids'],
    'attention_mask': test_encodings['attention_mask'],
    'labels': test_labels['input_ids']
})

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
test_dataset[0]

{'input_ids': [3806,
  3,
  9,
  13688,
  2113,
  2696,
  21,
  3074,
  28,
  12784,
  9417,
  3702,
  12909,
  24395,
  3,
  16217,
  21659,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'labels': [4419,
  3018,
  12784,
  9417,
  3702,
  12909,
  24395,
  3909,
  5148,
  12784,
  9417,
  3702,
  12909,
  617,
  24395,
  3989,
  8583,
  11,
  1716,
  1312,
  1,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [14]:
# Set up training arguments with validation logging
training_args = TrainingArguments(
    output_dir='./results',  # Output directory
    num_train_epochs=5,  # Number of training epochs
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    warmup_steps=500,  # Warmup steps
    weight_decay=0.01,  # Weight decay
    logging_dir='./logs',  # Logging directory
    logging_steps=10,
    # Replace 'evaluation_strategy' with 'eval_strategy'
    eval_strategy="epoch",  # Evaluate during training
    eval_steps=500,  # Evaluate every 500 steps
    # Change save_strategy to "epoch" to match eval_strategy
    save_strategy="epoch",
    save_steps=1000,  # Save model checkpoints every 1000 steps
    load_best_model_at_end=True,  # Load the best model when finished
)

# Initialize Trainer
trainer = Trainer(
    model=model,  # The model to train
    args=training_args,  # Training arguments
    train_dataset=train_dataset,  # Training dataset
    eval_dataset=test_dataset,  # Evaluation dataset
)

# Fine-tune the model
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmamoonaramzan2[0m ([33mmamoonaramzan2-nust[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,0.2612,0.028117
2,0.0061,8.4e-05
3,0.0015,3.9e-05
4,0.0011,2.6e-05
5,0.0008,2.3e-05


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=1500, training_loss=1.1192775597621998, metrics={'train_runtime': 403.11, 'train_samples_per_second': 29.769, 'train_steps_per_second': 3.721, 'total_flos': 91990130688000.0, 'train_loss': 1.1192775597621998, 'epoch': 5.0})

In [15]:
# Save the fine-tuned model
model.save_pretrained('./fine_tuned_t5_recipe_model')
tokenizer.save_pretrained('./fine_tuned_t5_recipe_model')

('./fine_tuned_t5_recipe_model/tokenizer_config.json',
 './fine_tuned_t5_recipe_model/special_tokens_map.json',
 './fine_tuned_t5_recipe_model/spiece.model',
 './fine_tuned_t5_recipe_model/added_tokens.json')

In [16]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load the fine-tuned model and tokenizer
model = T5ForConditionalGeneration.from_pretrained('./fine_tuned_t5_recipe_model')
tokenizer = T5Tokenizer.from_pretrained('./fine_tuned_t5_recipe_model')

# Set the device to GPU if available, else use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the device (GPU/CPU)
model.to(device)

# Function to generate a recipe from a prompt
def generate_recipe(prompt, model, tokenizer, max_length=150):
    prompt = clean_text(prompt)  # Clean the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    # Generate the recipe
    output = model.generate(input_ids, attention_mask=attention_mask, max_length=max_length, num_return_sequences=1)

    return tokenizer.decode(output[0], skip_special_tokens=True)

# Test the recipe generation
prompt = "Generate a vegetarian recipe for dinner with tomatoes and spinach"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients tomatoes and spinach instructions combine tomatoes and spinach add spinach cook thoroughly and serve hot


In [17]:
# Test the recipe generation with a different prompt
prompt = "Generate a vegan dessert recipe with chocolate and almonds"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients chocolate and almonds instructions combine chocolate and almonds add almonds cook thoroughly and serve hot


In [18]:
# Test the recipe generation with another prompt
prompt = "Generate a gluten-free recipe for breakfast with eggs and avocado"
generated_recipe = generate_recipe(prompt, model, tokenizer)
print(generated_recipe)


dish ingredients eggs and avocado instructions combine eggs and avocado add avocado cook thoroughly and serve hot


In [19]:
# Save the final model
trainer.save_model("./final_model")

# Save tokenizer (optional but usually needed)
tokenizer.save_pretrained("./final_model")


('./final_model/tokenizer_config.json',
 './final_model/special_tokens_map.json',
 './final_model/spiece.model',
 './final_model/added_tokens.json')

In [20]:
!zip -r final_model.zip final_model


  adding: final_model/ (stored 0%)
  adding: final_model/spiece.model (deflated 48%)
  adding: final_model/special_tokens_map.json (deflated 85%)
  adding: final_model/tokenizer_config.json (deflated 94%)
  adding: final_model/training_args.bin (deflated 52%)
  adding: final_model/config.json (deflated 63%)
  adding: final_model/added_tokens.json (deflated 83%)
  adding: final_model/model.safetensors (deflated 12%)
  adding: final_model/generation_config.json (deflated 29%)


In [22]:
from google.colab import files
files.download("final_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>