In [1]:
!pip install -U transformers
!pip install -q gradio
!pip install transformers[torch]
!pip install tf-keras

In [1]:
import gradio as gr

## Analisando Dataset

Colunas Originais do Dataset: Index(['Unnamed: 0', 'title', 'ingredients', 'directions', 'link', 'source',
       'NER'],
      dtype='object')

Quantidade de Linhas Originalmente: 2231143

In [1]:
import pandas as pd

# Read the recipe CSV from disk with pandas' Python parsing engine,
# then print a plain-text preview of the first five rows.
df = pd.read_csv(
    "datasets/full_dataset.csv",
    engine="python",
)
print(df.head(n=5))

                                                 NER  \
0  ["brown sugar", "milk", "vanilla", "nuts", "bu...   
1  ["beef", "chicken breasts", "cream of mushroom...   
2  ["frozen corn", "cream cheese", "butter", "gar...   
3  ["chicken", "chicken gravy", "cream of mushroo...   
4  ["peanut butter", "graham cracker crumbs", "bu...   

                                          directions  
0  ["In a heavy 2-quart saucepan, mix brown sugar...  
1  ["Place chipped beef on bottom of baking dish....  
2  ["In a slow cooker, combine all ingredients. C...  
3  ["Boil and debone chicken.", "Put bite size pi...  
4  ["Combine first four ingredients and press in ...  


In [2]:
# Print the column index of the loaded frame to check which columns are present.
print(df.columns)

Index(['NER', 'directions'], dtype='object')


In [3]:
# Preview the first rows of the NER column (the per-recipe ingredient entities).
df["NER"].head()

0    ["brown sugar", "milk", "vanilla", "nuts", "bu...
1    ["beef", "chicken breasts", "cream of mushroom...
2    ["frozen corn", "cream cheese", "butter", "gar...
3    ["chicken", "chicken gravy", "cream of mushroo...
4    ["peanut butter", "graham cracker crumbs", "bu...
Name: NER, dtype: object

In [4]:
# Keep only the ingredient-entity and instruction columns, and discard every
# row in which either of them is missing.
df = df.loc[:, ["NER", "directions"]].dropna(axis=0, how="any")

df.head(n=5)

Unnamed: 0,NER,directions
0,"[""brown sugar"", ""milk"", ""vanilla"", ""nuts"", ""bu...","[""In a heavy 2-quart saucepan, mix brown sugar..."
1,"[""beef"", ""chicken breasts"", ""cream of mushroom...","[""Place chipped beef on bottom of baking dish...."
2,"[""frozen corn"", ""cream cheese"", ""butter"", ""gar...","[""In a slow cooker, combine all ingredients. C..."
3,"[""chicken"", ""chicken gravy"", ""cream of mushroo...","[""Boil and debone chicken."", ""Put bite size pi..."
4,"[""peanut butter"", ""graham cracker crumbs"", ""bu...","[""Combine first four ingredients and press in ..."


In [6]:
# # Use shutil to back up the file before overwriting it
# import shutil

# shutil.copy("datasets/full_dataset.csv", "datasets/full_dataset_backup.csv")

# # Write the updated DataFrame back to the CSV file
# df.to_csv("datasets/full_dataset.csv", index=False)

In [9]:
# Step 2: Select the first 270,000 rows
# df = df.head(270000)

# Write the updated DataFrame back to the CSV file
# df.to_csv("datasets/full_dataset.csv", index=False)

## Tokenização

In [10]:
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer

  machar = _get_machar(dtype)
2024-10-22 13:09:39.012050: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-22 13:09:39.145667: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-22 13:09:39.189034: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-22 13:09:39.806662: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Load the pretrained GPT-2 tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token so that the padded
# tokenization calls later in the notebook have a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token  # pad_token and eos_token are now the same token

In [12]:
# Define a function to tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of recipes for causal language-model fine-tuning.

    GPT-2 is a decoder-only model: its labels must line up position-by-position
    with ``input_ids`` (the model shifts them internally). Each example is
    therefore rendered as ONE sequence -- prompt followed by the directions and
    an end-of-sequence token -- and the labels are a copy of ``input_ids`` with
    padding positions replaced by -100 so they are ignored by the loss.

    Args:
        examples: Batch mapping with the keys ``'NER'`` and ``'directions'``,
            each a list of strings (JSON-encoded lists in this dataset).

    Returns:
        The tokenizer output (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry of the same shape as ``input_ids``.
    """
    import json  # local import keeps this cell self-contained

    def _flatten(raw, separator):
        """Join a JSON-encoded list of strings; fall back to the raw text."""
        try:
            items = json.loads(raw)
        except (TypeError, ValueError):
            return str(raw)
        if isinstance(items, list):
            return separator.join(str(item) for item in items)
        return str(raw)

    # The prompt wording matches the one used by generate_recipe at inference
    # time, so the model is queried in the same format it was trained on.
    texts = [
        f"I have the following ingredients: {_flatten(ner, ', ')}. "
        f"What recipe can I make? {_flatten(directions, ' ')}{tokenizer.eos_token}"
        for ner, directions in zip(examples['NER'], examples['directions'])
    ]

    model_inputs = tokenizer(texts, max_length=512, padding="max_length", truncation=True)

    # pad_token is the same token as eos_token, so padding cannot be detected
    # by token id; use the attention mask to find the padded positions.
    model_inputs["labels"] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(model_inputs["input_ids"], model_inputs["attention_mask"])
    ]
    return model_inputs

# Wrap the cleaned pandas DataFrame in a Hugging Face Dataset so it can be
# mapped over and handed to the Trainer.
dataset = Dataset.from_pandas(df)

# Run tokenize_function over the whole dataset; batched=True means the function
# receives a batch of rows (column name -> list of values) per call.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/700237 [00:00<?, ? examples/s]



## Fine-tuning

In [13]:
from transformers import TrainingArguments

In [14]:
# Load the pretrained GPT-2 model with its language-modelling head.
model = GPT2LMHeadModel.from_pretrained("gpt2")
# Keep the embedding matrix the same size as the tokenizer vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Hold out a small evaluation split. Evaluating on the training data itself
# gives a validation loss that says nothing about generalisation and repeats a
# full pass over the training set every epoch. The seed keeps the split
# reproducible across kernel restarts.
split_dataset = tokenized_dataset.train_test_split(test_size=0.05, seed=42)

# Define training arguments.
# NOTE: `eval_strategy` is the current name of the former `evaluation_strategy`
# argument; the install cell upgrades transformers to the latest release, where
# the old name is deprecated.
training_args = TrainingArguments(
    output_dir="./results",          # Output directory for checkpoints
    eval_strategy="epoch",           # Evaluate at the end of every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=2,   # Batch size for training
    per_device_eval_batch_size=2,    # Batch size for evaluation
    num_train_epochs=3,              # Number of training epochs
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,                # Log every 10 optimisation steps
    save_total_limit=2,              # Keep at most two checkpoints on disk
)

# Initialize the Trainer with disjoint train / evaluation sets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
)

# Fine-tune the model
trainer.train()

Detected kernel version 4.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the model and the tokenizer into the same directory so that both can be
# reloaded later with from_pretrained("./fine_tuned_recipe_model").
model.save_pretrained("./fine_tuned_recipe_model")
tokenizer.save_pretrained("./fine_tuned_recipe_model")

## Gerando receitas

In [None]:
# Reload the fine-tuned model and its tokenizer from the directory written by
# the save step, under separate names from the training-time objects.
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./fine_tuned_recipe_model")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_recipe_model")

In [3]:
# Define a function to generate a recipe based on ingredients
def generate_recipe(ingredients):
    """Generate recipe text for a user-supplied list of ingredients.

    The reloaded ``fine_tuned_model`` / ``fine_tuned_tokenizer`` pair is used
    for tokenizing, generating and decoding, so the function does not depend on
    the training-time ``model`` / ``tokenizer`` objects being alive in the kernel.

    Args:
        ingredients: Free-text ingredient list (e.g. comma separated).

    Returns:
        The decoded model output as a string, or a short hint when no
        ingredients were provided. For a decoder-only model the output is
        expected to begin with the prompt itself -- TODO confirm whether the
        UI should strip it.
    """
    # Nothing to condition on: answer directly instead of running the model.
    if not ingredients or not ingredients.strip():
        return "Please enter at least one ingredient."

    prompt = f"I have the following ingredients: {ingredients.strip()}. What recipe can I make?"

    # Tokenize once; a single prompt needs no padding, and the tokenizer still
    # returns the attention mask that generate() expects.
    inputs = fine_tuned_tokenizer(prompt, return_tensors='pt', truncation=True)

    # Fall back to EOS if the reloaded tokenizer carries no pad token.
    pad_token_id = fine_tuned_tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = fine_tuned_tokenizer.eos_token_id

    # Beam search with an n-gram repetition block, capped at 200 tokens in total.
    output_ids = fine_tuned_model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_length=200,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        pad_token_id=pad_token_id,
    )

    # Decode the best sequence, dropping special tokens such as EOS.
    return fine_tuned_tokenizer.decode(output_ids[0], skip_special_tokens=True)

* Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.




In [None]:
# Build the web UI: a heading, a short instruction, an input textbox, an output
# textbox, and a button that feeds the input through generate_recipe.
with gr.Blocks() as demo:
    gr.Markdown(value="# Text-to-Recipe Converter")
    gr.Markdown(value="Enter a list of ingredients and the model will generate a recipe for you.")

    ingredients_input = gr.Textbox(label="Enter Ingredients (comma separated)")
    recipe_output = gr.Textbox(label="Generated Recipe")
    generate_button = gr.Button(value="Generate Recipe")

    generate_button.click(
        fn=generate_recipe,
        inputs=ingredients_input,
        outputs=recipe_output,
    )

# Start the Gradio server for the interface defined above.
demo.launch()