In [None]:
!pip install datasets
!pip install transformers
!pip install transformers[torch]
!pip install accelerate
!pip install rouge_score
!pip install evaluate

In [None]:
# import transformers
from transformers import Trainer, TrainingArguments, Seq2SeqTrainingArguments
from datasets import load_dataset, Dataset

In [None]:
import accelerate

import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import torch
# The allocator setting above is exported before `torch` is imported in this cell.
# NOTE(review): presumably meant to reduce CUDA memory fragmentation — confirm against the PyTorch docs.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the CSVs and
# checkpoints from paths under "/content/drive/My Drive".
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

# Read the train and eval CSV files from Drive as two named splits.
dataset = load_dataset(
    'csv',
    data_files={
        'train': "/content/drive/My Drive/spring24/NLP499group/data/recipe_dataset_train_8k.csv",
        'eval': "/content/drive/My Drive/spring24/NLP499group/data/recipe_dataset_eval_2k.csv",
    },
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 8000
    })
    eval: Dataset({
        features: ['input_text', 'target_text'],
        num_rows: 2000
    })
})

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the stock `facebook/bart-base` tokenizer and seq2seq model.
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Rebind `tokenizer` and `model` to the checkpoint stored on Drive.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/bart5000_basic_tokenizer")
model = AutoModelForSeq2SeqLM.from_pretrained(
    "/content/drive/My Drive/bart5000_basic_model",
    local_files_only=True,
)

In [None]:
# Define our preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of examples into encoder inputs and decoder labels.

    Args:
        examples: Batch mapping with "input_text" and "target_text" columns
            (lists of strings when called through `dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 128 tokens)
        plus a "labels" entry holding the target token ids padded/truncated to
        512 tokens, with every padding position replaced by -100.
    """
    # Encoder side: the prompt text.
    model_inputs = tokenizer(examples["input_text"], max_length=128, padding='max_length', truncation=True)

    # Decoder side: the reference text.
    labels = tokenizer(text_target=examples["target_text"], max_length=512, padding='max_length', truncation=True)

    # Padding positions must not contribute to the loss. The loss ignores the
    # index -100, so swap the pad token id for it; otherwise most of each
    # 512-token label row is padding that the model is trained to predict.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Map the preprocessing function across our dataset
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    eval: Dataset({
        features: ['input_text', 'target_text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [None]:
import nltk
import evaluate
import numpy as np

# `nltk.sent_tokenize` needs the Punkt sentence-splitting data. Newer NLTK
# releases look for the "punkt_tab" resource instead of "punkt", so request
# both; a resource the installed version does not know is skipped without
# raising.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
metric = evaluate.load("rouge")

def compute_metrics(eval_preds):
    """Compute ROUGE between generated token ids and reference labels.

    Args:
        eval_preds: Pair `(preds, labels)` of token-id arrays. `preds` may
            also arrive as a tuple whose first element holds the token ids.

    Returns:
        The result of `metric.compute(...)` for the decoded predictions and
        references (stemming enabled).
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        # Some models return extra outputs alongside the predictions.
        preds = preds[0]

    # -100 marks ignored/padded positions and is not a valid token id, so map
    # it back to the pad token before decoding (for predictions and labels).
    pad_id = tokenizer.pad_token_id
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # rougeLSum expects newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    return metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

In [None]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer


# Hyperparameters for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    per_device_eval_batch_size=4,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    gradient_accumulation_steps=8,   # 4 samples x 8 steps = 32 samples per optimizer step per device
    learning_rate=3e-4,
    # Mixed precision needs a CUDA device; fall back to full precision on a
    # CPU-only runtime instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    eval_accumulation_steps=1,

)

In [None]:
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainer

# Collator that batches the tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Use the seq2seq trainer that pairs with `Seq2SeqTrainingArguments`; the
# plain `Trainer` does not act on the seq2seq-specific options.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["eval"],
    tokenizer=tokenizer,
    data_collator=data_collator
    # `compute_metrics` is intentionally not passed: it decodes token ids, so
    # it needs generated sequences rather than raw logits.
    # NOTE(review): enable it together with `predict_with_generate=True` in the training arguments.
)

In [None]:
# Run fine-tuning with the trainer built above; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss
10,14.1855
20,11.7785
30,9.6481
40,8.4576
50,6.4593
60,4.7614
70,3.8352
80,3.0354
90,2.3075
100,1.7021


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


TrainOutput(global_step=750, training_loss=1.6298946278889974, metrics={'train_runtime': 918.5813, 'train_samples_per_second': 26.127, 'train_steps_per_second': 0.816, 'total_flos': 1829209374720000.0, 'train_loss': 1.6298946278889974, 'epoch': 3.0})

In [None]:
# Evaluate on the eval split given to the trainer. No `compute_metrics` is
# configured, so the result holds only the loss and runtime statistics.
trainer.evaluate()

{'eval_loss': 0.7193598747253418,
 'eval_runtime': 25.3575,
 'eval_samples_per_second': 78.872,
 'eval_steps_per_second': 19.718}

In [None]:
# Write the model first, then the tokenizer, to their Drive folders. The
# tokenizer call stays last so its list of written files is the cell output.
model.save_pretrained("/content/drive/My Drive/bart10k_basic_model")
tokenizer.save_pretrained("/content/drive/My Drive/bart10k_basic_tokenizer")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('/content/drive/My Drive/bart10k_basic_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/bart10k_basic_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/bart10k_basic_tokenizer/vocab.json',
 '/content/drive/My Drive/bart10k_basic_tokenizer/merges.txt',
 '/content/drive/My Drive/bart10k_basic_tokenizer/added_tokens.json',
 '/content/drive/My Drive/bart10k_basic_tokenizer/tokenizer.json')

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the fine-tuned BART model and its tokenizer from Drive.
# NOTE(review): this reloads the "bart5000" checkpoint, not the "bart10k" one
# written by the save cell above — confirm which checkpoint is intended.
model = AutoModelForSeq2SeqLM.from_pretrained("/content/drive/My Drive/bart5000_basic_model", local_files_only=True)
tokenizer = AutoTokenizer.from_pretrained("/content/drive/My Drive/bart5000_basic_tokenizer")

# Sampling is stochastic; seed the RNG so a fresh "Run All" reproduces the
# same generated text.
torch.manual_seed(42)

# Generation settings shared by the sampling cells below.
generation_params = {
    "max_length": 200,
    # A value of 1 would forbid every token from appearing twice, which rules
    # out ordinary repeated words, numbers and punctuation in a recipe. 3 only
    # blocks repeated trigrams and matches the value in the model's own
    # generation settings reported during training.
    "no_repeat_ngram_size": 3,
    "do_sample": True,
    "top_k": 35,
    "top_p": 0.95,
    "temperature": 0.72,
    "num_return_sequences": 1,
    "repetition_penalty": 1.4,
}

# Encode input context
input_text = "How to make cookie?"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate output using the model and custom parameters
outputs = model.generate(
    **inputs,
    **generation_params
)

# Decode and print the first generated sequence
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Show every returned sequence (relevant when num_return_sequences > 1)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


You can make Cookie with 1/2 c. sugar, 2 sticks margarine (room temperature), 3 eggs slightly beaten until light and fluffy but not too thick to roll into a ball or punch bowl; pinch of salt if necessary). Here's the instruction: Mix all ingredients together in large saucepan over medium-high heat stirring constantly for about 5 minutes on low speed till mixture is just moistened when dropped by spoonfuls onto ungreased cookie sheet. Bake at 350° uncovered 30 seconds without removing from pan before cutting out cookies completely.. Let cool while you prepare your frosting! Makes 4 servings per serving - One recipe yields 6 cups Frosted Creme Fra Diavolo chocolate cake mix plus 8 tablespoons extra virgin olive oil as needed FOR THE FILLING : Preheat oven 425 degreesF Grease 9x13 inch baking dish Sift flour & melted butter add egg yolks then beat well after each addition Add vanilla wafer crumbs spread evenly around top
["You can make Cookie with 1/2 c. sugar, 2 sticks margarine (room te

In [None]:
# Prompt: ask for ideas given a couple of ingredients.
input_text = "I have some salmon and eggs, give me some cooking idea"
inputs = tokenizer(input_text, return_tensors="pt")

# Sample a completion with the shared `generation_params`.
outputs = model.generate(**inputs, **generation_params)

# Convert the first returned sequence back to text and show it.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

You can make Salmon Casserole with 6 salmon fillets, 1/2 c. mayonnaise (I use Miracle Whip), 2 eggs or as needed to moisten the bottom of a 9 x 13-inch baking dish and bake at 350° for 30 minutes until fish flakes easily when dropped into hot water; cool slightly before cutting in half lengthwise from each end so that it doesn't fall apart too much while still warm but not mushy like this is what I usually do after cooking on low heat! Here's my instruction: Preheat broiler over medium coals till lightly browned then drain off any excess fat Add all remaining ingredients except bread crumbs & mix well Pour mixture evenly onto baked crusts Bake about 20 min til cheese melts completely Turn oven upside down turning once more Sprinkle egg yolks if desired Serve immediately Serves 4 Calories 433 Total Fat 8g Cholesterol 0mg Sodium 7 g Carbohydrates 10 mg Vitamin A 5% Calcium


In [None]:
# Prompt: ask which dishes use the listed ingredients.
input_text = "What dishes can we make with cucumber , egg"
inputs = tokenizer(input_text, return_tensors="pt")

# Generate without the custom sampling settings; only the length cap is set here.
outputs = model.generate(**inputs, max_length=512)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Print all returned sequences as a list of decoded strings.
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))


['You can make Egg Salad.  Here is the recipe : Mix all ingredients together in a large bowl.  Place in a greased casserole dish.  Bake at 350° for 30 minutes. ']
