In [49]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
data = pd.read_csv("/content/drive/MyDrive/recipe_dataset.csv")

In [56]:
data.head()

Unnamed: 0,question,answer
0,What dishes can we make with bite size shredde...,You can make No-Bake Nut Cookies . Here is th...
1,What dishes can we make with cream of mushroom...,You can make Jewell Ball'S Chicken . Here is ...
2,"What dishes can we make with frozen corn , pep...",You can make Creamy Corn . Here is the recipe...
3,"What dishes can we make with chicken gravy , c...",You can make Chicken Funny . Here is the reci...
4,What dishes can we make with graham cracker cr...,You can make Reeses Cups(Candy) . Here is t...


In [57]:
data.shape

(1000, 2)

In [17]:
#get all unique ingredients
ingredients_vocab = set(ingredient for sublist in data['NER'] for ingredient in sublist)

In [18]:
len(ingredients_vocab)

3969

In [19]:
input_query = "what dish I can make with tomato, potato, egg"

# Extract ingredients from the input query
input_ingredients = set(input_query.replace(',', '').split()[6:])


In [20]:
def find_dishes(input_ingredients, data):
    dishes_directions = []
    for index, row in data.iterrows():
        if all(any(ingredient_part in dish_ingredient for dish_ingredient in row['NER'] for ingredient_part in input_ingredient.split()) for input_ingredient in input_ingredients):
            dishes_directions.append((row['title'], row['ingredients'], row['directions']))
    return dishes_directions

In [21]:
# suggested_dishes = find_dishes(input_ingredients, data)
suggested_dishes = find_dishes(input_ingredients, data)[:50] # get the first 50 suggestions
print("You can make these dishes with the ingredients you have:", suggested_dishes, "")
for dish in suggested_dishes:
  print("Dish: ", dish[0])
  print("--Ingredients: ", dish[1])
  print("--Recipe: ", dish[2])

You can make these dishes with the ingredients you have: [('Picnic Potato Salad', '["1 1/2 cooked potatoes", "3 hard-boiled eggs", "3/4 tsp. salt", "1/2 tsp. celery seed", "1/2 c. diced celery", "2 Tbsp. prepared mustard", "1/4 c. bacon bits or shredded cheese", "1/2 c. sweet pickles/relish", "1/4 c. grated carrots", "3/4 c. salad dressing", "1/2 c. diced cucumbers", "1/2 c. chopped onion", "dash of pepper", "1 medium tomato, wedged"]', ['Coarsely dice potatoes and eggs with salt, pepper and celery seed.', 'Add diced cucumbers, onion, carrots, celery and pickles/relish.', 'Mix salad dressing/Miracle Whip with mustard and pour over mixture.', 'Toss until well coated, adding more salad dressing as needed.', 'Add bacon bits or shredded cheese and garnish with tomato wedges.']), ('Meatball Stew', '["2 c. cornflakes cereal", "1 egg", "1 (10 1/2 oz.) can beef broth", "1/2 tsp. salt", "1/8 tsp. pepper", "1 lb. ground beef", "2 Tbsp. vegetable oil", "1 (10 3/4 oz.) can tomato soup", "1/2 c. wa

In [22]:
def preprocess(strings_list):
    return [string.replace(", ", " , ").replace(".", " . ") for string in strings_list]

In [23]:
data['directions'] = data['directions'].apply(preprocess)

In [24]:
data['directions']

0       [In a heavy 2-quart saucepan , mix brown sugar...
1       [Place chipped beef on bottom of baking dish ....
2       [In a slow cooker , combine all ingredients . ...
3       [Boil and debone chicken . , Put bite size pie...
4       [Combine first four ingredients and press in 1...
                              ...                        
9995               [Combine all ingredients and chill . ]
9996    [Cut steaks into strips; brown in cooking oil ...
9997    [Stew and bone fryer . , Saute in small amount...
9998    [Mix together for 2 or 3 minutes .  Put into g...
9999    [Layer each ingredients , prepared according t...
Name: directions, Length: 10000, dtype: object

In [58]:
#the start of gpt-2 baseline
question = list(data['question'])
answer = list(data['answer'])

In [62]:
def preprocess_dataset(question, answer):
    # Initialize an empty list to store preprocessed data
    preprocessed_data = []

    # Iterate over each recipe in the dataset
    for question, answer in zip(question, answer):
        # Combine ingredients and directions into a single string with spaces between words/symbols
        recipe_text = f"<startofstring> {question} <bot>: {answer} <endofstring>"
        # Append the preprocessed recipe to the list
        preprocessed_data.append(recipe_text)

    return preprocessed_data

In [63]:
preprocessed_dataset = preprocess_dataset(question, answer)
preprocessed_dataset[0]

'<startofstring> What dishes can we make with bite size shredded rice biscuits , vanilla , brown sugar , nuts , milk , butter <bot>: You can make No-Bake Nut Cookies .  Here is the recipe : In a heavy 2-quart saucepan , mix brown sugar , nuts , evaporated milk and butter or margarine .  Stir over medium heat until mixture bubbles all over top .  Boil and stir 5 minutes more .  Take off heat .  Stir in vanilla and cereal; mix well .  Using 2 teaspoons , drop and shape into 30 clusters on wax paper .  Let stand until firm , about 30 minutes .  <endofstring>'

In [64]:
X = preprocessed_dataset

In [66]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True)

In [67]:
tokenizer.add_special_tokens({'pad_token': '<pad>',
                              'bos_token': '<startofstring>',
                              'eos_token': '<endofstring>'})
tokenizer.add_tokens(['<bot>:'])

1

In [68]:
X_encoded = tokenizer(X, max_length=40, truncation=True, padding="max_length", return_tensors="pt")
input_ids = X_encoded['input_ids']
attention_mask = X_encoded['attention_mask']


In [69]:
X_encoded

{'input_ids': tensor([[50258,  1867, 16759,  ...,   318,   262,  8364],
        [50258,  1867, 16759,  ...,  8474,   442,  3949],
        [50258,  1867, 16759,  ...,   257,  3105, 37171],
        ...,
        [50258,  1867, 16759,  ..., 12029, 14023, 19145],
        [50258,  1867, 16759,  ...,   220,  3423,   318],
        [50258,  1867, 16759,  ...,  1058, 15561,   474]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [70]:
from torch.optim import Adam
from torch.utils.data import DataLoader

In [71]:
X =  DataLoader(X, batch_size=64)

In [72]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50261, 768)

In [73]:
import tqdm, torch
def train(data, model, optim):
  epochs = 10
  for i in tqdm.tqdm(range(epochs)):
    #input id and attention_mask
    for j in range(len(data)):
      X = input_ids[j]
      a = attention_mask[j]
      optim.zero_grad()
      loss = model(X, attention_mask=a, labels=X).loss
      loss.backward()
      optim.step()
    torch.save(model.state_dict(), "model_state.pt")

In [93]:
def infer(input, model):
  input = f"<startofstring> {input} <bot>:"
  inp = tokenizer(input, return_tensors="pt")
  X = inp["input_ids"]
  a = inp["attention_mask"]
  output = model.generate(X, attention_mask=a, max_new_tokens=50)
  output = tokenizer.decode(output[0])
  return output


In [75]:
model.train()

optim = Adam(model.parameters(), lr=1e-3)

print("training .... ")
train(X, model, optim)


training .... 


100%|██████████| 10/10 [07:25<00:00, 44.52s/it]


In [94]:
print(infer("What dishes can we make with tomato , potato , egg", model))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


<startofstring> What dishes can we make with tomato, potato, egg  <bot>:  You can make Buckeye Candy.  Here is the recipe : Mix sugar, sugar, sugar, sugar, soda, sugar, soda, butter, butter, butter, butter, butter, butter, butter, butter, butter, butter, butter,


In [84]:
non_finetuned = GPT2LMHeadModel.from_pretrained('gpt2', return_dict=True)
inputs = tokenizer("What dishes can we make with tomato , potato , egg", return_tensors="pt")
outputs = non_finetuned(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
print(loss)

tensor(4.7520, grad_fn=<NllLossBackward0>)


In [97]:
inputs = tokenizer("What dishes can we make with tomato , potato , egg", return_tensors="pt")
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
print(loss)

tensor(3.7378, grad_fn=<NllLossBackward0>)
