In [1]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so that
# files stored on Drive can be read and written through ordinary paths.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the raw recipe table from the mounted Drive. The path is absolute so
# that it matches the mount point ('/content/gdrive') and does not depend on
# the notebook's current working directory.
data = pd.read_csv("/content/gdrive/My Drive/dev_data.csv")

In [4]:
# Show (rows, columns) of the freshly loaded table as a sanity check.
data.shape

(123114, 4)

In [5]:
# Preview the first rows to see the available columns and how the
# list-valued fields are stored.
data.head()

Unnamed: 0,title,ingredients,directions,NER
0,Haddock In Milk,['500 ml skim milk (enough to half fill a sauc...,"[""Pour you milk into a saucepan and leave to b...","['haddock', 'red onion', 'pepper']"
1,Cream Of Paresian Soup,"['1 pkg. frozen California medley vegetables',...","[""Mix all together and heat through."", ""Put in...","['frozen california medley vegetables', 'cream..."
2,Cinnamon Rice Pudding,"['3 1/2 c. (2%) low-fat milk (divided)', '1/2 ...","[""Combine 3 cups milk, uncooked rice and next ...","['golden raisins', 'egg', 'sugar', 'vanilla ex..."
3,Meat Loaf,"['1 c. bread crumbs', '1 c. sweet milk', '1 1/...","[""Soak bread crumbs in the milk; add salt, pep...","['sweet milk', 'ground pork', 'pepper', 'chili..."
4,Sour Cream Latkes,"['1 c. flour or whole wheat', '3/4 tsp. soda',...","[""Whip for mixing, no spoon. Put 2 tablespoons...","['sour cream', 'soda', 'flour', 'milk', 'salt']"


In [6]:
from numpy.random import default_rng
# Keep only the first 2000 recipes. `.copy()` makes the subset an independent
# frame, so later modifications do not act on a slice of the full dataset
# (which is what triggers pandas' SettingWithCopyWarning).
data = data.iloc[:2000].copy()

In [7]:
# Confirm the row count after restricting the table to the first 2000 rows.
data.shape

(2000, 4)

In [8]:
import ast


def _parse_list_literal(value):
    """Parse the string form of a list into a real list.

    Values that are already lists are returned unchanged, so re-running this
    cell does not fail. Anything else is handed to ``ast.literal_eval``, which
    raises ``ValueError``/``SyntaxError`` on input it cannot parse.
    """
    if isinstance(value, list):
        return value
    return ast.literal_eval(value)


# Convert the string representation of each list column into actual lists.
# `assign` builds a new frame instead of writing columns into `data` in place,
# which avoids chained-assignment warnings when `data` is a slice.
data = data.assign(
    directions=data['directions'].apply(_parse_list_literal),
    NER=data['NER'].apply(_parse_list_literal),
    ingredients=data['ingredients'].apply(_parse_list_literal),
)

In [9]:
# Collect every distinct ingredient name that appears in any recipe's NER list.
ingredients_vocab = {
    name
    for ner_entries in data['NER']
    for name in ner_entries
}

In [10]:
# Pull the four recipe columns out of the frame as plain, parallel Python lists.
ingredients_list, directions_list, title_list, ner_list = (
    data[column].tolist()
    for column in ('ingredients', 'directions', 'title', 'NER')
)

In [11]:
def preprocess_dataset(ingredients_list, directions_list, title_list, ner_list):
    """Build question/answer text pairs from parallel recipe fields.

    The four arguments are iterated together with ``zip``, so iteration stops
    at the shortest one. Every recipe contributes two records that share the
    same answer: one whose question lists the recipe's NER entries and one
    whose question asks how to make the recipe by title.

    Args:
        ingredients_list: Per recipe, an iterable of ingredient strings.
        directions_list: Per recipe, an iterable of instruction strings.
        title_list: Per recipe, the recipe title.
        ner_list: Per recipe, an iterable of ingredient-name strings.

    Returns:
        A list of dicts with keys ``"input_text"`` and ``"target_text"``,
        two consecutive entries per recipe, in input order.
    """
    qa_records = []
    recipes = zip(ingredients_list, directions_list, title_list, ner_list)

    for ingredients, directions, title, ner in recipes:
        # Flatten each list field into one readable string.
        ingredients_text = ', '.join(ingredients)
        directions_text = ' '.join(directions)
        ner_text = ', '.join(ner)

        # Both questions for a recipe are paired with the same answer.
        answer = f"You can make {title} with {ingredients_text}. Here's the instruction: {directions_text}"
        questions = (
            f"I have {ner_text}. Give me some cooking ideas.",
            f"How to make {title}?",
        )
        qa_records.extend(
            {"input_text": question, "target_text": answer}
            for question in questions
        )

    return qa_records

In [12]:
# Turn the generated question/answer records into a two-column DataFrame.
df = pd.DataFrame(
    preprocess_dataset(
        ingredients_list=ingredients_list,
        directions_list=directions_list,
        title_list=title_list,
        ner_list=ner_list,
    )
)

In [13]:
# Preview the first generated question/answer pairs.
df.head()

Unnamed: 0,input_text,target_text
0,"I have haddock, red onion, pepper. Give me som...",You can make Haddock In Milk with 500 ml skim ...
1,How to make Haddock In Milk?,You can make Haddock In Milk with 500 ml skim ...
2,"I have frozen california medley vegetables, cr...",You can make Cream Of Paresian Soup with 1 pkg...
3,How to make Cream Of Paresian Soup?,You can make Cream Of Paresian Soup with 1 pkg...
4,"I have golden raisins, egg, sugar, vanilla ext...",You can make Cinnamon Rice Pudding with 3 1/2 ...


In [14]:
# Check the size of the generated dataset (two rows are produced per recipe).
df.shape

(4000, 2)

In [15]:
# Persist the question/answer pairs to Drive without the index column. The
# path is absolute so that it matches the mount point ('/content/gdrive')
# regardless of the current working directory.
# NOTE(review): the filename says "10000", but the frame holds two rows for
# each of the 2000 selected recipes (4000 rows) — confirm the intended name.
df.to_csv('/content/gdrive/My Drive/recipe_dataset_10000.csv', index=False)