## LLM

In [1]:
import sys
sys.path.insert(0, '../src/')

import matplotlib.pyplot as plt
from tokenizer import TokenizerBPE
from data_handling import normalize_to_ascii, clean_text
from utils import saver
import numpy as np
import re
import os
import random

import pickle as pkl
from tqdm.notebook import tqdm
import json
import kagglehub
import csv

# disable gpu for testing purposes
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"


## Recipe

In [2]:
path = kagglehub.dataset_download("paultimothymooney/recipenlg")



In [3]:
text_list = []

with open(os.path.join(path, 'RecipeNLG_dataset.csv'), newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    for i, row in tqdm(enumerate(reader)):
        name = row[1]
        ingred = row[2].replace('"', ""). replace("[", "").replace("]", "")
        dir = row[3].replace('"', ""). replace("[", "").replace("]", "")
        text = "Recipe for " + name + ": Ingredients: " + ingred + ". Directions: " + dir
        text = clean_text(normalize_to_ascii(text))
        text_list.append(text)
        if i > 200000:
            break

0it [00:00, ?it/s]

KeyboardInterrupt: 

In [18]:
saver("../corpus/recipe_raw", text_list)

In [4]:
def fused_article(corpus_list):
    sos = "<s>"
    eos = "</s>"

    rcw = re.compile(r"\s+")

    corpus_padded_list = []
    for line in corpus_list:
        line = line.replace("\n", " ").replace("\r", " ")
        line = rcw.sub(" ", line).strip()
        line = [sos, normalize_to_ascii(line).lower(), eos]
        corpus_padded_list.extend(line)
    
    return "".join(corpus_padded_list)

## QA recipe

In [None]:
def fused_qa(question_list, answer_list):
    q ="<q>"
    a = "<a>"
    sos = "<s>"
    eos = "</s>"
    rcw = re.compile(r"\s+")

    corpus_list = []
    for question, answer in tqdm(list(zip(question_list, answer_list))):
        question = question.replace("\n", " ").replace("\r", " ")
        question = rcw.sub(" ", question).strip()
        answer = answer.replace("\n", " ").replace("\r", " ")
        answer = rcw.sub(" ", answer).strip()
        qa = [sos, q, clean_text(normalize_to_ascii(question)), a, clean_text(normalize_to_ascii(answer)), eos]
        corpus_list.extend(qa)
        
    return "".join(corpus_list).lower()

In [20]:
question_list = []
answer_list = []

question_type = ["Make me a recipe for",
                 "What is a recipe for",
                 "How do I make a recipe for",
                 "How can I make a recipe for",
                 "Can you give me a recipe for",
                 "Can you make me a recipe for",
                 "Can you tell me a recipe for",]

with open(os.path.join(path, 'RecipeNLG_dataset.csv'), newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    for i, row in tqdm(enumerate(reader)):
        if i > 200000:
            name = row[1]
            ingred = row[2].replace('"', "").replace("[", "").replace("]", "")
            dir = row[3].replace('"', ""). replace("[", "").replace("]", "")

            qt = question_type[i % len(question_type)]

            question = qt + " " + name + "?"
            question = clean_text(normalize_to_ascii(question))
            question_list.append(question)

            answer = "Ingredients: " + ingred + ". Directions: " + dir
            answer = clean_text(normalize_to_ascii(answer))
            answer_list.append(answer)

        if i > 400000:
            break

0it [00:00, ?it/s]

In [21]:
saver("../corpus/recipe_qa_raw", [question_list, answer_list])

## QA recipe with ingred

In [20]:
question_list = []
answer_list = []

question_type = ["Make me a recipe for",
                 "What is a recipe for",
                 "How do I make a recipe for",
                 "How can I make a recipe for",
                 "Can you give me a recipe for",
                 "Can you make me a recipe for",
                 "Can you tell me a recipe for",]

with open(os.path.join(path, 'RecipeNLG_dataset.csv'), newline='') as f:
    reader = csv.reader(f)
    header = next(reader)
    for i, row in tqdm(enumerate(reader)):
        if i > 400000:
            name = row[1]
            ingred_clean = row[2].replace('"', "").replace("[", "").replace("]", "")#.replace("\u00b0", " degrees")
            ingred_list = ingred_clean.split(",")
            random.shuffle(ingred_list)
            num_ingred = random.randint(1, 4)
            ingred_list = ingred_list[:num_ingred]
            if len(ingred_list) > 1:
                ingred_request = ", ".join(ingred_list[:-1]) + " and " + ingred_list[-1]
            else:
                ingred_request = ingred_list[0]

            dir = row[3].replace('"', ""). replace("[", "").replace("]", "")
            qt = question_type[i % len(question_type)]



            question = qt + " " + name + " with " + ingred_request + "?"
            question = clean_text(normalize_to_ascii(question))
            question_list.append(question)

            answer = "Ingredients: " + ingred_clean + ". Directions: " + dir
            answer = clean_text(normalize_to_ascii(answer))
            answer_list.append(answer)

        if i > 600000:
            break

0it [00:00, ?it/s]

In [27]:
saver("../corpus/recipe_qa_ingred_raw", [question_list, answer_list])

In [29]:
print(question_list[5])
print(answer_list[5])


can you make me a recipe for texas hash with 2 c. canned tomatoes, chopped and 2 green peppers?
ingredients: 2 large onions, chopped, 2 green peppers, chopped, 3 tbsp. shortening, 1 lb. hamburger, 1 tsp. salt, 2 c. canned tomatoes, 1/2 c. uncooked rice, 1/4 tsp. pepper. directions: cook onions and peppers in shortening until onions are clear. add hamburger and salt., cook until mixture falls apart., add tomatoes, rice, and other seasonings., arrange in large casserole. cover and bake at 375\u00b0 for 45 minutes or until done.


## 