In [16]:
import pandas as pd
import numpy as np
from preprocessing import load_data, filter_tags
import json
import random

In [17]:
def list_to_string(lst):
    """Join the string items of ``lst`` into one comma-separated string.

    Args:
        lst: Iterable of strings.

    Returns:
        The items concatenated with ", " between them; an empty iterable
        yields an empty string.
    """
    separator = ', '
    return separator.join(lst)

def create_numbered_steps(lst):
    """Render a sequence of steps as a numbered, newline-separated list.

    Args:
        lst: Iterable of step descriptions.

    Returns:
        A string with one line per step in the form "<n>. <step>", numbered
        from 1; an empty iterable yields an empty string.
    """
    numbered_lines = (
        f"{position}. {text}" for position, text in enumerate(lst, start=1)
    )
    return '\n'.join(numbered_lines)

In [18]:
# Load the recipe table with step preprocessing enabled, then run it through
# the "beverages" tag filter.
# NOTE(review): whether filter_tags keeps or drops rows carrying the listed
# tags is defined in preprocessing.py — confirm there.
df = filter_tags(
    load_data("filtered_recipes_ft.csv", preprocess_steps=True),
    ["beverages"],
)

In [19]:
# Sanity-check the filtered frame: row count, per-column non-null counts and
# dtypes (only `description` shows missing values in the output below).
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10452 entries, 0 to 10728
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            10452 non-null  object 
 1   id              10452 non-null  int64  
 2   minutes         10452 non-null  int64  
 3   contributor_id  10452 non-null  int64  
 4   submitted       10452 non-null  object 
 5   tags            10452 non-null  object 
 6   nutrition       10452 non-null  object 
 7   n_steps         10452 non-null  int64  
 8   steps           10452 non-null  object 
 9   description     10212 non-null  object 
 10  ingredients     10452 non-null  object 
 11  n_ingredients   10452 non-null  int64  
 12  recipe_id       10452 non-null  int64  
 13  num_ratings     10452 non-null  int64  
 14  average_rating  10452 non-null  float64
dtypes: float64(1), int64(7), object(7)
memory usage: 1.3+ MB


In [22]:
def shuffle_and_split(data, train_ratio=0.87, val_ratio=0.03, test_ratio=0.1, seed=None):
    """Shuffle ``data`` in place and split it into train/validation/test parts.

    Args:
        data: Mutable sequence (e.g. a list) of records. It is shuffled
            IN PLACE, so the caller's object is reordered as a side effect.
        train_ratio: Fraction of records assigned to the training split.
        val_ratio: Fraction of records assigned to the validation split.
        test_ratio: Fraction of records assigned to the test split.
        seed: Optional seed. When given, a dedicated ``random.Random(seed)``
            performs the shuffle so the split is reproducible and the global
            ``random`` state is left untouched. When ``None`` the global
            ``random`` state is used.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple of slices of ``data``.
        Train and validation sizes are truncated toward zero; the test split
        receives every remaining record, so no record is dropped.

    Raises:
        AssertionError: If the three ratios do not sum to 1 (within
            floating-point tolerance).
    """
    import math  # local import: keeps this cell independent of the import cell

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999, which an exact ``== 1.0`` check wrongly rejects.
    ratio_sum = train_ratio + val_ratio + test_ratio
    assert math.isclose(ratio_sum, 1.0, abs_tol=1e-9), "The ratios must sum to 1"

    # Shuffle the data (in place)
    if seed is None:
        random.shuffle(data)
    else:
        random.Random(seed).shuffle(data)

    # Calculate the split indices
    total_len = len(data)
    train_end = int(total_len * train_ratio)
    val_end = train_end + int(total_len * val_ratio)

    # Split the data; the test split takes the remainder
    train_data = data[:train_end]
    val_data = data[train_end:val_end]
    test_data = data[val_end:]

    return train_data, val_data, test_data

In [23]:
# Seed the global RNG so the shuffle inside shuffle_and_split — and therefore
# the membership of each split — is identical on every Restart & Run All.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# Build one instruction-tuning record per recipe: the ingredients become the
# prompt input and the numbered steps become the expected output.
# Zipping the two columns avoids the per-row Series overhead of iterrows().
json_list = [
    {
        "instruction": "Give me a recipe I can make with the following ingredients.",
        "input": list_to_string(ingredients),
        "output": create_numbered_steps(steps),
    }
    for ingredients, steps in zip(df['ingredients'], df['steps'])
]
json_train, json_val, json_test = shuffle_and_split(json_list)

# Save each split to its own JSON file
for path, records in (
    ('recipes_train.json', json_train),
    ('recipes_validation.json', json_val),
    ('recipes_test.json', json_test),
):
    # Explicit encoding so the files do not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as json_file:
        json.dump(records, json_file, indent=4)