In [None]:
import pandas as pd
import ast
from snowflake.snowpark import Session
from snowflake.snowpark.context import get_active_session

# Add filter column in raw table

In [None]:
# Fully qualified name of the raw recipes table read by this notebook.
RAW_RECIPES_TABLE = "NUTRIRAG_PROJECT.RAW.RAW_RECIPES_110K"

# Reuse the session provided by the notebook runtime instead of opening a new one.
session = get_active_session()

# Snowpark handle on the raw table, then a full copy pulled into pandas.
df = session.table(RAW_RECIPES_TABLE)
recipes = df.to_pandas()
recipes

In [None]:
-- select * from NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K where id=368257;
-- select * from NUTRIRAG_PROJECT.RAW.RAW_RECIPES_110K where id=368257;
-- select * from NUTRIRAG_PROJECT.DEV_SAMPLE.RECIPES_SAMPLE_50K where id=368257;


# Cleaning

In [None]:
# recipes.dropna(inplace=True)
# recipes

In [None]:
def _parse_nutrition(value):
    """Return the nutrition entry as a Python object, parsing string literals.

    Lists pass through untouched and non-blank strings are parsed with
    ``ast.literal_eval``. Anything else (None, NaN, blank text) becomes an
    empty list, so the length check below rejects the row instead of the
    parser raising on it.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value.strip():
        return ast.literal_eval(value)
    return []


def _safe_len(value):
    """Return ``len(value)``, or 0 for values that have no length (None, NaN)."""
    try:
        return len(value)
    except TypeError:
        return 0


# Normalise NUTRITION so every row holds a parsed value before it is measured.
recipes["NUTRITION"] = recipes["NUTRITION"].apply(_parse_nutrition)

# Every mask is evaluated over ALL rows before the `&` combination, so a
# `.notna()` mask cannot protect a later `len()` call from null cells; the
# length checks therefore go through `_safe_len`, which treats nulls as empty.
# NOTE(review): if TAGS / STEPS / INGREDIENTS arrive from to_pandas() as JSON
# text rather than Python lists, len() counts characters and "[]" would pass
# these checks -- confirm the actual dtype of those columns.
clean_data = recipes[
    (recipes["NAME"].notna()) &
    (recipes["NAME"].apply(_safe_len) > 0) &
    (recipes["MINUTES"] > 5) &                      # strictly more than 5 minutes
    (recipes["ID"].notna()) &
    (recipes["SUBMITTED"].notna()) &
    (recipes["TAGS"].apply(_safe_len) > 0) &
    (recipes["NUTRITION"].apply(_safe_len) == 7) &  # exactly seven nutrition values
    (recipes["DESCRIPTION"].notna()) &
    (recipes["STEPS"].apply(_safe_len) > 0) &
    (recipes["INGREDIENTS"].apply(_safe_len) > 0)
]
clean_data

In [None]:
# Draw a reproducible random sample of the cleaned recipes (fixed seed).
# DataFrame.sample raises ValueError when n exceeds the number of rows, so the
# request is capped at the rows that actually survived cleaning.
clean_data = clean_data.sample(n=min(50000, len(clean_data)), random_state=42)
clean_data

In [None]:
# Lookup from a recipe tag to the dietary filter label it contributes.
# Several tags may share one label. Insertion order matters: extract_filters
# walks this dict in order, so it fixes the order of the labels it returns.
tag_to_filter = {
    # --- vegan / vegetarian ---
    "vegan": "vegan",
    "vegetarian": "vegetarian",
    "veggie": "vegetarian",
    "veggie-burgers": "vegetarian",
    "no meat": "vegetarian",
    "meatless": "vegetarian",
    # --- kosher ---
    "kosher": "kosher",
    "jewish-ashkenazi": "kosher",
    "jewish": "kosher",
    "hanukkah": "kosher",
    # --- "free-from" and "low-in" labels ---
    "egg-free": "egg_free",
    "dairy-free": "dairy_free",
    "salt-free": "salt_free",
    "flour-less": "flour_less",
    "flourless": "flour_less",
    "no flour": "flour_less",
    "grain-free": "grain_free",
    "sugar-free": "sugar_free",
    "sugarless": "sugar_free",
    "carb-free": "carb_free",
    "low-carb": "low_carb",
    "very-low-carbs": "low_carb",
    "low-cholesterol": "low_cholesterol",
    "low-protein": "low_protein",
    "low-calorie": "low_calorie",
    "low-calories": "low_calorie",
    "low-saturated-fat": "low_saturated_fat",
    "gluten-free": "gluten_free",
    "fat-free": "fat_free",
    "no-shell-fish": "no_shell_fish",
    "diabetic": "diabetic",
    "low-sodium": "low_sodium",
    "nut-free": "nut_free",
    "low-fat": "low_fat",
    # --- halal ---
    "ramadan": "halal",
    # --- amish ---
    "amish-mennonite": "amish",
    # --- beverages ---
    "non-alcoholic": "non_alcoholic",
    # Mappings kept disabled on purpose:
    #   "kid-friendly" -> "kid_friendly"
    #   "toddler-friendly" -> "kid_friendly"
    #   "dehydrator" -> "dehydrator"
    #   "vegetables" -> "vegetable_based"
    #   "main-dish-seafood" -> "seafood"
    #   "hunan" -> "spicy"
}

def extract_filters(tags, mapping=None):
    """Return the filter labels implied by a recipe's tags.

    Args:
        tags: Container of tags supporting ``in`` (a list of tag strings is
            the expected case). Falsy input (None, empty list, empty string)
            yields no labels.
        mapping: Optional tag -> label dict. Defaults to the module-level
            ``tag_to_filter`` table.

    Returns:
        A list of distinct labels, ordered by the first matching tag in
        ``mapping``. Always a list, so the FILTERS column holds one type.
    """
    if not tags:
        return []

    if mapping is None:
        mapping = tag_to_filter

    filters = []
    for tag, label in mapping.items():
        # Several tags share one label (e.g. "veggie" and "vegetarian"),
        # so skip labels that were already collected.
        if tag.lower() in tags and label not in filters:
            filters.append(label)

    return filters



In [None]:
# Derive one filter list per recipe from its tags, then attach it as FILTERS.
filters_per_recipe = clean_data["TAGS"].apply(extract_filters)
clean_data = clean_data.assign(FILTERS=filters_per_recipe)
clean_data

#### Load clean_data into the CLEANED schema

In [None]:
-- (Re)creates the CLEANED copy of the recipe sample with an explicit schema.
-- CREATE OR REPLACE replaces any existing table of the same name.
-- The write_pandas cell that follows loads the clean_data DataFrame into it,
-- so these columns are expected to line up with that DataFrame's columns.
CREATE OR REPLACE TABLE NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K (
    NAME                    VARCHAR(16777216),
    ID                      NUMBER(38,0),
    MINUTES                 NUMBER(38,0),
    CONTRIBUTOR_ID          NUMBER(38,0),
    SUBMITTED               DATE,
    TAGS                    ARRAY,
    NUTRITION               ARRAY,
    N_STEPS                 NUMBER(38,0),
    STEPS                   ARRAY,
    DESCRIPTION             VARCHAR(16777216),
    INGREDIENTS             ARRAY,
    N_INGREDIENTS           NUMBER(38,0),
    HAS_IMAGE               NUMBER(38,0),
    IMAGE_URL               VARCHAR(16777216),
    INGREDIENTS_RAW_STR     ARRAY,
    SERVING_SIZE            NUMBER(38,0),
    SERVINGS                NUMBER(38,0),
    SEARCH_TERMS            ARRAY,
    -- Column derived in this notebook from TAGS via extract_filters.
    FILTERS                 ARRAY
);


In [None]:
# sample() keeps the original row labels; renumber them 0..n-1 before upload.
clean_data = clean_data.reset_index(drop=True)

# Destination of the upload: NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K.
cleaned_target = {
    "database": "NUTRIRAG_PROJECT",
    "schema": "CLEANED",
    "table_name": "RECIPES_SAMPLE_50K",
}
session.write_pandas(clean_data, overwrite=True, **cleaned_target)


#### Load clean_data into the DEV_SAMPLE schema

In [None]:
# Small reproducible subset for development work, drawn from the cleaned data.
# It is sampled from clean_data because dev_data does not exist before this
# cell runs; the size is capped so a short clean_data cannot raise ValueError.
# NOTE(review): the DEV_SAMPLE upload cell further down writes clean_data, not
# dev_data -- confirm which frame is meant to land in DEV_SAMPLE.
dev_data = clean_data.sample(n=min(1000, len(clean_data)), random_state=42)
dev_data

In [None]:
-- (Re)creates the DEV_SAMPLE copy of the recipe table; the column list is the
-- same as the CLEANED.RECIPES_SAMPLE_50K definition earlier in this notebook.
-- CREATE OR REPLACE replaces any existing table of the same name.
-- The write_pandas cell that follows loads a DataFrame into this table.
CREATE OR REPLACE TABLE NUTRIRAG_PROJECT.DEV_SAMPLE.RECIPES_SAMPLE_50K (
    NAME                    VARCHAR(16777216),
    ID                      NUMBER(38,0),
    MINUTES                 NUMBER(38,0),
    CONTRIBUTOR_ID          NUMBER(38,0),
    SUBMITTED               DATE,
    TAGS                    ARRAY,
    NUTRITION               ARRAY,
    N_STEPS                 NUMBER(38,0),
    STEPS                   ARRAY,
    DESCRIPTION             VARCHAR(16777216),
    INGREDIENTS             ARRAY,
    N_INGREDIENTS           NUMBER(38,0),
    HAS_IMAGE               NUMBER(38,0),
    IMAGE_URL               VARCHAR(16777216),
    INGREDIENTS_RAW_STR     ARRAY,
    SERVING_SIZE            NUMBER(38,0),
    SERVINGS                NUMBER(38,0),
    SEARCH_TERMS            ARRAY,
    -- Column derived in this notebook from TAGS via extract_filters.
    FILTERS                 ARRAY
);


In [None]:
# Renumber the rows 0..n-1 before upload.
clean_data = clean_data.reset_index(drop=True)

# Destination of the upload: NUTRIRAG_PROJECT.DEV_SAMPLE.RECIPES_SAMPLE_50K.
# NOTE(review): this uploads the full clean_data frame; the preceding cell
# prepares a 1000-row dev_data sample that is never written -- confirm which
# frame DEV_SAMPLE is supposed to contain.
dev_target = {
    "database": "NUTRIRAG_PROJECT",
    "schema": "DEV_SAMPLE",
    "table_name": "RECIPES_SAMPLE_50K",
}
session.write_pandas(clean_data, overwrite=True, **dev_target)

## Split the nutrition array into separate columns

In [None]:
-- Adds one FLOAT column per nutrition value to the _STR table; the UPDATE
-- cell that follows fills them from NUTRITION[0] .. NUTRITION[6].
-- NOTE(review): in this notebook RECIPES_SAMPLE_50K_STR is built by a
-- CREATE OR REPLACE ... AS SELECT in the later "Array to string" section.
-- Run top to bottom, this ALTER executes before that statement, and the later
-- CREATE OR REPLACE would rebuild the table without these columns.
-- Confirm the intended cell order.
ALTER TABLE NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K_STR
ADD COLUMN
    CALORIES        FLOAT,
    TOTAL_FAT       FLOAT,
    SUGAR           FLOAT,
    SODIUM          FLOAT,
    PROTEIN         FLOAT,
    SATURATED_FAT   FLOAT,
    CARBS           FLOAT;

In [None]:
-- Copies each element of the NUTRITION array into its own FLOAT column,
-- for every row (there is no WHERE clause). Index-to-column mapping:
--   0 CALORIES, 1 TOTAL_FAT, 2 SUGAR, 3 SODIUM, 4 PROTEIN, 5 SATURATED_FAT, 6 CARBS
-- NOTE(review): the CREATE TABLE ... AS SELECT for RECIPES_SAMPLE_50K_STR in
-- this notebook exposes nutrition_text (a ' | '-joined string), not a
-- NUTRITION array. Confirm that the target table really has a NUTRITION
-- column, or whether CLEANED.RECIPES_SAMPLE_50K was the intended target.
UPDATE NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K_STR
SET
    CALORIES        = NUTRITION[0]::FLOAT,
    TOTAL_FAT       = NUTRITION[1]::FLOAT,
    SUGAR           = NUTRITION[2]::FLOAT,
    SODIUM          = NUTRITION[3]::FLOAT,
    PROTEIN         = NUTRITION[4]::FLOAT,
    SATURATED_FAT   = NUTRITION[5]::FLOAT,
    CARBS           = NUTRITION[6]::FLOAT;

## Array to string

In [None]:
-- Builds a text-only companion of CLEANED.RECIPES_SAMPLE_50K: each ARRAY
-- column is joined into a single ' | '-delimited string (*_TEXT columns),
-- and the scalar columns are carried over unchanged.
CREATE OR REPLACE TABLE NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K_STR AS
SELECT
    src.ID,
    src.NAME,
    src.MINUTES,
    src.CONTRIBUTOR_ID,
    src.SUBMITTED,
    ARRAY_TO_STRING(src.TAGS, ' | ')        AS TAGS_TEXT,
    ARRAY_TO_STRING(src.INGREDIENTS, ' | ') AS INGREDIENTS_TEXT,
    ARRAY_TO_STRING(src.NUTRITION, ' | ')   AS NUTRITION_TEXT,
    ARRAY_TO_STRING(src.STEPS, ' | ')       AS STEPS_TEXT,
    src.DESCRIPTION,
    src.N_INGREDIENTS,
    src.N_STEPS
FROM NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K AS src;


## STR to ARRAY

In [None]:
-- CREATE OR REPLACE TABLE NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K_STR AS
-- SELECT
--     id,
--     name,
--     minutes,
--     contributor_id,
--     submitted,
--     -- Reverse transformation: string to array
--     SPLIT(tags_text, ' | ')        AS tags,
--     SPLIT(ingredients_text, ' | ') AS ingredients,
--     SPLIT(nutrition_text, ' | ')   AS nutrition,
--     SPLIT(steps_text, ' | ')       AS steps,
--     description,
--     n_ingredients,
--     n_steps
-- FROM NUTRIRAG_PROJECT.CLEANED.RECIPES_SAMPLE_50K_STR;