In [2]:
import pandas as pd
import ast
import os

## Downloading data

In [4]:
# Identifiers of the remote source files. They are not referenced by any other
# cell visible in this notebook.
# NOTE(review): presumably Google Drive file IDs — TODO confirm.
raw_recipes = '1fxvf7ghbgH0xkvHkPFM_K8_JbeL9QX3L'
# NOTE(review): this ID is longer than the two others and contains the text
# "raw_recipes_df", which looks like the result of an accidental
# search-and-replace. Verify it against the real file ID before using it.
raw_interactions = '10zdNLf2oKiMY30Zacdwraw_recipes_dfAEpkrbyoUN'
cleaned_ingredients = '1HjT5RiZnxlg2PkcMLlqzxBjeeRGITYvx'

# Directory holding the local CSV files. Set the NUTRIRAG_DATASET_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's location, so existing behaviour is unchanged.
DATASET_DIR = os.environ.get(
    "NUTRIRAG_DATASET_DIR",
    "/Users/cameliamazouz/Documents/M2/projetML/NutriRAG/dataset",
)

raw_recipes_path = os.path.join(DATASET_DIR, "RAW_recipes.csv")
raw_interactions_path = os.path.join(DATASET_DIR, "RAW_INTERACTIONS.csv")
cleaned_ingredients_path = os.path.join(DATASET_DIR, "CLEANED_INGREDIENTS.csv")

In [1]:
# Identifiers of the two additional datasets. They are not referenced by any
# other cell visible in this notebook.
# NOTE(review): the "owner/name" shape suggests Kaggle dataset slugs — TODO confirm.
recipes_images = 'behnamrahdari/foodcom-enhanced-recipes-with-images'
recipes_w_search_terms = 'shuyangli94/foodcom-recipes-with-search-terms-and-tags'

# Directory holding the local CSV files. Set the NUTRIRAG_DATASET_DIR
# environment variable to run the notebook on another machine; the default is
# the original author's location, so existing behaviour is unchanged.
DATASET_DIR = os.environ.get(
    "NUTRIRAG_DATASET_DIR",
    "/Users/cameliamazouz/Documents/M2/projetML/NutriRAG/dataset",
)

recipes_images_path = os.path.join(DATASET_DIR, "recipes_enhanced_v2.csv")
recipes_w_search_terms_path = os.path.join(DATASET_DIR, "recipes_w_search_terms.csv")

# Cleaning & enhancing recipes

In [8]:
# Row cap for quick experiments; no cell visible in this notebook reads it.
nrows = 1000

In [12]:
# Load the raw recipes table in full (no row limit is applied here).
raw_recipes_df = pd.read_csv(filepath_or_buffer=raw_recipes_path)

In [13]:
# Columns taken from each auxiliary dataset; "id" is the key used for both merges.
IMG_COLUMNS = ["id", "has_image", "image_url"]
QUANTITY_COLUMNS = ["id", "ingredients_raw_str", "serving_size", "servings", "search_terms"]

# `usecols` avoids loading columns that are dropped right away. It returns the
# columns in file order, so the frame is re-indexed with the same list to keep
# the column order the rest of the notebook has always seen.
df_img = pd.read_csv(recipes_images_path, usecols=IMG_COLUMNS)[IMG_COLUMNS]
df_quantity = pd.read_csv(recipes_w_search_terms_path, usecols=QUANTITY_COLUMNS)[QUANTITY_COLUMNS]

# Drop the first three and last three characters of every serving size.
# NOTE(review): presumably these are wrapper characters around the value — TODO confirm.
# The `.str` accessor leaves missing cells as NaN, where a Python-level slice
# inside `apply` would raise a TypeError.
df_quantity["serving_size"] = df_quantity["serving_size"].str[3:-3]

# Turn the curly braces of the search terms into square brackets, so a
# "{...}" literal becomes "[...]". Literal (non-regex) replacement, NaN-safe.
df_quantity["search_terms"] = (
    df_quantity["search_terms"]
    .str.replace("{", "[", regex=False)
    .str.replace("}", "]", regex=False)
)

# Inner joins: only recipes present in all three tables are kept.
res = raw_recipes_df.merge(df_img, how="inner", on="id")
res = res.merge(df_quantity, how="inner", on="id")

In [11]:
# Dump the merged table to a CSV in the working directory, using the default
# to_csv options (the index is written as well).
res.to_csv(path_or_buf="mon_truc.csv")

In [None]:
# Every "tags" cell holds a stringified Python list: parse each one and pool
# all of their elements into a single flat list.
filters = [
    single_tag
    for raw_tags in res["tags"].tolist()
    for single_tag in ast.literal_eval(raw_tags)
]

# Distinct tags found across the whole table.
unique_filters = set(filters)
print(unique_filters)

In [None]:
# Maps a recipe tag to the canonical filter name it implies. Several tags can
# point to the same filter (e.g. "veggie" and "meatless" both give "vegetarian").
# NOTE(review): "no meat" and "no flour" contain a space while every other key
# is hyphenated — verify that these spellings really occur in the data.
tag_to_filter = {
    "vegan": "vegan",
    "vegetarian": "vegetarian",
    "veggie": "vegetarian",
    "veggie-burgers": "vegetarian",
    "no meat": "vegetarian",
    "meatless": "vegetarian",

    "kosher": "kosher",
    "jewish-ashkenazi": "kosher",
    "jewish": "kosher",
    "hanukkah": "kosher",

    "egg-free": "egg_free",
    "dairy-free": "dairy_free",
    "salt-free": "salt_free",
    "flour-less": "flour_less",
    "flourless": "flour_less",
    "no flour": "flour_less",
    "grain-free": "grain_free",
    "sugar-free": "sugar_free",
    "sugarless": "sugar_free",
    "carb-free": "carb_free",
    "low-carb": "low_carb",
    "low-cholesterol": "low_cholesterol",
    "low-protein": "low_protein",
    "low-calorie": "low_calorie",
    "low-calories": "low_calorie",
    "low-saturated-fat": "low_saturated_fat",
    "gluten-free": "gluten_free",
    "fat-free": "fat_free",
    "no-shell-fish": "no_shell_fish",
    "diabetic": "diabetic",
    "low-sodium": "low_sodium",
    "nut-free": "nut_free",
    "low-fat": "low_fat",
    "ramadan": "halal",
    "amish-mennonite": "amish",
    "non-alcoholic": "non_alcoholic",
}


def extract_filters(tags_str):
    """Return the canonical filters implied by a recipe's tags.

    Args:
        tags_str: The recipe's tags. With a string, a key of ``tag_to_filter``
            matches when it occurs anywhere in the text, so it also matches
            inside a longer tag (e.g. "jewish" inside "jewish-ashkenazi").
            Empty or missing values (``""``, ``None``, NaN) give no filter.

    Returns:
        A sorted list of distinct filter names (values of ``tag_to_filter``);
        empty when nothing matches.
    """
    # pandas represents a missing cell as a float NaN, which is truthy and does
    # not support `in`; treat any float like the other "no tags" cases.
    if not tags_str or isinstance(tags_str, float):
        return []

    # Collect the mapped filter name, not the matching tag: this is what lets
    # synonyms such as "veggie" and "vegetarian" collapse into one entry.
    matched = {
        filter_name
        for tag, filter_name in tag_to_filter.items()
        if tag in tags_str
    }
    # Sorted so the result is deterministic from one run to the next.
    return sorted(matched)

# Compute the filter list of every recipe from its "tags" cell.
res["filters"] = res["tags"].map(extract_filters)


In [None]:
# To be done directly in Snowflake
# raw_recipes_df["nutrition"] = raw_recipes_df["nutrition"].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else [])
# filtered_df = raw_recipes_df[
#     (raw_recipes_df["name"].notna()) &
#     (raw_recipes_df["minutes"] > 5) &
#     (raw_recipes_df["id"].notna()) &
#     (raw_recipes_df["submitted"].notna()) &
#     (raw_recipes_df["tags"].apply(lambda x: len(x) > 0)) &
#     (raw_recipes_df["nutrition"].apply(lambda x: len(x) == 7)) &
#     (raw_recipes_df["description"].notna()) &
#     (raw_recipes_df["steps"].apply(lambda x: len(x) > 0)) &
#     (raw_recipes_df["ingredients"].apply(lambda x:  len(x) > 0))
# ]
# filtered_df

In [None]:
# Keep only the recipes that matched at least one filter, then discard the
# helper column: it is only needed for this selection.
has_filter = res["filters"].map(len) > 0
res = res.loc[has_filter].drop(columns=["filters"])

#### Data quality

In [7]:
def safe_parse_list(list_str):
    """Parse a stringified Python list, falling back to an empty list.

    Args:
        list_str: Text expected to hold a Python list literal, such as
            "['a', 'b']". A value that already is a list is returned unchanged.

    Returns:
        The parsed list, or ``[]`` when the input cannot be parsed or does not
        evaluate to a list.
    """
    # Already parsed: literal_eval would reject it and the content would be lost.
    if isinstance(list_str, list):
        return list_str
    try:
        parsed = ast.literal_eval(list_str)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Every exception literal_eval is documented to raise on malformed
        # input; e.g. "{[1]: 2}" fails with TypeError, not ValueError.
        return []
    return parsed if isinstance(parsed, list) else []

def check_consistency(row):
    """Collect the data-quality problems found in one recipe row.

    Args:
        row: A mapping-like row exposing the keys 'steps', 'ingredients',
            'n_steps', 'n_ingredients', 'minutes' and 'name'.

    Returns:
        A list of human-readable error messages; empty when every check passes.
    """
    # 'steps' and 'ingredients' are stored as strings in the CSV, so they are
    # parsed before being counted.
    step_count = len(safe_parse_list(row['steps']))
    ingredient_count = len(safe_parse_list(row['ingredients']))

    problems = []

    # The declared step counter must match the number of parsed steps; the
    # dataset sometimes contains rows where they disagree.
    if step_count != row['n_steps']:
        problems.append(
            f"Incohérence Steps: n_steps={row['n_steps']} mais {step_count} étapes trouvées."
        )

    # Same check for the declared ingredient counter.
    if ingredient_count != row['n_ingredients']:
        problems.append(
            f"Incohérence Ingrédients: n_ingredients={row['n_ingredients']} mais {ingredient_count} ingrédients trouvés."
        )

    # Business rule: a preparation time cannot be negative.
    if row['minutes'] < 0:
        problems.append(f"Temps invalide: minutes négatives ({row['minutes']})")

    # Mandatory field: the recipe name must be present and non-blank.
    recipe_name = row['name']
    if pd.isna(recipe_name) or not str(recipe_name).strip():
        problems.append("Nom de recette manquant")

    return problems

print(f"Analyse de {len(res)} lignes de données...\n")

# Run the validator on every row and store its list of messages in a new column.
res['validation_errors'] = res.apply(check_consistency, axis=1)

# Keep only the rows for which at least one problem was reported.
has_errors = res['validation_errors'].map(len) > 0
invalid_rows = res[has_errors]

if not invalid_rows.empty:
    print(f"❌ {len(invalid_rows)} recettes présentent des problèmes de qualité :\n")

    # One report entry per invalid recipe: id, name, then each message.
    report_entries = zip(
        invalid_rows['id'],
        invalid_rows['name'],
        invalid_rows['validation_errors'],
    )
    for recette_id, nom, erreurs in report_entries:
        print(f"Recette ID {recette_id} ('{nom}'):")
        for err in erreurs:
            print(f"  - {err}")
        print("-" * 40)
else:
    print("✅ Aucune erreur de cohérence détectée !")

Analyse de 222705 lignes de données...

❌ 1 recettes présentent des problèmes de qualité :

Recette ID 368257 ('nan'):
  - Nom de recette manquant
----------------------------------------


#### Sample

In [None]:
# Number of leading rows exported.
SAMPLE_SIZE = 110000

# NOTE(review): `res` is exported as it stands. When the data-quality cell has
# run, it still contains the 'validation_errors' column and the rows that were
# flagged — confirm that this is what the target table expects.
df_sample = res.head(SAMPLE_SIZE)
df_sample.to_csv(path_or_buf="clean_recipes_to_snowflake.csv", sep=",", index=False)

# Clean Clusters

In [None]:
import pandas as pd

# Load the ingredient/cluster table; the file is semicolon-delimited.
df = pd.read_csv(filepath_or_buffer="../ingredients_with_clusters.csv", delimiter=";")

In [None]:
# The placeholder "X" is replaced by "0" so the whole column can be cast to float.
magnesium_fixes = {"X": "0"}
df['Magnesium_mg'] = df['Magnesium_mg'].replace(magnesium_fixes).astype(float)

In [None]:
# Cells holding two space-separated numbers are replaced by the first of the
# two, so the whole column can be cast to float.
vit_e_fixes = {
    "0.33 0.83": "0.33",
    "0.29 0.72": "0.29",
    '0.58 1.46': '0.58',
    '0.69 1.73': '0.69',
    '0.58 1.44': '0.58',
    '0.49 1.22': '0.49',
}
df['VitE_mg'] = df['VitE_mg'].replace(vit_e_fixes).astype(float)

In [None]:
# Two values written in scientific notation with a decimal comma are replaced
# by their plain decimal form, so the whole column can be cast to float.
vit_d2_fixes = {
    "5,00E-05": "0.00005",
    '4,00E-05': "0.00004",
}
df['VitD2_mcg'] = df['VitD2_mcg'].replace(vit_d2_fixes).astype(float)

In [None]:
# Write the cleaned table next to the input file, semicolon-delimited and
# without the index.
df.to_csv(path_or_buf="../ingredients_with_clusters_1.csv", sep=";", index=False)