# Здесь мы препроцессим данные для обучения EASE

#### Грузим датафрейм с рецептами

In [None]:
import pandas as pd

# Load recipes_normalized.csv (path relative to the working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer="recipes_normalized.csv")

In [68]:
import ast

# Collect the distinct ingredient names used across all recipes.
# Each cell of "ingredients_normalized" is parsed with ast.literal_eval and
# only the keys of the parsed mapping are kept.
vocabulary = set()
for raw_ingredients in data["ingredients_normalized"]:
    vocabulary.update(ast.literal_eval(raw_ingredients).keys())

# Sort the names so that every ingredient gets a stable position, then map
# each name to that position.
all_ingredients = sorted(vocabulary)
ingredient_to_idx = dict(zip(all_ingredients, range(len(all_ingredients))))

print(f"Ингредиентов: {len(all_ingredients)}")
print(f"Рецептов: {len(data)}")

Ингредиентов: 979
Рецептов: 146581


In [69]:
# Ingredient names excluded from the item vocabulary: any ingredient listed
# here gets no item id, so its interaction rows are dropped later on.
# NOTE(review): presumably these are staples too common to be informative — confirm.
stop_words_drop = [
    "Соль",  # salt
    "Сахар-песок",  # granulated sugar
    "Перец черный молотый",  # ground black pepper
    "Мука пшеничная",  # wheat flour
    "Сода",  # baking soda
    "Сода гашеная уксусом",  # baking soda slaked with vinegar
]

In [70]:
from tqdm import tqdm

# Flatten the recipes into (recipe, ingredient) pairs: one pair per ingredient
# of every recipe.
interactions = []
for row_label, recipe_row in tqdm(data.iterrows(), total=len(data)):
    # The column holds a mapping literal as text; only its keys are used.
    parsed = ast.literal_eval(recipe_row["ingredients_normalized"])
    # Identify the recipe by its "url" field, falling back to the row label
    # when the row has no such field.
    recipe_key = recipe_row.get("url", row_label)
    interactions.extend((recipe_key, name) for name in parsed.keys())

interactions_df = pd.DataFrame(
    interactions,
    columns=["recipe_id", "ingredient_id"],
)
print(f"Interactions {len(interactions_df)}")

unique_recipes = interactions_df["recipe_id"].unique()
all_unique_ingredients = interactions_df["ingredient_id"].unique()

# Keep every ingredient except the ones listed in stop_words_drop.
unique_ingredients = []
for name in all_unique_ingredients:
    if name in stop_words_drop:
        continue
    unique_ingredients.append(name)

# Dense integer ids, assigned in order of first appearance.
recipe2id = dict(zip(unique_recipes, range(len(unique_recipes))))
item2id = dict(zip(unique_ingredients, range(len(unique_ingredients))))

# Reverse lookups: integer id -> original recipe key / ingredient name.
id2recipe = {number: key for key, number in recipe2id.items()}
id2item = {number: name for name, number in item2id.items()}

interactions_df["user_id"] = interactions_df["recipe_id"].map(recipe2id)
# Ingredients without an entry in item2id (the excluded ones) map to NaN ...
interactions_df["item_id"] = interactions_df["ingredient_id"].map(item2id)
# ... so their rows are removed here, after which the ids can be cast to int.
interactions_df.dropna(subset=["item_id"], inplace=True)
interactions_df["item_id"] = interactions_df["item_id"].astype(int)


100%|██████████| 146581/146581 [00:08<00:00, 16625.23it/s]


Interactions 1278324


In [None]:
# Rename the integer ingredient-id column "item_id" to "all_interactions",
# modifying interactions_df in place.
# NOTE(review): presumably "all_interactions" is the column name the EASE
# training code expects — confirm against the consumer of interactions_df.csv.
interactions_df.rename(columns={"item_id": "all_interactions"}, inplace=True)

#### Сохраняем interactions_df для обучения EASE


In [None]:
# Write interactions_df to interactions_df.csv in the working directory.
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first, unnamed column.
interactions_df.to_csv(path_or_buf="interactions_df.csv")
