In [None]:
import os
import json
import pickle
import numpy as np
import shutil
import json

In [None]:
def cosine_similarity(array1, array2):
    """Return the cosine of the angle between two vectors.

    Args:
        array1: First vector (anything ``np.dot`` and ``np.linalg.norm`` accept).
        array2: Second vector, compatible with ``array1`` for ``np.dot``.

    Returns:
        ``dot(array1, array2) / (norm(array1) * norm(array2))``, or ``0.0``
        when either vector has zero magnitude.
    """
    dot_product = np.dot(array1, array2)
    norm_product = np.linalg.norm(array1) * np.linalg.norm(array2)
    # A zero-magnitude vector has no direction; dividing would produce NaN
    # (and a RuntimeWarning), which silently poisons later comparisons.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

In [None]:
# Project settings object; the selection functions in this notebook read
# config.MIN_INGREDIENTS and the input file names from it.
from config import config

# Directory that holds the pickled / JSON input files.
data_path = "./data/"
# NOTE(review): the selection functions use config.MIN_INGREDIENTS rather than
# this module-level constant -- confirm which one is meant to be authoritative.
MIN_INGREDIENTS = 5

In [None]:
# Load the two pickled embedding objects from the data directory.
# NOTE: pickle.load can execute code embedded in the file; only load trusted data.
foodon_embeddings_path = os.path.join(data_path, "foodon_entities_embeddings.pkl")
with open(foodon_embeddings_path, "rb") as fh:
    foodon_embeddings = pickle.load(fh)

ingridients_embeddings_path = os.path.join(data_path, "ingridients_embeddings.pkl")
with open(ingridients_embeddings_path, "rb") as fh:
    ingridients_embeddings = pickle.load(fh)

In [None]:
# Load the pickled co-occurrence data; later cells read its
# "ing_cooc_matrix" and "ing_to_index" entries.
cooc_path = os.path.join(data_path, "ingridients_cooc.pkl")
with open(cooc_path, "rb") as fh:
    cooc = pickle.load(fh)

In [None]:
# Show the loaded co-occurrence object as the cell output.
cooc

In [None]:
def ingredients_existing_in_cooc(
    ingredients: list[str], cooc_ingredients: dict, full_short_matches: dict
):
    """Keep only the ingredients that are known to the co-occurrence vocabulary.

    An ingredient present in ``cooc_ingredients`` is kept as is. Otherwise the
    first entry of ``full_short_matches[ingredient]`` is tried as a replacement
    name and kept when that name is in ``cooc_ingredients``. Ingredients with
    no usable match are dropped.

    Args:
        ingredients: Ingredient names to filter.
        cooc_ingredients: Container of known names (membership is tested with ``in``).
        full_short_matches: Maps a name to an indexable collection whose first
            element is the candidate replacement name.

    Returns:
        List of kept names (original or replacement), in input order.
    """
    existing_ingredients = []
    for ing in ingredients:
        if ing in cooc_ingredients:
            existing_ingredients.append(ing)
            continue
        matches = full_short_matches.get(ing)
        # A missing or empty entry means there is no fallback name; indexing it
        # would raise TypeError / IndexError, so skip the ingredient instead.
        if matches is None or len(matches) == 0:
            continue
        short_ing = matches[0]
        if short_ing is not None and short_ing in cooc_ingredients:
            existing_ingredients.append(short_ing)
    return existing_ingredients


def determine_threshold(
    cooc_matrix: list[list[int]],
    ing_to_idx: dict,
    ingredients: list[str],
    percentile=95,
):
    """Compute a score threshold from the pairwise scores of ``ingredients``.

    The sub-matrix of ``cooc_matrix`` restricted to the rows and columns of
    the given ingredients is flattened, and the requested percentile of those
    values is returned.

    Args:
        cooc_matrix: 2-D score matrix; nested lists or a NumPy array.
        ing_to_idx: Maps ingredient name to its row/column index.
        ingredients: Names whose pairwise scores are considered.
        percentile: Percentile (0-100) passed to ``np.percentile``.

    Returns:
        The percentile value of the selected pairwise scores.

    Raises:
        ValueError: If the computed threshold is negative.
        KeyError: If an ingredient is missing from ``ing_to_idx``.
    """
    # np.ix_ tuple indexing only works on arrays; coerce nested lists so the
    # annotated input type is actually supported.
    matrix = np.asarray(cooc_matrix)
    relevant_indices = [ing_to_idx[ing] for ing in ingredients]
    relevant_scores = matrix[np.ix_(relevant_indices, relevant_indices)].flatten()
    threshold = np.percentile(relevant_scores, percentile)
    # Zero is accepted here; only negative thresholds are rejected.
    if threshold < 0:
        raise ValueError(f"threshold cannot be negative, threshold: {threshold}")
    return threshold


def select(ingredients: list[str], cooc_matrix: list[list[int]], ing_to_idx: dict):
    """Greedily choose a subset of ``ingredients`` using co-occurrence scores.

    Steps:
      1. Start with the ingredient whose summed co-occurrence with all
         candidates is highest.
      2. Add the best-scoring remaining ingredient until
         ``config.MIN_INGREDIENTS`` ingredients are selected.
      3. Keep adding the best-scoring remaining ingredient while its score is
         not below the threshold returned by ``determine_threshold``.

    In steps 2 and 3 a candidate's score is the sum of its matrix values
    against every ingredient selected so far. Ties go to the candidate that
    appears first in ``ingredients``.

    Args:
        ingredients: Candidate names; each must be a key of ``ing_to_idx``.
        cooc_matrix: Score matrix indexed as ``cooc_matrix[row][col]``; it is
            also forwarded to ``determine_threshold``.
        ing_to_idx: Maps ingredient name to its row/column index.

    Returns:
        The selected ingredient names in the order they were picked.

    Raises:
        ValueError: If fewer than ``config.MIN_INGREDIENTS`` candidates are
            given, or if the candidates run out before that many could be
            selected (for example because of duplicate names).
    """
    if len(ingredients) < config.MIN_INGREDIENTS:
        raise ValueError(f"At least {config.MIN_INGREDIENTS} ingredients are required.")

    def best_remaining(selected):
        """Return ``(name, score)`` of the best unselected candidate, or ``(None, -inf)``."""
        best_ing, highest_score = None, -float("inf")
        for ing in ingredients:
            if ing in selected:
                continue
            score = sum(
                cooc_matrix[ing_to_idx[ing]][ing_to_idx[selected_ing]]
                for selected_ing in selected
            )
            if score > highest_score:
                highest_score = score
                best_ing = ing
        return best_ing, highest_score

    selected_ingredients = []

    # Choose first ingredient as the most common among all
    most_common_ing, highest_score = None, -float("inf")
    for ing in ingredients:
        score = sum(
            cooc_matrix[ing_to_idx[ing]][ing_to_idx[ing2]] for ing2 in ingredients
        )
        if score > highest_score:
            highest_score = score
            most_common_ing = ing
    print(f"First ingredient is {most_common_ing}")
    selected_ingredients.append(most_common_ing)

    # Choose other ingredients needed to reach the minimum number
    for _ in range(config.MIN_INGREDIENTS - 1):
        best_ing, _ignored_score = best_remaining(selected_ingredients)
        if best_ing is None:
            # Nothing left to pick; appending None would only fail later with
            # an obscure KeyError on ing_to_idx[None].
            raise ValueError(
                f"Could not select {config.MIN_INGREDIENTS} distinct ingredients "
                f"from {ingredients}"
            )
        selected_ingredients.append(best_ing)
    print(f"Minimal needed ingredients are {selected_ingredients}")

    threshold = determine_threshold(cooc_matrix, ing_to_idx, ingredients)
    print(f"Treshold is {threshold}")

    # Iteratively add more ingredients until threshold is met
    while True:
        best_ing, highest_score = best_remaining(selected_ingredients)
        # Stop when candidates are exhausted or the best one is too weak.
        if best_ing is None or highest_score < threshold:
            break
        selected_ingredients.append(best_ing)
    print(
        f"Selected {len(selected_ingredients)} ingredients from {len(ingredients)}, they are {selected_ingredients}"
    )

    return selected_ingredients


def select_ingredients():
    """Load the input files named in ``config`` and select a subset of ingredients.

    Reads the ingredient list (JSON), the co-occurrence data (pickle) and the
    full-to-short name matches (pickle) from ``data_path``, filters the
    ingredients to those known to the co-occurrence data, and runs ``select``
    on the result.

    Returns:
        The list returned by ``select``.

    Raises:
        AssertionError: If the JSON file does not contain a list.
    """
    print("Selecting ingredients ...")

    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform's locale-dependent default.
    with open(
        os.path.join(data_path, config.INGREDIENTS_FILE), encoding="utf-8"
    ) as json_file:
        ingredients = json.load(json_file)
        assert isinstance(ingredients, list)

    # NOTE: pickle.load can execute code embedded in the file; these files
    # must come from a trusted source.
    with open(os.path.join(data_path, config.COOC_FILE), "rb") as file:
        cooc = pickle.load(file)

    with open(os.path.join(data_path, config.FULL_SHORT_FILE), "rb") as file:
        full_short_matches = pickle.load(file)

    ingredients = ingredients_existing_in_cooc(
        ingredients, cooc["ing_to_index"], full_short_matches
    )
    # The stored matrix is converted to a dense array before selection.
    ingredients = select(
        ingredients, cooc["ing_cooc_matrix"].toarray(), cooc["ing_to_index"]
    )

    return ingredients

        
# Run the selection pipeline; the returned list is shown as the cell output.
select_ingredients()

In [None]:
def ingredients_existing_in_cooc(
    ingredients: list[str], cooc_ingredients: dict, full_short_matches: dict
):
    """Filter ``ingredients`` down to names known to the co-occurrence vocabulary.

    A name found in ``cooc_ingredients`` is kept unchanged. Otherwise the
    first element of ``full_short_matches[name]`` is used as a replacement if
    that replacement is in ``cooc_ingredients``. Names without a usable
    replacement are dropped.

    Args:
        ingredients: Ingredient names to filter.
        cooc_ingredients: Container of known names (membership is tested with ``in``).
        full_short_matches: Maps a name to an indexable collection whose first
            element is the candidate replacement name.

    Returns:
        List of kept names (original or replacement), in input order.
    """
    existing_ingredients = []
    for ing in ingredients:
        if ing in cooc_ingredients:
            existing_ingredients.append(ing)
            continue
        matches = full_short_matches.get(ing)
        # No entry (or an empty one) means no fallback name exists; subscripting
        # it would raise TypeError / IndexError, so the ingredient is skipped.
        if matches is None or len(matches) == 0:
            continue
        short_ing = matches[0]
        if short_ing is not None and short_ing in cooc_ingredients:
            existing_ingredients.append(short_ing)
    return existing_ingredients


def determine_threshold(
    cooc_matrix: list[list[int]],
    ing_to_idx: dict,
    ingredients: list[str],
    percentile=65,
):
    """Return a percentile of the pairwise scores among ``ingredients``.

    The rows and columns of ``cooc_matrix`` that belong to the given
    ingredients are extracted, flattened, and reduced with ``np.percentile``.

    Args:
        cooc_matrix: 2-D score matrix; nested lists or a NumPy array.
        ing_to_idx: Maps ingredient name to its row/column index.
        ingredients: Names whose pairwise scores are considered.
        percentile: Percentile (0-100) passed to ``np.percentile``.

    Returns:
        The percentile value of the selected pairwise scores.

    Raises:
        ValueError: If the computed threshold is negative.
        KeyError: If an ingredient is missing from ``ing_to_idx``.
    """
    # Tuple indexing with np.ix_ requires an array, so nested lists (the
    # annotated type) are converted first.
    matrix = np.asarray(cooc_matrix)
    relevant_indices = [ing_to_idx[ing] for ing in ingredients]
    relevant_scores = matrix[np.ix_(relevant_indices, relevant_indices)].flatten()
    threshold = np.percentile(relevant_scores, percentile)
    # A threshold of exactly zero is allowed; only negatives are an error.
    if threshold < 0:
        raise ValueError(f"threshold cannot be negative, threshold: {threshold}")
    return threshold


def select(ingredients: list[str], cooc_matrix: list[list[int]], sim_matrix: list[list[float]], ing_to_idx: dict):
    """Greedily choose a subset of ``ingredients`` using co-occurrence and similarity.

    Steps:
      1. Start with the ingredient whose summed ``cooc_matrix`` value against
         all candidates is highest.
      2. Add the remaining ingredient with the highest summed ``sim_matrix``
         value against the current selection until ``config.MIN_INGREDIENTS``
         ingredients are selected.
      3. Keep adding the remaining ingredient with the highest *average*
         ``sim_matrix`` value against the selection while that average is not
         below the threshold returned by ``determine_threshold(sim_matrix, ...)``.

    Ties go to the candidate that appears first in ``ingredients``.

    Args:
        ingredients: Candidate names; each must be a key of ``ing_to_idx``.
        cooc_matrix: Matrix indexed as ``cooc_matrix[row][col]``; used only
            for picking the first ingredient.
        sim_matrix: Matrix indexed as ``sim_matrix[row][col]``; also forwarded
            to ``determine_threshold``.
        ing_to_idx: Maps ingredient name to its row/column index.

    Returns:
        The selected ingredient names in the order they were picked.

    Raises:
        ValueError: If fewer than ``config.MIN_INGREDIENTS`` candidates are
            given, or if the candidates run out before that many could be
            selected (for example because of duplicate names).
    """
    if len(ingredients) < config.MIN_INGREDIENTS:
        raise ValueError(f"At least {config.MIN_INGREDIENTS} ingredients are required.")

    def best_remaining(selected, average=False):
        """Return ``(name, score)`` of the best unselected candidate, or ``(None, -inf)``."""
        best_ing, highest_score = None, -float("inf")
        for ing in ingredients:
            if ing in selected:
                continue
            score = sum(
                sim_matrix[ing_to_idx[ing]][ing_to_idx[selected_ing]]
                for selected_ing in selected
            )
            if average:
                score = score / len(selected)
            if score > highest_score:
                highest_score = score
                best_ing = ing
        return best_ing, highest_score

    selected_ingredients = []

    # Choose first ingredient as the most common among all
    most_common_ing, highest_score = None, -float("inf")
    for ing in ingredients:
        score = sum(
            cooc_matrix[ing_to_idx[ing]][ing_to_idx[ing2]] for ing2 in ingredients
        )
        if score > highest_score:
            highest_score = score
            most_common_ing = ing
    print(f"First ingredient is {most_common_ing}")
    selected_ingredients.append(most_common_ing)

    # Choose other ingredients needed to reach the minimum number
    for _ in range(config.MIN_INGREDIENTS - 1):
        best_ing, _ignored_score = best_remaining(selected_ingredients)
        if best_ing is None:
            # Nothing left to pick; appending None would only fail later with
            # an obscure KeyError on ing_to_idx[None].
            raise ValueError(
                f"Could not select {config.MIN_INGREDIENTS} distinct ingredients "
                f"from {ingredients}"
            )
        selected_ingredients.append(best_ing)
    print(f"Minimal needed ingredients are {selected_ingredients}")

    threshold = determine_threshold(sim_matrix, ing_to_idx, ingredients)
    print(f"Treshold is {threshold}")

    # Iteratively add more ingredients until threshold is met
    while True:
        best_ing, highest_score = best_remaining(selected_ingredients, average=True)
        # Trace the best average score of this round (-inf once nothing is left).
        print(highest_score)
        # Stop when candidates are exhausted or the best one is too weak.
        if best_ing is None or highest_score < threshold:
            break
        selected_ingredients.append(best_ing)
    print(
        f"Selected {len(selected_ingredients)} ingredients from {len(ingredients)}, they are {selected_ingredients}"
    )

    return selected_ingredients


def select_ingredients():
    """Load the input files named in ``config`` and select a subset of ingredients.

    Reads the ingredient list (JSON), the co-occurrence data (pickle) and the
    full-to-short name matches (pickle) from ``data_path``, filters the
    ingredients to those known to the co-occurrence data, and runs ``select``
    with both the co-occurrence matrix and the stored ``sim_matrix``.

    Returns:
        The list returned by ``select``.

    Raises:
        AssertionError: If the JSON file does not contain a list.
    """
    print("Selecting ingredients ...")

    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform's locale-dependent default.
    with open(
        os.path.join(data_path, config.INGREDIENTS_FILE), encoding="utf-8"
    ) as json_file:
        ingredients = json.load(json_file)
        assert isinstance(ingredients, list)

    # NOTE: pickle.load can execute code embedded in the file; these files
    # must come from a trusted source.
    with open(os.path.join(data_path, config.COOC_FILE), "rb") as file:
        cooc = pickle.load(file)

    with open(os.path.join(data_path, config.FULL_SHORT_FILE), "rb") as file:
        full_short_matches = pickle.load(file)

    ingredients = ingredients_existing_in_cooc(
        ingredients, cooc["ing_to_index"], full_short_matches
    )
    # The co-occurrence matrix is converted to a dense array; sim_matrix is
    # passed through as stored.
    ingredients = select(
        ingredients, cooc["ing_cooc_matrix"].toarray(), cooc['sim_matrix'], cooc["ing_to_index"]
    )

    return ingredients

        
# Run the similarity-based selection pipeline; the returned list is the cell output.
select_ingredients()

In [None]:
def determine_threshold(cooc_matrix, ing_to_idx, ingredients, percentile=75):
    """Return a percentile of the pairwise scores among ``ingredients``.

    Args:
        cooc_matrix: 2-D score matrix; nested lists or a NumPy array.
        ing_to_idx: Maps ingredient name to its row/column index.
        ingredients: Names whose pairwise scores are considered.
        percentile: Percentile (0-100) passed to ``np.percentile``.

    Returns:
        The percentile value of the selected pairwise scores.

    Raises:
        ValueError: If the computed threshold is zero or negative.
        KeyError: If an ingredient is missing from ``ing_to_idx``.
    """
    matrix = np.asarray(cooc_matrix)
    relevant_indices = [ing_to_idx[ing] for ing in ingredients]
    relevant_scores = matrix[np.ix_(relevant_indices, relevant_indices)].flatten()
    # The percentile result must be bound to a name; discarding it left
    # `threshold` undefined (NameError) or silently read from a stale global.
    threshold = np.percentile(relevant_scores, percentile)
    if threshold <= 0:
        raise ValueError(f'threshold cannot be non-positive, threshold: {threshold}')
    return threshold

In [None]:
# Sample ingredient list for inspecting the pairwise co-occurrence scores by hand.
ingredients = [
    "green bean",
    "blackberry jam",
    "vermicelli",
    "chicken",
    "water",
    "chili sauce",
    "corn",
    "canned kidney bean",
    "ground cardamom",
    "rice milk",
    "almond",
    "chocolate",
    "sugar",
    "bread",
]
ing_to_idx = cooc["ing_to_index"]
cooc_matrix = cooc["ing_cooc_matrix"].toarray()
# Rows/columns of the sample ingredients, then every pairwise score as a flat array.
relevant_indices = [ing_to_idx[name] for name in ingredients]
relevant_scores = cooc_matrix[np.ix_(relevant_indices, relevant_indices)].flatten()

In [None]:
# Show the flattened pairwise scores as the cell output.
relevant_scores

In [None]:
# 75th percentile of the sample's pairwise scores (cell output).
np.percentile(relevant_scores, 75)

In [None]:
# Rebind `cooc` from "ingredients_cooc.pkl".
# NOTE(review): an earlier cell loads "ingridients_cooc.pkl" (different
# spelling) into the same name -- confirm which file is intended.
ingredients_cooc_path = os.path.join(data_path, "ingredients_cooc.pkl")
with open(ingredients_cooc_path, "rb") as fh:
    cooc = pickle.load(fh)

In [None]:
# Largest value in row 0 of the dense co-occurrence matrix (cell output).
cooc['ing_cooc_matrix'].toarray()[0].max()

In [None]:
# Load "ingredients_cooc.pkl" again, this time under the name `data`.
# NOTE: pickle.load can execute code embedded in the file; only load trusted data.
data_pickle_path = os.path.join(data_path, "ingredients_cooc.pkl")
with open(data_pickle_path, "rb") as fh:
    data = pickle.load(fh)

In [None]:
# Show the loaded object as the cell output.
data