In [12]:
import numpy as np
import pandas as pd
import json
import tqdm

In [2]:
# Load the generated recipe lists. The files are opened as UTF-8 explicitly so
# that decoding does not depend on the platform's default locale encoding.
with open("../generated/high_score_repr_recipes.json", encoding="utf-8") as json_file:
    high_score_repr_recipes = json.load(json_file)

with open("../generated/low_score_repr_recipes.json", encoding="utf-8") as json_file:
    low_score_repr_recipes = json.load(json_file)

with open("../generated/high_score_key_representative.json", encoding="utf-8") as json_file:
    high_score_key_representative = json.load(json_file)

# Nutrient table stored under the key 'table' of the HDF5 file; map_recipes
# looks rows up in it by integer id via `.loc`.
nutrients_mapping = pd.read_hdf("../generated/nut_data.h5", 'table')

In [3]:
# Map each recipe's position in the list (0-based) to the recipe itself.
high_score_repr_recipes_with_index = dict(enumerate(high_score_repr_recipes))

In [4]:
# Keep only the first row for every index label: map_recipes calls
# `.loc[usda_id].tolist()`, which needs a label to select exactly one row.
is_repeated_id = nutrients_mapping.index.duplicated(keep='first')
nutrients_mapping = nutrients_mapping.loc[~is_repeated_id]

In [5]:
def map_recipes(recipes, mapping, threshold=0.3):
    """Sum the per-ingredient rows of ``mapping`` into one profile row per recipe.

    Parameters
    ----------
    recipes : iterable of sized collections of str
        Each recipe is a collection of ingredient strings. An ingredient that
        starts with ``'usda_id'`` must have the form ``'usda_id=<int>'``; every
        other ingredient counts as a mismatch.
    mapping : pandas.DataFrame
        Table looked up with ``mapping.loc[<int id>]``. Index labels must be
        unique so that each lookup yields a single row.
    threshold : float, default 0.3
        A recipe only gets a profile when its mismatch ratio is strictly below
        this value.

    Returns
    -------
    profile : pandas.DataFrame
        One row per recipe, in input order, with the columns of ``mapping``.
        A row holds the column-wise sum over the recipe's mapped ingredients,
        or NaN in every column when the recipe was skipped.
    mismatches_ratio_in_recipes : list of float
        For each recipe, the share of its ingredients that are mismatches.
        An empty recipe is reported as ``1.0`` (nothing could be mapped).

    Raises
    ------
    KeyError
        If an ingredient references an id that is not in ``mapping.index``.
    """
    # Placeholder row for recipes without a profile; keeps every row the same
    # width so the frame always has mapping's columns, even if no recipe passes.
    nan_row = [float('nan')] * len(mapping.columns)
    profile = []
    mismatches_ratio_in_recipes = []

    for recipe in tqdm.tqdm(recipes):
        nb_ingr = len(recipe)
        mismatches = 0
        recipe_mapped = []

        for ingredient in recipe:
            if ingredient.startswith('usda_id'):
                usda_id = int(ingredient.split('=')[1])
                # Look up in the table that was passed in, not a notebook global.
                recipe_mapped.append(mapping.loc[usda_id].tolist())
            else:
                mismatches += 1

        # Guard against empty recipes, which would otherwise divide by zero.
        mismatch_ratio = mismatches / nb_ingr if nb_ingr else 1.0
        mismatches_ratio_in_recipes.append(mismatch_ratio)

        if recipe_mapped and mismatch_ratio < threshold:
            profile.append(pd.DataFrame(recipe_mapped).sum(axis=0).tolist())
        else:
            profile.append(nan_row)

    profile = pd.DataFrame(profile, columns=mapping.columns)

    return profile, mismatches_ratio_in_recipes

In [6]:
# Nutrient profile and per-recipe mismatch ratios for the high-score recipes.
profile, mismatches_ratio_in_recipes = map_recipes(
    recipes=high_score_repr_recipes,
    mapping=nutrients_mapping,
)

100%|██████████| 96529/96529 [06:36<00:00, 243.49it/s]


In [7]:
# Arithmetic mean of the per-recipe mismatch ratios returned by map_recipes
# for the high-score recipes (share of ingredients not starting with 'usda_id').
average_mismatch = sum(mismatches_ratio_in_recipes) / len(mismatches_ratio_in_recipes)
average_mismatch

0.2106633786001671

In [8]:
# Nutrient profile and per-recipe mismatch ratios for the low-score recipes.
profile_low, mismatches_ratio_in_recipes_low = map_recipes(
    recipes=low_score_repr_recipes,
    mapping=nutrients_mapping,
)

100%|██████████| 96529/96529 [07:12<00:00, 223.25it/s]


In [9]:
# Arithmetic mean of the per-recipe mismatch ratios returned by map_recipes
# for the low-score recipes (share of ingredients not starting with 'usda_id').
average_mismatch_low = sum(mismatches_ratio_in_recipes_low) / len(mismatches_ratio_in_recipes_low)
average_mismatch_low

0.024805635020499376

In [10]:
# Write both nutrient profiles to JSON, high-score first and then low-score.
for frame, destination in (
    (profile, '../generated/high_score_profile_nutrients.json'),
    (profile_low, '../generated/low_score_profile_nutrients.json'),
):
    frame.to_json(destination)