In [None]:
#TODO: Create a .py script

# Imports

In [1]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
from numpy.linalg import norm
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize

# Configuration

In [2]:
# Raise the pandas display limits so large frames are rendered without truncation.
for option_name, limit in (
    ('display.max_rows', 2000),
    ('display.max_columns', 5000),
    ('display.max_colwidth', 5000),
):
    pd.set_option(option_name, limit)

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn warnings raised by later cells.
warnings.filterwarnings(action='ignore')

In [46]:
# Curated entity names treated as native Mexican ingredients; used later to
# restrict the substitute search to this subset.
native_mexican_entities = [
    'Agave', 'Allspice', 'Amaranth', 'Avocado',
    'Beans', 'Bitter Orange', 'Byrsonima crassifolia', 'Capsicum',
    'Chayote', 'Chia', 'Cocoa', 'Cocoa butter',
    'Cocoa powder', 'Corn', 'Epazote', 'Green zucchini',
    'Guava', 'Jackfruit', 'Jalapeno', 'Jicama',
    'Mexican groundcherry', 'Mexican oregano', 'Nance', 'Narrowleaf cattail',
    'Nopal', 'Papaya', 'Prickly Pear', 'Purslane',
    'Sapodilla', 'Soursop', 'Spirulina', 'Tomato',
    'Tortilla', 'Vanilla', 'Yam', 'Yautia',
]



# Read data

In [42]:
# Load the raw per-entity table; later cells read its `molecules`,
# `nutritional_info` and `entity_alias_readable` fields row by row.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
edamam_flavor = pd.read_pickle('../data/raw/edamam_flavor_dbs_nutrients.pkl')

# Functions

In [4]:
def cosine_similarity(w1, w2, base_df):
    """Return the cosine similarity between two rows of ``base_df``.

    Parameters
    ----------
    w1, w2 : index labels of the two rows to compare.
    base_df : DataFrame whose rows are numeric feature vectors.

    Returns
    -------
    The dot product of the two rows divided by the product of their
    Euclidean norms. A zero-norm row yields ``nan`` (division by zero).
    """
    vec_a, vec_b = base_df.loc[w1], base_df.loc[w2]
    denominator = norm(vec_a) * norm(vec_b)
    return np.dot(vec_a, vec_b) / denominator

def jaccard_similarity(w1, w2, base_df):
    """Return the Jaccard similarity of the ``common_name`` sets of two rows.

    Parameters
    ----------
    w1, w2 : index labels of the two rows to compare.
    base_df : DataFrame with a ``common_name`` column whose cells are sets.

    Returns
    -------
    ``len(intersection) / len(union)`` of the two sets, or ``0.0`` when both
    sets are empty (the ratio is undefined there; 0.0 is chosen so that
    entities without any data are never reported as similar).
    """
    set1 = base_df.loc[w1].common_name
    set2 = base_df.loc[w2].common_name
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets are empty: avoid ZeroDivisionError.
        return 0.0
    return len(set1.intersection(set2)) / union_size

def get_similar_ingredients(w, base_df):
    """Cosine similarity between row ``w`` and every other row of ``base_df``.

    Parameters
    ----------
    w : index label of the reference row.
    base_df : DataFrame whose rows are numeric feature vectors.

    Returns
    -------
    DataFrame indexed by every label of ``base_df`` except ``w``, with a
    single column named ``w`` holding the cosine similarity to row ``w``.
    Rows with a zero norm yield ``nan``.

    Notes
    -----
    The similarities are computed in one matrix-vector product instead of one
    ``.loc`` lookup per pair, and the result column is always named ``w``
    regardless of how (or whether) the index of ``base_df`` is named.
    """
    target = base_df.loc[w].to_numpy(dtype=float)
    others = base_df.loc[base_df.index != w]
    matrix = others.to_numpy(dtype=float)
    # dot(row, target) / (|row| * |target|) for every remaining row at once.
    similarity = matrix @ target / (norm(matrix, axis=1) * norm(target))
    return pd.DataFrame({w: similarity}, index=others.index)
     

def get_top_similarities(w, base_df, similarity_metric='jaccard'):
    """Rank every other entity of ``base_df`` by its similarity to ``w``.

    Parameters
    ----------
    w : index label of the reference entity.
    base_df : DataFrame indexed by entity. For ``'jaccard'`` it needs a
        ``common_name`` column of sets; for ``'cosine'`` its rows must be
        numeric vectors.
    similarity_metric : ``'jaccard'`` (default) or ``'cosine'``.

    Returns
    -------
    DataFrame with columns ``entity`` and ``similarity``, sorted by
    ``similarity`` in descending order.

    Raises
    ------
    ValueError
        If ``similarity_metric`` is not one of the supported names.

    Notes
    -----
    Candidate entities are taken from the index of ``base_df``. The previous
    implementation read them from a global ``graph_flavor_molecules_df`` that
    is not defined in this notebook.
    NOTE(review): confirm that the index of ``base_df`` is the intended
    candidate universe.
    """
    if similarity_metric not in ('jaccard', 'cosine'):
        raise ValueError(
            "similarity_metric must be 'jaccard' or 'cosine', got %r" % (similarity_metric,)
        )
    scorer = jaccard_similarity if similarity_metric == 'jaccard' else cosine_similarity

    words = pd.Series(base_df.index[base_df.index != w]).drop_duplicates()
    similarities = [scorer(w, entity, base_df=base_df) for entity in words]
    response = pd.DataFrame({'entity': words, 'similarity': similarities})
    return response.sort_values('similarity', ascending=False)

In [5]:
# Molecule attributes kept from each entity's molecule records.
columns_defined = [
    'taste',
    'functional_groups',
    'fooddb_flavor_profile',
    'supersweetdb_id',
    'fooddb_id',
    'common_name',
    'bitterdb_id',
    'super_sweet',
    'flavornet_id',
    'pubchem_id',
    'bitter',
    'iupac_name',
    'odor',
    'smile',
    'inchi',
    'cas_id',
    'flavor_profile',
    'fema_flavor_profile',
]

# One frame per entity, each tagged with the entity name, concatenated below.
flavor_molecules_df_list = []
nutrition_facts_df_list = []

for _, entity_row in edamam_flavor.iterrows():
    entity_name = entity_row.entity_alias_readable

    # `.assign` builds a new frame, so nothing is written into a slice
    # (the previous column assignment on `.iloc[-1:]` was chained assignment).
    flavor_molecules_df_list.append(
        pd.DataFrame(entity_row.molecules).assign(entity_alias_readable=entity_name)
    )

    # Only the last nutritional record of each entity is kept.
    # NOTE(review): presumably the last record is the preferred one -- confirm.
    nutrition_facts_df_list.append(
        pd.DataFrame(entity_row.nutritional_info)
        .iloc[-1:]
        .assign(entity_alias_readable=entity_name)
    )

# Long table: one row per (entity, molecule).
flavor_molecules = pd.concat(flavor_molecules_df_list)[columns_defined + ['entity_alias_readable']]
# Wide table: one row per entity; nutrients missing for an entity become 0.
nutrition_facts = pd.concat(nutrition_facts_df_list).set_index("entity_alias_readable").fillna(0)

In [14]:
# Scale every nutrient (column) to unit L2 norm across entities, keeping the
# entity-by-nutrient layout of `nutrition_facts`.
nutrition_facts_transformed = pd.DataFrame(
    normalize(nutrition_facts, axis=0),
    index=nutrition_facts.index,
    columns=nutrition_facts.columns,
)

In [20]:
# Split the '@'-delimited text columns into Python lists, stored as `list_<column>`.
for source_column in ('flavor_profile', 'functional_groups'):
    flavor_molecules[f'list_{source_column}'] = flavor_molecules[source_column].str.split('@')

In [34]:
# Entity x molecule count matrix: how many times each molecule (identified by
# its InChI string) is listed for each entity; absent pairs are filled with 0.
# `.assign` works on a new frame, so no column is written into a slice of
# `flavor_molecules` (the previous `molecules['dummy'] = 1` was chained assignment).
molecules = (
    flavor_molecules[['entity_alias_readable', 'inchi']]
    .assign(dummy=1)
    .groupby(['entity_alias_readable', 'inchi'])
    .count()
    .reset_index()
    .pivot_table(
        index='entity_alias_readable',
        columns='inchi',
        values="dummy",
        aggfunc='sum',
        fill_value=0,
    )
)

# Compress the molecule counts to 100 principal components per entity.
# random_state is fixed so the randomized solver, if selected, is reproducible.
molecules_pca_obj = PCA(
    n_components=100,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99,
)

molecules_per_entity_reduced = pd.DataFrame(
    molecules_pca_obj.fit_transform(molecules),
    index=molecules.index,
)


In [35]:
# Preview: exploding the list column yields one row per (entity, functional group) entry.
flavor_molecules[['entity_alias_readable', 'list_functional_groups']].explode('list_functional_groups')

Unnamed: 0,entity_alias_readable,list_functional_groups
0,Egg,amine
0,Egg,primary amine
0,Egg,primary aliphatic amine (alkylamine)
0,Egg,carboxylic acid derivative
0,Egg,carboxylic acid
...,...,...
98,Japanese pumpkin,aldehyde
99,Japanese pumpkin,hydroxy compound
99,Japanese pumpkin,alcohol
99,Japanese pumpkin,primary alcohol


In [36]:
# Entity x functional-group count matrix: number of times each functional
# group appears among an entity's molecules; absent pairs are filled with 0.
functional_groups = (
    flavor_molecules[['entity_alias_readable', 'list_functional_groups']]
    .explode('list_functional_groups')
    .assign(dummy=1)
    .groupby(['entity_alias_readable', 'list_functional_groups'])
    .count()
    .reset_index()
    .pivot_table(
        index='entity_alias_readable',
        columns='list_functional_groups',
        values="dummy",
        aggfunc='sum',
        fill_value=0,
    )
)

# Compress the counts to 20 principal components per entity.
functional_groups_pca_obj = PCA(
    n_components=20,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99,
)

functional_groups_per_entity_reduced = pd.DataFrame(
    functional_groups_pca_obj.fit_transform(functional_groups),
    index=functional_groups.index,
)

In [37]:
# Display the per-entity PCA coordinates of the functional-group counts.
functional_groups_per_entity_reduced

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
entity_alias_readable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abalone,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902
Abiyuch,26.336626,-12.124318,4.25243,-6.109671,3.43709,-3.269519,-1.641279,0.954153,-0.611864,0.8529,0.379221,-0.621387,0.483572,-0.291726,-0.092738,-0.128549,-0.685405,-0.746331,0.241204,-0.343022
Acerola,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902
Acorn,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902
Adzuki bean,33.371709,-12.944513,6.482946,-6.333078,-1.410401,0.209731,0.755217,-0.272186,-0.421799,-0.043244,-0.685632,0.137588,0.061498,0.534429,-0.204638,0.35312,0.547344,0.573397,-0.462492,-0.136278
Agar,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902
Agave,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902
Alaska blackfish,-68.538173,0.337993,-2.070029,1.924455,-2.058697,-0.585966,-0.465036,-0.148584,0.281991,-0.032727,-0.131023,-0.051452,0.078993,-0.472948,0.20484,-0.146083,-0.074838,0.125561,-0.037311,0.147847
Alaska pollock,-66.322606,0.597424,-1.476111,1.14996,-1.549937,0.185459,0.552949,-0.081726,-0.439736,0.164421,0.396443,-0.244622,0.141824,0.178089,-0.0415,-0.126154,-0.180823,-0.168612,0.199139,-0.163298
Alaska wild rhubarb,27.836259,-12.453556,4.845235,-6.155267,2.767709,-2.24602,-0.553974,1.108652,-0.381087,-0.039032,0.029794,0.300293,-0.061736,-0.317466,-0.132893,0.148237,-0.306766,-0.103868,0.003075,-0.115902


In [38]:
# Entity x flavor-descriptor count matrix: number of times each flavor
# descriptor appears among an entity's molecules; absent pairs are filled with 0.
flavor_profiles = (
    flavor_molecules[['entity_alias_readable', 'list_flavor_profile']]
    .explode('list_flavor_profile')
    .assign(dummy=1)
    .groupby(['entity_alias_readable', 'list_flavor_profile'])
    .count()
    .reset_index()
    .pivot_table(
        index='entity_alias_readable',
        columns='list_flavor_profile',
        values="dummy",
        aggfunc='sum',
        fill_value=0,
    )
)

# Compress the counts to 100 principal components per entity.
flavor_profile_pca_obj = PCA(
    n_components=100,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99,
)

flavor_profile_per_entity_reduced = pd.DataFrame(
    flavor_profile_pca_obj.fit_transform(flavor_profiles),
    index=flavor_profiles.index,
)


In [39]:
# Stack the three reduced flavor representations side by side, one row per entity.
# The integer component labels repeat across the three blocks.
general_flavor = pd.concat(
    [
        flavor_profile_per_entity_reduced,
        functional_groups_per_entity_reduced,
        molecules_per_entity_reduced,
    ],
    axis='columns',
)

general_flavor_pca_obj = PCA(
    n_components=50,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99,
)

# Each entity's stacked vector is scaled to unit L2 norm before the second PCA.
general_flavor_per_entity_reduced = pd.DataFrame(
    general_flavor_pca_obj.fit_transform(normalize(general_flavor)),
    index=general_flavor.index,
)
general_flavor_per_entity_reduced

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
entity_alias_readable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
Abalone,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702
Abiyuch,-0.885641,-0.322989,-0.092483,-0.005689,-0.104249,0.067346,-0.022093,-0.033611,-0.016223,0.019356,-0.012266,0.014384,-0.007496,-0.008869,-0.010006,0.016437,0.009497,-0.008337,0.002071,-0.010748,-0.000453,0.004257,-0.007196,0.002362,0.011595,-0.005604,0.003395,5.2e-05,-0.008196,0.0027,0.005181,-0.001329,0.000918,0.001297,0.001479,-0.000262,0.000548,0.007365,0.001654,-0.003618,-0.004214,-0.001916,-0.001001,-0.009905,0.000771,0.002879,-0.001157,0.001077,0.00551,0.000601
Acerola,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702
Acorn,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702
Adzuki bean,-0.905575,-0.291223,-0.097593,-0.02901,0.053047,-0.061284,0.005256,0.017312,-0.01292,-0.017021,4.7e-05,-0.021662,-0.000723,-0.002689,0.016458,-0.009093,-0.01052,9.4e-05,0.007687,0.009272,0.007143,0.015556,-5.1e-05,-0.005777,-0.009451,-0.005289,-0.007542,-0.005044,0.016593,-0.00468,-0.000587,-0.003791,-0.005513,0.002995,0.016049,-0.013095,-0.003588,-0.000243,0.007319,-0.003138,0.008322,-0.003609,0.002581,-0.001159,0.002778,0.001645,-0.008984,-0.00161,-0.005994,0.005694
Agar,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702
Agave,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702
Alaska blackfish,0.996909,-0.086417,0.008316,-0.024144,0.005975,0.006394,-0.00588,-0.002117,5.8e-05,0.008882,-0.002033,0.002779,0.006803,-0.001208,-0.012388,0.006867,0.002566,-0.000939,-0.002969,0.001587,-0.000461,-0.001632,0.00259,0.002643,0.001675,-0.001342,0.001934,-0.000882,-0.000268,-0.000976,-0.002827,0.002807,0.000195,-0.001814,-0.000329,-0.000733,-0.000554,-0.002581,0.000201,0.000423,0.001671,0.000699,-0.002256,0.003317,-5.1e-05,-0.00053,0.000653,-0.001792,-0.001351,0.000879
Alaska pollock,0.996578,-0.087926,-0.001663,-0.024405,0.00776,-0.008489,0.006786,-0.001015,0.006741,-0.002102,-0.001394,0.000584,-0.005417,-0.00524,0.005868,-0.002602,0.002384,0.002848,-0.001311,-0.000616,-0.004252,0.002511,-0.001036,-0.003997,0.002634,-0.001945,0.001911,-0.00145,-0.001246,-0.003307,0.001845,-0.000241,-0.000468,0.001415,-0.000959,-0.000124,-0.002542,0.000645,0.000447,-0.000701,-0.000443,-0.001663,-0.000457,-0.002828,-0.000986,0.002017,-0.000234,0.001295,0.001409,-0.001458
Alaska wild rhubarb,-0.893551,-0.321017,-0.094985,-0.007072,-0.071476,0.03555,-0.01909,-0.015651,0.001747,-0.007457,0.0004,0.010421,0.000868,-0.000228,-0.011962,0.00437,-4.8e-05,0.001552,-0.001194,-0.001733,0.005479,0.002589,0.001527,0.001369,0.000264,-0.001962,-0.004877,0.000623,0.001026,0.002848,-0.001521,0.002184,-0.000256,-0.001043,-0.003344,0.000105,0.002133,-0.000302,0.002394,0.003256,0.002132,0.000531,-0.0006,0.003,0.001245,0.000765,0.00123,-0.001274,-0.00358,-0.000702


In [40]:
# Join the reduced flavor representation with the normalised nutrient table,
# one row per entity.
food_characteristics = pd.concat(
    [general_flavor_per_entity_reduced, nutrition_facts_transformed],
    axis=1,
)
# The flavor components are labelled with integers and the nutrients with
# strings; convert every label to str so the column labels share one type.
food_characteristics.columns = food_characteristics.columns.astype(str)

food_characteristics_pca_obj = PCA(
    n_components=50,
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99,
)

# Label the output rows with the index of the frame that was actually
# transformed. The previous code borrowed `general_flavor.index`, which is
# only correct while both frames list the same entities in the same order.
food_characteristics_per_entity_reduced = pd.DataFrame(
    food_characteristics_pca_obj.fit_transform(food_characteristics),
    index=food_characteristics.index,
)

In [41]:
# Display the per-entity PCA coordinates of the combined flavor + nutrient features.
food_characteristics_per_entity_reduced

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
entity_alias_readable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1
Abalone,-0.891945,-0.31896,-0.104537,-0.000254,-0.073866,0.018017,0.01762,-0.00798,-0.043651,-0.031307,-0.02472,-0.025086,-0.00296,0.005981,-0.006852,-0.001968,0.000509,0.011997,0.003267,-0.016217,-0.011849,0.005412,0.001302,0.008415,0.02354,-0.013132,0.013526,-0.009358,0.015033,0.016129,-0.002727,-0.002116,-0.008506,-0.019073,0.019757,-0.026236,-0.018054,0.007781,-0.02227097,-0.015418,0.015254,-0.01326,-0.005041,0.014679,0.006871,-0.01119446,0.014655,-0.007361,-0.001705,0.005862
Abiyuch,-0.885807,-0.32159,-0.103367,-0.014433,-0.101602,0.069892,0.04393,-0.016033,0.005766,-0.026972,-0.041212,0.019436,-0.006923,0.007745,0.017476,-0.007583,-0.025726,-0.004727,0.001096,0.000476,0.009386,0.011638,-0.006183,-0.001171,-0.007568,-0.004087,-0.000484,-0.00752,-0.010704,0.011621,0.004665,0.023485,-0.006056,-0.000377,0.010681,-0.000627,0.003969,-0.005875,-0.01351025,-0.010305,0.004565,0.003489,0.005606,0.006922,0.003643,0.006980388,0.002111,-0.009756,0.011449,0.003953
Acerola,-0.898564,-0.324697,-0.09718,-0.026225,-0.093648,0.108477,-0.006516,-0.01071,-0.024632,-0.016048,-0.012759,0.105208,0.111423,0.088429,-0.011949,-0.042761,-0.167944,-0.058271,0.452746,0.504989,0.395037,-0.183602,0.107453,-0.078355,-0.005341,-0.070229,-0.014829,-0.064854,-0.00513,-0.038999,-0.033271,0.008618,0.007931,-0.005191,0.037865,0.003813,0.005332,0.028083,-0.01695574,-0.003415,0.005899,-0.007227,0.01081,0.00813,0.007076,-0.01413013,0.000175,-0.002041,-0.017485,0.011857
Acorn,-0.893862,-0.321316,-0.093526,0.003481,-0.068527,-0.032881,0.064918,0.011735,0.014046,-0.02106,0.000867,0.004201,-0.008116,-0.007626,-0.009978,-0.000379,0.019004,-0.008479,-0.01395,-0.015104,0.017247,-0.012855,-0.003618,-0.010058,-0.011135,-0.00514,0.00231,0.022681,-0.009419,-0.011265,0.030188,0.008954,0.003224,-0.025014,-0.016893,-0.02384,0.000182,0.006758,0.004969061,0.013637,-0.000518,0.030945,-0.022667,0.010052,0.003345,0.01706795,-0.00372,0.017436,-0.01182,-0.008081
Adzuki bean,-0.906379,-0.292469,-0.100999,-0.02809,0.05413,-0.033375,-0.049688,-0.005703,0.049454,0.013482,0.025832,0.00768,-0.019719,-0.011666,-0.017124,-0.001088,0.002282,0.000179,-0.008742,-0.009067,-0.008394,-0.000188,0.026294,-0.000378,-0.031283,-0.011377,0.018611,0.015126,-0.004141,-0.023769,-0.009506,-0.017422,-0.011975,0.006151,-0.002122,-0.017598,0.01358,0.030414,0.006273875,0.028625,-0.002118,0.010871,-0.011101,0.00969,-0.003385,-0.01209346,-0.000268,0.007547,-0.01758,-0.017616
Agar,-0.894339,-0.320292,-0.103279,-0.011241,-0.069595,0.044689,0.012472,-0.017492,-0.01009,-0.024318,-0.002585,0.024139,0.00287,0.004368,-0.021464,-0.015162,0.000886,-0.00773,-0.018324,-0.026075,0.006772,-0.000873,0.005888,-0.003862,-0.005642,-0.001069,0.004224,0.003521,-0.006251,0.024698,-0.001211,0.009746,0.001212,-0.002112,0.00184,0.005632,0.00148,0.0056,-0.00267471,-0.007485,-0.006071,3.7e-05,-0.005844,-0.002596,-0.012071,-0.0005528041,-0.004617,-0.001355,0.000784,-0.005257
Agave,-0.891842,-0.321627,-0.109668,-0.031531,-0.054858,0.042951,0.068658,-0.032601,0.292277,0.025089,-0.070846,-0.128032,0.042339,0.099501,0.053281,-0.021414,0.043866,0.016891,-0.026249,0.008681,0.011677,0.003496,0.026668,0.019429,0.034836,0.009994,-0.004656,-0.013718,-0.01303,0.058076,0.012361,0.015354,0.023708,-0.014689,0.002135,0.015406,0.002972,-0.013107,-0.04276522,0.002723,-0.003018,-0.010676,0.012594,0.02,0.005123,-0.004510431,0.024186,-0.001459,-0.007126,0.005307
Alaska blackfish,0.997873,-0.086741,0.003961,-0.025645,0.006392,0.020707,-0.015663,-0.008895,-0.048408,-0.010939,-0.006911,-0.011965,-0.011847,0.015132,0.013629,-0.008201,0.013028,-0.007678,-0.010738,0.016166,-0.008818,-0.01013,-0.019968,-0.020665,-0.031098,0.031414,-0.016642,0.013499,0.006948,0.020053,0.00421,3.2e-05,-0.00862,-0.001661,0.002432,-0.008963,-0.011557,-0.003733,-0.001358832,-0.019898,0.013316,-0.009555,-0.007583,0.000571,-0.005678,-0.004180727,0.000724,-0.002783,-0.014964,0.008029
Alaska pollock,0.996993,-0.086551,-0.010172,-0.029885,0.006763,0.034421,-0.034288,0.004478,-0.019126,-0.001728,-0.010327,0.009073,0.006272,-0.012546,-0.008952,0.004854,-0.023088,-0.010568,-0.003091,-0.024497,0.003723,-0.000789,0.008495,-0.010879,-0.003862,0.004943,-0.000519,-0.012102,-0.006557,0.013043,-0.017241,-0.011057,-0.004749,-0.005043,0.00893,-0.008334,-0.01764,0.00313,-0.009418362,-0.016227,0.006315,-0.011752,0.002505,0.010616,0.010787,0.00156574,0.000641,-0.002852,-0.002676,0.00466
Alaska wild rhubarb,-0.893545,-0.319089,-0.107913,-0.014899,-0.070801,0.064478,0.006834,-0.00237,-0.012254,-0.025982,-0.026141,0.013975,0.007299,0.002002,-0.01028,-0.00194,-0.007487,-0.002231,-0.002937,-0.013308,-0.008542,0.008571,-0.008833,0.004868,0.003115,0.002731,-0.007809,-0.005859,-0.004165,0.020255,0.006276,0.009682,0.004102,-0.001681,5.4e-05,-5.9e-05,-0.000979,-0.001378,-0.002094312,-0.002719,0.003176,-0.000372,-0.005248,0.003681,-0.005525,-0.0005219341,-0.004275,-0.00265,0.000844,-0.002901


In [58]:
# For every entity, find its ten most similar entities overall and its ten most
# similar entities among the curated native Mexican ingredients.
#   final_dict[entity] -> [top-10 labels overall, top-10 native Mexican labels]
#   max_min[entity]    -> [max, min] of the overall top-10 similarity column
final_dict = {}
max_min = {}

for word in food_characteristics_per_entity_reduced.index:
    similarity_to_word = get_similar_ingredients(word, food_characteristics_per_entity_reduced)
    top10 = similarity_to_word.sort_values(by=word, ascending=False).head(10)

    # Native candidates: never the entity itself, and only names that exist in
    # the data. Selecting a missing label with `.loc` would raise KeyError.
    # NOTE(review): curated names absent from the data are skipped silently.
    native_candidates = [
        entity
        for entity in native_mexican_entities
        if entity != word and entity in similarity_to_word.index
    ]
    top10_mexican = (
        similarity_to_word.loc[native_candidates]
        .sort_values(by=word, ascending=False)
        .head(10)
    )

    final_dict[word] = [top10.index.to_list(), top10_mexican.index.to_list()]
    max_min[word] = [top10.max(), top10.min()]

In [57]:
# One row per entity: column 0 holds the overall top-10 list, column 1 the native Mexican top-10 list.
pd.DataFrame(final_dict).T

Unnamed: 0,0,1
Abalone,"[[American butterfish, Japanese chestnut, Taro, Arrowhead, Common salsify, Mexican groundcherry, Mountain yam, Horned melon, Devilfish, Narrowleaf cattail]]","[[Mexican groundcherry, Narrowleaf cattail, Yautia, Nopal, Jicama, Purslane, Nance, Yam, Chayote, Epazote]]"
Abiyuch,"[[Skunk currant, Rowal, Deerberry, Mulberry, Black crowberry, Black huckleberry, Natal plum, Jicama, Jujube, Longan]]","[[Jicama, Mexican groundcherry, Nopal, Narrowleaf cattail, Nance, Yautia, Purslane, Yam, Chayote, Epazote]]"
Acerola,"[[Rose hip, Purple laver, Komatsuna, Longan, Kai lan, Sesbania flower, Jujube, Kale, Nance, Lambsquarters]]","[[Nance, Jicama, Purslane, Mexican groundcherry, Nopal, Yautia, Yam, Narrowleaf cattail, Chayote, Epazote]]"
Acorn,"[[Breadnut tree seed, Japanese chestnut, Oregon yampah, Prairie turnip, Taro, Arrowhead, Yautia, Common salsify, Mountain yam, Yardlong bean]]","[[Yautia, Nance, Mexican groundcherry, Narrowleaf cattail, Jicama, Amaranth, Nopal, Yam, Purslane, Epazote]]"
Adzuki bean,"[[Alfalfa, Pasta, Yam, Pak choy, Jerusalem artichoke, Chestnut, Millet, Japanese pumpkin, Purple mangosteen, Bamboo shoots]]","[[Yam, Chayote, Purslane, Yautia, Nance, Amaranth, Jicama, Mexican groundcherry, Narrowleaf cattail, Nopal]]"
Agar,"[[Yardlong bean, Moth bean, Boysenberry, Sesbania flower, Winter squash, Malabar spinach, Jew's ear, Jicama, Alaska wild rhubarb, Mountain yam]]","[[Jicama, Nopal, Mexican groundcherry, Narrowleaf cattail, Yautia, Nance, Purslane, Epazote, Chayote, Yam]]"
Agave,"[[Great horned owl, Dates, Carob, Groundcherry, Rowal, Breadfruit, Sago palm, Irish moss, Nance, Abiyuch]]","[[Nance, Yautia, Mexican groundcherry, Jicama, Nopal, Narrowleaf cattail, Yam, Purslane, Amaranth, Chayote]]"
Alaska blackfish,"[[Pacific rockfish, Spanish mackerel, Lemon sole, Atlantic menhaden, Striped mullet, Scup, Flatfish, Shark, Snapper, Northern pike]]","[[Jalapeno, Byrsonima crassifolia, Tortilla, Bitter Orange, Cocoa powder, Cocoa butter, Spirulina, Allspice, Mexican oregano, Agave]]"
Alaska pollock,"[[Pollock, Sockeye salmon, Elk, Orange roughy, Pacific ocean perch, Norway pout, Buffalo, Bison, Grouper, Green turtle]]","[[Byrsonima crassifolia, Tortilla, Jalapeno, Bitter Orange, Cocoa powder, Cocoa butter, Spirulina, Allspice, Mexican oregano, Agave]]"
Alaska wild rhubarb,"[[Tinda, Nopal, Jicama, Horned melon, Mexican groundcherry, Narrowleaf cattail, Winter squash, Jew's ear, Natal plum, Mountain yam]]","[[Nopal, Jicama, Mexican groundcherry, Narrowleaf cattail, Yautia, Nance, Purslane, Chayote, Yam, Epazote]]"


In [60]:
# Write the substitute lists to CSV, one row per entity.
output_path = Path('../data/external/Top10sustitutos_de_productos_with_mexican.csv')
# Create the target directory if needed so the export also works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
(
    pd.DataFrame(final_dict)
    .T
    .rename(columns={0: 'top10_ingredients', 1: 'top10_native_mexican_ingredients'})
    .to_csv(output_path)
)

In [117]:
#molecules_per_entity_reduced
#functional_groups_per_entity_reduced
#flavor_profile_per_entity_reduced

# How to include a new food-ingredient?
- 1) Look up the molecules of the new food and intersect them with the flavor molecules in the flavor molecules list. Use that intersection as the ingredient's list of molecules.
    - 1.1) For example: `new_ingredient: [flavor_molecule1, flavor_molecule2, ... flavor_moleculeN]`
    - 1.2) Ideally the ingredient can be found in [FooDB](https://foodb.ca/), which lists the molecules of each ingredient; then only a quick cross-check is needed between the compounds and molecules found there and the flavor molecules list (`flavor_molecules_list`) found in FlavorDB.
- 2) Get the nutritional information for the food, expressed per 100 g of the ingredient.
    - 2.1) The nutrients must be in the same format as the one used on [Edamam's website](https://developer.edamam.com/food-database-api-docs) (see the Nutrient Guide section).
    - 2.2) Ideally the ingredient can be found through [Edamam's API](https://developer.edamam.com/food-database-api-demo).
