# Imports

In [13]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import warnings

from sklearn.decomposition import PCA
from numpy.linalg import norm

# Configuration

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas chained-assignment warnings raised by
# later cells — consider narrowing the filter.
warnings.simplefilter('ignore')
# Let displayed frames/series show up to 2000 rows before pandas truncates them.
pd.set_option('display.max_rows', 2000)

# Read data

In [3]:
# Load the pre-built table from a pickle file. The loop further down reads the
# `molecules`, `nutritional_info` and `entity_alias_readable` fields of each row.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
edamam_flavor = pd.read_pickle('../data_discovery/edamam_flavor_dbs_nutrients.pkl')

# Functions

In [4]:
def cosine_similarity(w1, w2, base_df):
    """Return the cosine of the angle between two rows of `base_df`.

    Parameters
    ----------
    w1, w2 : index labels
        Labels of the two rows to compare; both are looked up with `.loc`.
    base_df : pandas.DataFrame
        Numeric frame holding one vector per row.

    Returns
    -------
    float
        dot(row1, row2) / (||row1|| * ||row2||). A zero-norm row makes the
        denominator zero, so the result is NaN in that case.
    """
    vec_a = base_df.loc[w1]
    vec_b = base_df.loc[w2]
    dot_product = np.dot(vec_a, vec_b)
    return dot_product / (norm(vec_a) * norm(vec_b))

def jaccard_similarity(w1, w2, base_df):
    """Return the Jaccard index of the `common_name` sets of two rows.

    Parameters
    ----------
    w1, w2 : index labels
        Labels of the two rows to compare; both are looked up with `.loc`.
    base_df : pandas.DataFrame
        Frame whose `common_name` column holds one set per row.

    Returns
    -------
    float
        |set1 ∩ set2| / |set1 ∪ set2|, or 0.0 when both sets are empty.
    """
    set1 = base_df.loc[w1].common_name
    # Both sets come from `base_df`; the previous version read the second one
    # from a notebook global, ignoring the frame the caller passed in.
    set2 = base_df.loc[w2].common_name
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Two empty sets share nothing; avoid dividing by zero.
        return 0.0
    return len(set1.intersection(set2)) / union_size

def get_top_similarities(w, base_df, similarity_metric='jaccard'):
    """Rank every other entity by its similarity to `w`.

    Candidate entities are the distinct `entity_alias_readable` values of the
    notebook-level `graph_flavor_molecules_df`, excluding `w` itself.

    Parameters
    ----------
    w : str
        Entity to compare against.
    base_df : pandas.DataFrame
        Frame handed to the similarity function; it must contain a row for
        `w` and for every candidate entity.
    similarity_metric : {'jaccard', 'cosine'}
        Which similarity function to use.

    Returns
    -------
    pandas.DataFrame
        Columns `entity` and `similarity`, sorted by `similarity` descending.
        The row labels are those the candidates carry in
        `graph_flavor_molecules_df`.

    Raises
    ------
    ValueError
        If `similarity_metric` is not one of the supported names.
    """
    metric_functions = {
        'jaccard': jaccard_similarity,
        'cosine': cosine_similarity,
    }
    # Fail early with a clear message instead of building an empty score list
    # that only blows up later as a length mismatch.
    if similarity_metric not in metric_functions:
        raise ValueError(
            "similarity_metric must be one of %s, got %r"
            % (sorted(metric_functions), similarity_metric)
        )
    similarity_fn = metric_functions[similarity_metric]

    words = graph_flavor_molecules_df[graph_flavor_molecules_df.entity_alias_readable!=w].entity_alias_readable.drop_duplicates()
    similarities = [similarity_fn(w, entity, base_df=base_df) for entity in words]
    response = pd.DataFrame({'entity':words, 'similarity':similarities})
    return response.sort_values('similarity', ascending=False)

In [69]:
# Molecule attributes kept from each entity's `molecules` records.
columns_defined = [
    'taste',
    'functional_groups',
    'fooddb_flavor_profile',
    'supersweetdb_id',
    'fooddb_id',
    'common_name',
    'bitterdb_id',
    'super_sweet',
    'flavornet_id',
    'pubchem_id',
    'bitter',
    'iupac_name',
    'odor',
    'smile',
    'inchi',
    'cas_id',
    'flavor_profile',
    'fema_flavor_profile',
]
flavor_molecules_df_list = []
nutrition_facts_df_list = []

# Build one molecules frame and one nutrition frame per row of `edamam_flavor`,
# each tagged with the entity it belongs to.
for _row_label, entity_row in edamam_flavor.iterrows():
    entity_name = entity_row.entity_alias_readable

    # `.assign` returns a new frame, so nothing is written onto a slice.
    flavor_molecules_df = pd.DataFrame(entity_row.molecules).assign(
        entity_alias_readable=entity_name
    )
    flavor_molecules_df_list.append(flavor_molecules_df)

    # Only the last row of the nutritional-info frame is kept.
    nutrition_facts_df = (
        pd.DataFrame(entity_row.nutritional_info)
        .iloc[-1:]
        .assign(entity_alias_readable=entity_name)
    )
    nutrition_facts_df_list.append(nutrition_facts_df)

# Stack the per-entity frames; molecules keep only the selected attributes,
# nutrition facts are indexed by entity with missing values set to 0.
flavor_molecules = pd.concat(flavor_molecules_df_list)[columns_defined + ['entity_alias_readable']]
nutrition_facts = pd.concat(nutrition_facts_df_list).set_index("entity_alias_readable").fillna(0)

In [70]:
# Create graph
# Entity/molecule pairs plus a constant `dummy` column (used later as the value
# to count). `.assign` builds a new frame instead of writing a column onto a
# slice of `flavor_molecules`.
graph_flavor_molecules_df = flavor_molecules[['entity_alias_readable', 'common_name']].assign(dummy=1)

# Graph with one edge per (entity, molecule) pair; from_pandas_edgelist reads
# the `source` and `target` columns, hence the rename.
graph_flavor_molecules = nx.from_pandas_edgelist(
    graph_flavor_molecules_df.rename(
        columns={'entity_alias_readable': 'source', 'common_name': 'target'}
    )
)

# One-hot encoding of the molecule name, one row per (entity, molecule) pair,
# with the entity name attached as an extra column.
graph_flavor_molecules_df_vector = pd.get_dummies(graph_flavor_molecules_df.common_name).assign(
    entity_alias_readable=graph_flavor_molecules_df['entity_alias_readable']
)

In [71]:
# Entity x molecule matrix: each cell counts the `dummy` entries for that
# (entity, molecule) pair; pairs that never occur are filled with 0.
molecules_per_entity = graph_flavor_molecules_df.pivot_table(
    values="dummy",
    index='entity_alias_readable',
    columns='common_name',
    aggfunc='count',
    fill_value=0,
)

# One row per entity whose `common_name` cell is the set of its molecule names.
molecules_per_entity_jaccard = (
    graph_flavor_molecules_df
    .groupby('entity_alias_readable')
    .common_name
    .apply(set)
    .to_frame()
)

In [144]:
# Project the entity x molecule matrix onto at most 100 principal components.
# The cap by the matrix shape keeps the cell runnable on inputs with fewer than
# 100 rows or columns, where a fixed n_components=100 is rejected by PCA.
flavor_pca_obj = PCA(
    n_components=min(100, *molecules_per_entity.shape),
    copy=True,
    whiten=False,
    svd_solver='auto',
    tol=0.0,
    iterated_power='auto',
    n_oversamples=10,
    power_iteration_normalizer='auto',
    random_state=99  # fixed seed so randomized solvers give repeatable output
)

# Reduced vectors, one row per entity, labelled like `molecules_per_entity`.
flavor_per_entity_reduced = pd.DataFrame(
    flavor_pca_obj.fit_transform(molecules_per_entity),
    index=molecules_per_entity.index,
)

In [233]:
# Entity names taken from the row labels of the reduced flavor frame, as a
# positionally indexed Series so the `.str` accessor can be used on them.
ingredients = pd.Series(flavor_per_entity_reduced.index)

In [236]:
# Substring lookup over the entity names. `str.contains` is case-sensitive by
# default, so a capitalised name such as "Pear" is NOT matched by "pear";
# pass case=False to search regardless of case.
ingredients[ingredients.str.contains("pear")]

577        Spearmint
607    Tapioca pearl
Name: entity_alias_readable, dtype: object

In [216]:
# Module-level value kept from the original cell; inside the function the
# `word` parameter shadows it.
word = 'Oregano'
def get_similar_foods(word, consideration_of_elements=20):
    """Return the foods that rank near the top for both flavor and nutrients.

    Two cosine-similarity rankings are computed for `word`, one on
    `flavor_per_entity_reduced` and one on `nutrients_per_entity_reduced`.
    The result is every entity found among the first
    `consideration_of_elements` rows of both rankings.

    Both frames are read from the notebook's global namespace.
    NOTE(review): no cell visible in this notebook defines
    `nutrients_per_entity_reduced`; it must already exist in the kernel.

    Parameters
    ----------
    word : str
        Entity to find substitutes for.
    consideration_of_elements : int
        How many top-ranked rows of each ranking are considered.

    Returns
    -------
    list
        Matching entity names, ordered by flavor similarity (best first).
    """
    flavor_ranking = get_top_similarities(
        word,
        similarity_metric='cosine',
        base_df=flavor_per_entity_reduced,
    )
    nutrient_ranking = get_top_similarities(
        word,
        similarity_metric='cosine',
        base_df=nutrients_per_entity_reduced,
    )

    # Compare the two top-N entity lists by value. Concatenating the sorted
    # frames side by side would make the outcome depend on how pandas aligns
    # their row labels rather than on the rank order.
    top_flavor = flavor_ranking['entity'].head(consideration_of_elements)
    top_nutrient = nutrient_ranking['entity'].head(consideration_of_elements)
    return top_flavor[top_flavor.isin(top_nutrient)].to_list()

In [245]:
#get_similar_foods("", consideration_of_elements=30)

In [223]:
# Run get_similar_foods once for every label of flavor_per_entity_reduced,
# considering the 10 top-ranked rows of each ranking. Extra keyword arguments
# given to Series.apply are forwarded to the function.
similar_ing_list10 = pd.Series(flavor_per_entity_reduced.index).apply(get_similar_foods, consideration_of_elements=10)

In [224]:
# Same computation as for the 10-element list, but considering the 20
# top-ranked rows of each ranking for every entity.
similar_ing_list20 = pd.Series(flavor_per_entity_reduced.index).apply(get_similar_foods, consideration_of_elements=20)

In [225]:
# Same computation again, considering the 30 top-ranked rows of each ranking
# for every entity.
similar_ing_list30 = pd.Series(flavor_per_entity_reduced.index).apply(get_similar_foods, consideration_of_elements=30)

In [230]:
# Collect the three result series side by side (they share the same positional
# index) and label the rows with the entity names of the reduced flavor frame.
similar_ing_list = pd.DataFrame(
    {
        'list10': similar_ing_list10,
        'list20': similar_ing_list20,
        'list30': similar_ing_list30,
    }
).set_index(flavor_per_entity_reduced.index)

Unnamed: 0_level_0,list10,list20,list30
entity_alias_readable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Abalone,[],[Japanese chestnut],[Japanese chestnut]
Abiyuch,[Skunk currant],[Skunk currant],"[Skunk currant, Natal plum, Jujube]"
Acerola,[],"[Jujube, Sesbania flower]","[Jujube, Sesbania flower, Longan]"
Acorn,[],[Colorado pinyon],[Colorado pinyon]
Adzuki bean,[],[Groundcherry],[Groundcherry]
Agar,[],"[Moth bean, Sesbania flower]","[Moth bean, Sesbania flower]"
Agave,[],[],[]
Alaska blackfish,[],[Freshwater drum],"[Freshwater drum, Atlantic croaker]"
Alaska pollock,"[Pacific rockfish, Channel catfish, Buffalo]","[Elk, Pacific rockfish, Channel catfish, Buffalo]","[Elk, Pacific rockfish, Channel catfish, Buffa..."
Alaska wild rhubarb,[],"[Natal plum, Mexican groundcherry]","[Natal plum, Mexican groundcherry, Nopal]"


In [254]:
# Persist the table of similar ingredients (columns list10 / list20 / list30,
# one row per entity) as a pickle file for later use.
similar_ing_list.to_pickle('../data_discovery/similar_food_ingredients.pkl')

In [231]:
#similar_ing_list

In [247]:
#flavor_per_entity_reduced.sample(10).index

In [248]:
#for base in (molecules_per_entity, flavor_per_entity_reduced):
#    word = 'True frog'
#    total_similitude = {
#        'flavor': get_top_similarities(
#            word, 
#            similarity_metric='cosine', 
#            base_df=base
#
#        ).rename(columns={'entity':'flavor'}),
#        'nutrients': get_top_similarities(
#            word, 
#            similarity_metric='cosine',
#            base_df=nutrients_per_entity_reduced
#            #base_df=nutrients_per_entity_reduced
#        ).rename(columns={'entity':'nutrient'}),
#    }

#    comparison = pd.concat([total_similitude['flavor'], total_similitude['nutrients']], axis=1).head(50)
#    print(comparison[comparison.flavor.isin(comparison.nutrient)].flavor.to_list())

In [10]:
#total_similitude['nutrients']

In [249]:
#flavor_molecules_list = graph_flavor_molecules_df_vector.columns.drop('entity_alias_readable')

# How to include a new food-ingredient?
- 1) Look up the molecules of the new food and intersect them with the flavor molecules in the flavor molecules list. Keep only that intersection as the ingredient's list of molecules.
    - 1.1) For example: `new_ingredient: [flavor_molecule1, flavor_molecule2, ... flavor_moleculeN]`
    - 1.2) In the best case the ingredient can be found in [FooDB](https://foodb.ca/), which lists the molecules of each ingredient; then only a quick match is needed between the compounds listed there and the `flavor_molecules_list` taken from FlavorDB.
- 2) Get the nutritional information for the food, expressed per 100 g of the ingredient.
    - 2.1) The nutrients must be in the same format as the one used on [Edamam's website](https://developer.edamam.com/food-database-api-docs) (see the Nutrient Guide section).
    - 2.2) In the best case the ingredient can be found through [Edamam's API](https://developer.edamam.com/food-database-api-demo).
