In [122]:
import pandas as pd
import json
import numpy as np

from sklearn.model_selection import train_test_split

import re

In [123]:
with open('../../data/train.json', 'r') as myfile:
    data=myfile.read()

# parse file
obj = json.loads(data)

In [124]:
recipe_ings = []

for recipe in obj:
    cuisine = recipe["cuisine"]
    
    for ingredient in recipe["ingredients"]:
        recipe_ings.append([clean_ingr(ingredient), cuisine])

In [128]:
def clean_ingr(items, patterns, word_patterns):

    fixed_items = []
    for item in items:
        for pattern in patterns:
            if re.match(pattern, item):
                fixed = re.sub(pattern, "", item)
                logger.debug("Exchanged %s to: %s", item, fixed)
                item = fixed
        fixed_items.append(item)

    return fixed_items


def clean(DATA_PATH, PATTERNS, REMOVE_WORDS):
    with open(DATA_PATH, "r") as file:
        data = file.read()
        logger.info("Read file from %s", DATA_PATH)

    # parse file
    obj = json.loads(data)
    logger.info("Obtained %i records", len(obj))
    
    recipe_ings = []

    for recipe in obj:
        cuisine = recipe["cuisine"]
        ingredients = recipe["ingredients"]

        recipe_ings += [
            (x, cuisine)
            for x in clean_ingr(ingredients, PATTERNS, REMOVE_WORDS)
        ]
        
    df = pd.DataFrame(data=recipe_ings, columns=["ingredient", "cuisine"])
    return df

In [129]:
patterns = ['^\(.*?\) ', ',.*$']
words = ['^.*' + word + ' ' for word in ["low-fat", "lowfat", "low fat", "sodium", "lb\.", "ounc"]]
patterns += words

clean('../../data/train.json', patterns, words)

Unnamed: 0,ingredient,cuisine
0,romaine lettuce,greek
1,black olives,greek
2,grape tomatoes,greek
3,garlic,greek
4,pepper,greek
...,...,...
428270,garlic,mexican
428271,white sugar,mexican
428272,roma tomatoes,mexican
428273,celery,mexican


In [77]:
df = pd.DataFrame(data = recipe_ings, columns = ["ingredient", "cuisine"])

In [78]:
cuisine_series = df.groupby("ingredient").cuisine.value_counts()

In [79]:
cuisinedf = cuisine_series.unstack().fillna(0)
cuisinedf = cuisinedf.drop(["salt", "water", "sugar"], axis=0)

In [80]:
def clean_ingr(item):
    patterns = ['^\(.*?\) ', ',.*$']
    words = ['^.*' + word + ' ' for word in ["low-fat", "lowfat", "low fat", "sodium", "lb\.", "ounc"]]
    patterns += words
    
    for pattern in patterns:
        if re.search(pattern, item):
            item = re.sub(pattern, '', item)
    
    return item

In [81]:
cuisinedf['ingr_sum'] = cuisinedf.iloc[:, 1:].sum(axis=1, skipna=True)
small = cuisinedf[cuisinedf.ingr_sum >= 25]

In [82]:
cuisinedf[cuisinedf.ingr_sum >= 100]

cuisine,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,...,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese,ingr_sum
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Gochujang base,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,136.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,142.0
Italian bread,0.0,0.0,9.0,0.0,0.0,6.0,1.0,0.0,1.0,89.0,...,0.0,0.0,1.0,1.0,0.0,7.0,3.0,0.0,0.0,118.0
Italian parsley leaves,2.0,1.0,3.0,1.0,1.0,17.0,1.0,1.0,1.0,74.0,...,1.0,0.0,9.0,5.0,0.0,3.0,5.0,0.0,0.0,123.0
Mexican cheese blend,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,169.0,0.0,0.0,3.0,1.0,0.0,0.0,174.0
Mexican oregano,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,98.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yellow onion,21.0,20.0,86.0,44.0,29.0,49.0,15.0,140.0,17.0,153.0,...,21.0,33.0,288.0,48.0,9.0,73.0,35.0,33.0,50.0,1163.0
yellow squash,0.0,0.0,6.0,3.0,1.0,12.0,2.0,2.0,1.0,37.0,...,3.0,1.0,20.0,5.0,0.0,15.0,2.0,5.0,1.0,116.0
yoghurt,1.0,2.0,0.0,0.0,0.0,2.0,10.0,211.0,0.0,2.0,...,6.0,0.0,6.0,1.0,0.0,4.0,0.0,1.0,0.0,246.0
yukon gold potatoes,1.0,8.0,1.0,0.0,2.0,41.0,4.0,27.0,24.0,39.0,...,6.0,0.0,13.0,10.0,6.0,8.0,21.0,4.0,1.0,217.0


In [83]:
def mean_center(row):
    avg = row[:-1].mean()
    row[:-1] = row[:-1] - avg
    
    return row

def normalize(col, scale=1, exclude=None):
    if col.name in exclude:
        return col
    
    avg = col.mean()
    total = col.sum()
    
    col = scale*(col - avg)/total
    return col

In [None]:
choices = ["garam masala", "curry leaves", "ginger"]

ex = train[train.ingredient.isin(choices)].drop("ingredient", axis=1).sum(axis=0)
softmax(ex)

In [84]:
# Helper
def softmax(raw):
    return np.e ** raw / np.sum(np.e ** raw)

In [85]:
def predict(df, ingredients, num_guesses=3, exclude=["ingr_sum"]):
    
    try:
        calc = df.loc[ingredients]
    except KeyError:
        print("key not found")
        return
    
    calc = calc.drop(exclude, axis=1).sum(axis=0)
    
    ordered = softmax(calc).sort_values(ascending=False)
    
    return ordered[:num_guesses]

In [None]:
pred

In [96]:
trained = small.apply(normalize, exclude=["ingr_sum"], scale=1000, axis=0).apply(mean_center, raw=True, axis=1)

list(predict(trained, ["olive oil", "feta cheese"]).index)

['greek', 'italian', 'moroccan']

In [98]:
def recommend(df, cuisine, num_ingredients=5, selected=None):
    
    if selected:
        df = df.drop(labels=selected, axis=0)
    
    ordered = df.loc[:, cuisine].sort_values(ascending=False)
    
    return list((ordered[:num_ingredients]).index)

recommend(trained, "greek")

['olive oil',
 'feta cheese crumbles',
 'dried oregano',
 'feta cheese',
 'fresh lemon juice']

In [65]:
rectrain = small.drop("ingr_sum", axis=1).apply(mean_center, raw=True, axis=0)

In [71]:
rectrain.mean(axis=1).sort_values(ascending=False).head(20)

ingredient
salt                   894.139798
onions                 390.189798
olive oil              390.189798
water                  364.439798
garlic                 360.589798
sugar                  313.289798
garlic cloves          303.439798
butter                 233.989798
ground black pepper    230.839798
all-purpose flour      223.189798
pepper                 213.489798
vegetable oil          210.839798
soy sauce              188.689798
eggs                   160.989798
kosher salt            147.239798
green onions           145.489798
tomatoes               144.539798
large eggs             138.989798
carrots                132.289798
unsalted butter        130.689798
dtype: float64

In [72]:
rectrain.drop(["salt", "water", "sugar"], axis=0)

cuisine,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,jamaican,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2% reduced-fat milk,-0.64404,0.003232,-2.684444,-12.621818,-2.94101,1.898586,-3.712727,-14.152727,0.591515,-19.486869,-2.52404,-4.376162,-3.667879,-16.625051,-4.197576,-0.945455,4.690909,-4.058182,-7.643636,-4.106667
Alfredo sauce,-1.64404,-2.996768,-4.684444,-12.621818,-2.94101,-9.101414,-4.712727,-15.152727,-2.408485,12.513131,-2.52404,-5.376162,-3.667879,-26.625051,-4.197576,-1.945455,-16.309091,-4.058182,-7.643636,-4.106667
Amaretti Cookies,-1.64404,-2.996768,-7.684444,-12.621818,-2.94101,-9.101414,-4.712727,-15.152727,-2.408485,-20.486869,-2.52404,-5.376162,-3.667879,-27.625051,-4.197576,-1.945455,-16.309091,-4.058182,-7.643636,-4.106667
American cheese,-1.64404,-2.996768,-6.684444,-12.621818,-1.94101,-10.101414,-4.712727,-15.152727,-2.408485,-25.486869,-2.52404,-5.376162,-3.667879,-3.625051,-4.197576,-1.945455,-13.309091,-4.058182,-7.643636,-4.106667
Anaheim chile,-1.64404,-2.996768,-7.684444,-12.621818,-1.94101,-10.101414,-4.712727,-14.152727,-2.408485,-30.486869,-2.52404,-5.376162,-3.667879,9.374949,-2.197576,-1.945455,-14.309091,-1.058182,-2.643636,-4.106667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zest,-1.64404,-2.996768,-7.684444,-11.621818,-2.94101,-10.101414,-2.712727,-15.152727,-2.408485,-28.486869,-2.52404,-4.376162,-3.667879,-25.625051,-4.197576,-1.945455,-14.309091,-4.058182,-7.643636,-4.106667
zesty italian dressing,-1.64404,-2.996768,-7.684444,-12.621818,-2.94101,-10.101414,-3.712727,-15.152727,-2.408485,-19.486869,-2.52404,-5.376162,-3.667879,-23.625051,-4.197576,-1.945455,-15.309091,-4.058182,-7.643636,-4.106667
zinfandel,-1.64404,-2.996768,-7.684444,-12.621818,-2.94101,-5.101414,-2.712727,-15.152727,-2.408485,-29.486869,-2.52404,-5.376162,-3.667879,-27.625051,-4.197576,-1.945455,-16.309091,-2.058182,-7.643636,-4.106667
ziti,-1.64404,-2.996768,-7.684444,-12.621818,-2.94101,-10.101414,-1.712727,-15.152727,-2.408485,-4.486869,-2.52404,-5.376162,-3.667879,-27.625051,-4.197576,-1.945455,-16.309091,-4.058182,-7.643636,-4.106667


In [120]:
def predict_and_recommend(df, ingredients, scale_const = 1000, num_cuisines=3, num_ingredients=5):
    
    predict_train = df.apply(normalize, exclude=["ingr_sum"], scale=scale_const, axis=0).apply(mean_center, raw=True, axis=1)
    pred_cuisines = predict(predict_train, ingredients, num_guesses=num_cuisines)
    pred_list = list(pred_cuisines.index)
    
    rec_train = df.drop("ingr_sum", axis=1).apply(mean_center, raw=True, axis=0)
    rec_list = []
    for cuisine in pred_list:
        recommended = recommend(rec_train, cuisine, num_ingredients=num_ingredients, selected=ingredients)
        rec_list.append(recommended)
    
    return dict(zip(pred_list, rec_list))
        
    
    

In [121]:
predict_and_recommend(cuisinedf, ["peanuts", "tamarind", "garlic", "palm sugar", "fish sauce"])

{'vietnamese': ['carrots',
  'soy sauce',
  'shallots',
  'garlic cloves',
  'vegetable oil'],
 'thai': ['soy sauce',
  'coconut milk',
  'vegetable oil',
  'garlic cloves',
  'fresh lime juice'],
 'filipino': ['onions', 'soy sauce', 'pepper', 'oil', 'carrots']}

In [130]:
cuisinedf

cuisine,brazilian,british,cajun_creole,chinese,filipino,french,greek,indian,irish,italian,...,japanese,korean,mexican,moroccan,russian,southern_us,spanish,thai,vietnamese,ingr_sum
ingredient,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2% milk,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2% milk shredded mozzarella cheese,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2% reduced-fat milk,1.0,3.0,5.0,0.0,0.0,12.0,1.0,1.0,3.0,11.0,...,1.0,0.0,11.0,0.0,1.0,21.0,0.0,0.0,0.0,70.0
7 Up,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0
95% lean ground beef,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zinfandel,0.0,0.0,0.0,0.0,0.0,5.0,2.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,10.0
ziti,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,26.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,29.0
ziti pasta,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
zucchini,3.0,3.0,13.0,30.0,6.0,86.0,47.0,42.0,4.0,326.0,...,17.0,51.0,140.0,40.0,1.0,14.0,17.0,44.0,3.0,889.0


In [132]:
max(map(len, list(cuisinedf.index)))

71