In [1]:
# Load the data

import json
import numpy as np

with open('cooking_train.json') as raw_data:    
    data = json.load(raw_data)

for recipe in data:
    recipe['cuisine'] = recipe['cuisine'].lower().encode('utf-8')
    recipe['ingredients'] = [x.lower().encode('utf-8') for x in recipe['ingredients']]

In [2]:
# Prepare the data

import re

remove_numbers = re.compile(r"\d+")
remove_number_percentage = re.compile(r"\d+%")
remove_less_something = re.compile(r"less [A-z]*") # i.e. less sodium
remove_low_something = re.compile(r"low [A-z]*") # i.e. low fat
remove_reduced_something = re.compile(r"reduced [A-z]*") # i.e. reduced fat
remove_no_sth_added = re.compile(r"no [A-z]+ added")
remove_sth_free = re.compile(r"[A-z]* free")
remove_parentheses = re.compile(r"\([^)]*\)")
remove_excessive_whitespaces = re.compile(r" +")

words_to_remove = ["oz", "oz.", "cm", "centimeter", "ounce", "gram", "g", "lb", "pound",
                    "tbsp", "tablespoon", "ml", "mililiter", "pint", "lowfat", "light",
                    "shredded", "skim", "nonfat", "pure", "large", "extra large", "small",
                    "medium", "fine", "free range", "vegetarian", "natural", "low-fat", "lean",
                    "less sodium", "drained", "washed", "homemade", "whole wheat", "diced",
                    "washed", "chopped", "grated", "less", "lowfat", "fresh", "low sodium",
                    "all-purpose", "sweetened", "unsweetened", "condensed", "natural", "unsalted",
                    "instant", "powdered", "unflavored", "halves", "skim", "fine", "drained",
                    "drain", "part skim", "gourmet", "allspice", "mix", "aged", "traditional",
                    "of", "pint", "baby", "whole-grain", "-grain", "all", "purpose", "aritficial",
                    "back", "unbleached", "alaskan", "alexia", "breakstones", "kraft", "bertolli", "best foods",
                    "betty crocker", "bisquick", "bob evans", "breyers", "curry guy", "camellia", "campbells", 
                    "country crock", "crisco", "crystal farms", "delallo", "diamond crystal", "domino", 
                    "doritos", "earth balance", "egglands best", "foster farms", "franks", "gold medal", 
                    "goya", "green giant steamers niblets", "green giant", "heinz", "hellmanns", "herdez", 
                    "hidden valley", "honeysuckle white", "jacksonville",  "jimmy dean", "johnsonville", 
                    "knorr", "krudsen", "kikkoman", "lipton", "land o lakes", "mazola", "lea and perrins", 
                    "mccormick", "meyer", "mission", "old el paso", "old bay", "pam", "pepperidge farm", 
                    "oscar mayer", "pace", "pillsbury", "progresso", "pure wesson", "pompeian", "san marzano", 
                    "sargento", "soy vay", "taco bell", "yoplait", "spice islands", "stonefire", "success", 
                    "swanson", "truvía", "uncle bens", "wish bone","jameson", "tapatio", "philadelphia", "original",
                    "homemade", "best", "good", "and", "a", "of"]

words_to_map = [(("sea salt", "table salt", "white salt", "kosher salt"), "salt"),
               (("uncooked"), "raw"),
               (("black pepper", "white pepper"), "pepper"),
               (("yellow onion", "green onion", "purple onion"), "onion"),
               (("white bread", "wheat bread", "whole-wheat bread", "grain bread"), "bread"),
               (("extra-virgin olive oil", "extra virgin olive oil", "virgin olive oil"), "olive oil"),
               (("whie vinegar"), "vinegar"),
               (("tomatoes"), "tomato"),
               (("paprikas"), "paprika"),
               (("eggs"), "egg"),
               (("mushrooms"), "mushroom"),
               (("cucumbers"), "cucumber"),
               (("garlic cloves"), "garlic"),
               (("carrots"), "carrot"),
               (("apples"), "apple"),
               (("spices"), "spice"),
               (("sausages"), "sausage"),
               (("potatoes"), "potato"),
               (("chilies"), "chili"),
               (("apricots"), "apricot"),
               (("pounds"), "pound"),
               (("spinach leaves"), "spinach"),
               (("radishes"), "radish"),
               (("cauliflower flowerets", "cauliflower florets", "cauliflowerets"), "cauliflower"),
               (("collards", "collard green leaves", "collard leaves"), "collard greens"),
               (("basil dried leaves", "basil leaves"), "basil"),
               (("artificial sweetener"), "sweetener")]

def process_ingredient(ingredient):
    
    ingredient = remove_number_percentage.sub("", ingredient)
    ingredient = remove_numbers.sub("", ingredient)
    ingredient = remove_less_something.sub("", ingredient)
    ingredient = remove_low_something.sub("", ingredient)
    ingredient = remove_reduced_something.sub("", ingredient)
    ingredient = remove_no_sth_added.sub("", ingredient)
    ingredient = remove_sth_free.sub("", ingredient)
    ingredient = remove_parentheses.sub("", ingredient)

    for word in words_to_remove:
        ingredient = re.sub(r"\b{}\b".format(word), "", ingredient)

    for words, map_to in words_to_map:
        if type(words) is str:
            ingredient = re.sub(r"\b{}\b".format(words), map_to, ingredient)
        else:
            for entry in words:
                ingredient = re.sub(r"\b{}\b".format(entry), map_to, ingredient)

    ingredient = remove_excessive_whitespaces.sub(" ", ingredient)
    ingredient = ingredient.strip() 

    return ingredient
    
def process_recipe(recipe):
    recipe['ingredients'] = map(process_ingredient, recipe['ingredients'])
    
    return recipe
          
prepared_data = np.array(map(process_recipe, data))

In [3]:
# Change recipes to vectorized binary representation of ingredients

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

cuisines = []
ingredients = []

for recipe in prepared_data:
    cuisines.append(recipe['cuisine'])
    ingredients.extend(recipe['ingredients'])

ingredients = np.unique(ingredients)

string_recipe_matrix = ['$'.join(recipe['ingredients']) for recipe in prepared_data]

count_vectorizer = CountVectorizer(binary=True, token_pattern='[^$]+', vocabulary=ingredients)

recipes = count_vectorizer.fit_transform(string_recipe_matrix).toarray()

known_ingredients = ingredients

# Divide data into train and test

X_train, X_test, Y_train, Y_test = train_test_split(recipes, cuisines, test_size=0.3)



In [4]:
# Train and test different classifiers

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = LinearSVC(multi_class='ovr')

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('svc', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'linearSVC', 'Ensemble']):
    scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.77 (+/- 0.01) [Logistic Regression]
Accuracy: 0.65 (+/- 0.00) [Random Forest]
Accuracy: 0.74 (+/- 0.00) [linearSVC]
Accuracy: 0.76 (+/- 0.01) [Ensemble]


In [5]:
# Load competition test data

import json
import numpy as np

with open('cooking_test.json') as raw_data:    
    data = json.load(raw_data)

for recipe in data:
    recipe['ingredients'] = [x.lower().encode('utf-8') for x in recipe['ingredients']]

In [8]:
# Map unknown ingredients to best matching known ingredients

import distance

def get_best_matching_known_ingredient(ingredient):
    for known_ingredient in known_ingredients:
        if ingredient in known_ingredient:
            return known_ingredient
        
    for known_ingredient in reversed(known_ingredients):
        if known_ingredient in ingredient:
            return known_ingredient
        
    max_dist = 0
    
    best_match = known_ingredients[0]
    
    for known_ingredient in known_ingredients:
        dist = distance.jaccard(ingredient, known_ingredient)
        
        if dist > max_dist:
            best_match = known_ingredient
            max_dist = dist
            
    return best_match

def process_unknown_ingredients(recipe):
    recipe['ingredients'] = [ingredient if ingredient in known_ingredients else 
                             get_best_matching_known_ingredient(ingredient) for ingredient 
                             in recipe['ingredients'] ]
    return recipe

prepared_data = np.array(map(process_recipe, data))
prepared_data = np.array(map(process_unknown_ingredients, prepared_data))

ingredients = []

for recipe in prepared_data:
    ingredients.extend(recipe['ingredients'])

ingredients = np.unique(ingredients)

In [15]:
x = []
y = []

x.extend(X_train)
x.extend(X_test)

y.extend(Y_train)
y.extend(Y_test)

eclf.fit(x, y)

VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('rf', RandomFore...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))],
         n_jobs=1, voting='hard', weights=None)

In [16]:
# Do prediction and save results

from pandas import DataFrame
from collections import OrderedDict

string_recipe_matrix = ['$'.join(recipe['ingredients']) for recipe in prepared_data]

count_vectorizer = CountVectorizer(binary=True, token_pattern='[^$]+', vocabulary=known_ingredients)

recipes = count_vectorizer.fit_transform(string_recipe_matrix).toarray()

prediction_ids = [recipe['id'] for recipe in prepared_data]

predictions = eclf.predict(recipes)

data_frame = DataFrame(data=OrderedDict([('id', prediction_ids), ('cuisine', predictions)]))

data_frame.to_csv('my_predictions.csv', index=False)