In [70]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Load the datasets

In [71]:
import pandas as pd

In [72]:
# Load the recipes and reviews CSV files from the Food.com dataset directory.
recipes_df, reviews_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/foodcom-recipes-and-reviews/recipes.csv',
        '../input/foodcom-recipes-and-reviews/reviews.csv',
    )
)

In [73]:
# Preview the first rows of the recipes table.
recipes_df.head()

In [74]:
# Show the first recipe row as a Series, one field per line.
recipes_df.iloc[0]

In [75]:
# Preview the first rows of the reviews table.
reviews_df.head()

In [76]:
# Show the first review row as a Series, one field per line.
reviews_df.iloc[0]

# Pre-processing

## Drop columns, reduce rows, and join reviews

In [77]:
# Columns kept from the recipes table.
recipe_cols = [
    'RecipeId',
    'Name',
    'CookTime',
    'PrepTime',
    'TotalTime',
    'Description',
    'Images',
    'RecipeCategory',
    'Keywords',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'AggregatedRating',
    'Calories',
    'RecipeServings',
    'RecipeYield',
    'RecipeInstructions',
]
# Columns kept from the reviews table.
review_cols = [
    'ReviewId',
    'RecipeId',
    'Rating',
    'Review',
]

In [78]:
# Keep only the columns of interest from the reviews table.
rv_df = reviews_df[review_cols]
# Keep the recipes that have at least one review, one row per RecipeId.
# A membership filter selects the same rows, in the same order, as merging the
# recipes against every review row and de-duplicating afterwards, but it never
# builds one copy of each recipe per review. The resulting row labels are the
# original recipes_df labels.
has_review = recipes_df['RecipeId'].isin(rv_df['RecipeId'])
rp_df = recipes_df.loc[has_review, recipe_cols].drop_duplicates('RecipeId')

## Delete rows with null values

In [79]:
# Inspect dtypes and non-null counts to see which columns have missing values.
rp_df.info()

In [80]:
# Columns that must be populated for a recipe to be kept.
recipe_col_subset = [
    'RecipeId',
    'Name',
    'CookTime',
    'PrepTime',
    'TotalTime',
    'Description',
    'Images',
    'RecipeCategory',
    'Keywords',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'AggregatedRating',
    'Calories',
    'RecipeInstructions',
]

# Drop rows with a null in any required column, then rows whose Images field
# is the placeholder string 'character(0)'.
rp_df = (
    rp_df
    .dropna(subset=recipe_col_subset)
    .loc[lambda frame: frame['Images'].ne('character(0)')]
)
rp_df.info()

In [81]:
# Cap the working set at the first 20000 recipes, then keep only the reviews
# that belong to those recipes.
rp_df = rp_df.iloc[:20000]
rv_df = rv_df.merge(rp_df['RecipeId'], how='inner')

In [82]:
%%time
# Map each RecipeId to its reviews as {ReviewId: {'Rating': ..., 'Review': ...}}.
# to_dict('index') builds the per-review dicts directly, so the per-group
# transpose is not needed. Note that it raises ValueError if a ReviewId is
# repeated within one recipe instead of silently dropping the duplicate.
rc_rv_dict = {
    recipe_id: group.drop(columns='RecipeId').set_index('ReviewId').to_dict('index')
    for recipe_id, group in rv_df.groupby('RecipeId')
}

In [83]:
# Attach each recipe's review dict as a new column.
# assign() returns a new frame rather than writing into rp_df, which at this
# point is a slice of a larger frame; assigning a column on such a slice can
# trigger pandas' SettingWithCopyWarning.
rp_df = rp_df.assign(
    reviews_in_dict=rp_df['RecipeId'].apply(lambda recipe_id: rc_rv_dict[recipe_id])
)

# Clean the following columns:

Images, Keywords, RecipeIngredientQuantities, RecipeIngredientParts, RecipeInstructions

In [84]:
import re

def map_str_to_list(string):
    '''
    Extract every double-quoted item from a string.

    string: text containing zero or more "quoted" items, e.g. 'c("a", "b")'
    returns: list of the quoted contents in order of appearance. A value that
             is already a list is returned as a copy, so running the cleaning
             step twice is harmless. Any other non-string value (e.g. None or
             NaN from a missing cell) gives an empty list.
    '''
    if isinstance(string, list):
        return list(string)
    if not isinstance(string, str):
        return []
    # The re module caches compiled patterns, so calling this once per cell
    # does not recompile the expression each time.
    return re.findall(r'\"([^"]+)\"', string)

def map_for_series(series: pd.Series):
    '''Return a Series holding map_str_to_list(value) for each value of series.'''
    parsed = series.apply(map_str_to_list)
    return parsed

In [85]:
# Columns whose cells are strings of quoted items and should become lists.
modify_col_list = [
    'Images',
    'Keywords',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'RecipeInstructions',
]


def _parse_if_listed(column):
    '''Parse the column if it is named in modify_col_list; otherwise return it unchanged.'''
    if column.name in modify_col_list:
        return map_for_series(column)
    return column


rp_df = rp_df.apply(_parse_if_listed)

In [94]:
# Spot check: first item of the parsed Images list for the row at position 4578.
# NOTE(review): 4578 looks like an arbitrary row picked for inspection — confirm.
rp_df.Images.iat[4578][0]

In [87]:
# Write the cleaned recipes to the working directory. reset_index() moves the
# current row labels into an 'index' column, and to_csv then also writes the
# new 0..n-1 labels as an unnamed first column.
# NOTE(review): the file name says 10000 but the row cap applied earlier is
# 20000 — confirm the intended name.
recipes_export = rp_df.reset_index()
recipes_export.to_csv('recipe_10000.csv')

# Give recommendations

In [88]:
def score_recipes(user_input, df, best_num):
    '''
    Rank recipes by how many of the requested ingredients they contain.

    user_input: list of ingredient strings (a single string is treated as a
                one-item list); duplicates count once
    df: our list of recipes; needs a 'RecipeIngredientParts' column whose
        cells are lists of ingredient strings
    best_num: number of best matching result to return

    returns: a new DataFrame with the best_num highest-scoring rows of df plus
             a 'score' column, best first. Recipes with equal scores keep
             their original relative order. df itself is not modified.
    '''
    # A bare string would otherwise be iterated character by character.
    if isinstance(user_input, str):
        user_input = [user_input]
    # Normalise once so matching ignores case and surrounding whitespace.
    wanted = {str(word).strip().lower() for word in user_input}

    def count_matches(ingredient_list):
        # An ingredient counts only when the whole entry matches, not a substring.
        available = {str(part).strip().lower() for part in ingredient_list}
        return len(wanted & available)

    scores = df['RecipeIngredientParts'].apply(count_matches)
    # kind='stable' makes the order of tied recipes deterministic.
    ranked = df.assign(score=scores).sort_values(by='score', ascending=False, kind='stable')
    return ranked.iloc[:best_num]

In [89]:
# Example query: the ten best-matching recipes for these ingredients.
# An ingredient counts only when it equals a whole entry of a recipe's
# ingredient list, so 'potato' does not match a recipe listing 'potatoes'.
user_input = ['potato', 'carrots', 'pork', 'chestnuts', 'butter', 'salmon']
score_recipes(user_input, rp_df, 10)

In [90]:
# Positional access: after the filtering steps, the row labels of rp_df are not
# guaranteed to include 0, so a label lookup with [0] can raise KeyError.
rp_df['reviews_in_dict'].iloc[0]

In [91]:
# score_recipes needs all three arguments; a bare score_recipes() raises
# TypeError. Show only the columns needed to judge the match.
score_recipes(user_input, rp_df, 10)[['Name', 'RecipeIngredientParts', 'score']]

In [None]:
# Example query: the three best-matching recipes for these ingredients.
user_input = ['potatoes', 'carrots', 'pork']
score_recipes(user_input, rp_df, 3)


        


