In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval

# Let pandas use up to 1400 characters per line when printing frames, so wide
# tables are not wrapped in cell output.
pd.set_option('display.width', 1400)

#### Load Data

In [2]:
# Read the recipe table and the recipe-reviews table, both indexed by their 'id' column.
# The columns listed below hold Python-literal strings in the CSV (e.g. a list of tags),
# so each cell is parsed back into a Python object with literal_eval while loading.
converters = dict.fromkeys(['tags', 'ingredients', 'steps', 'nutrition'], literal_eval)

df_recipes = pd.read_csv('dataset/RAW_recipes.csv', index_col='id', converters=converters)
df_recipe_reviews = pd.read_csv('dataset/Recipe_Reviews.csv', index_col='id')

# Additional inputs, currently disabled in this cell:
# df_interact = pd.read_csv('dataset/RAW_interactions.csv', dtype={'review': str})
# converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
# df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')

#### Create Training and Test Data

In [3]:
def get_train_and_test_data(df, test_size=0.2, random_state=42):
    """Split per-user rating lists into a training frame and a test frame.

    Every (recipe, rating) pair of every user becomes one row, the rows are
    split at random, and the two halves are regrouped by the original index so
    each result row again holds the lists belonging to one user.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'rated_recipes', 'rating_list' and
        'ingredients'. The first two are exploded together, so they are
        expected to hold equal-length list-likes per row.
    test_size : float or int, default 0.2
        Forwarded to sklearn's ``train_test_split``.
    random_state : int or None, default 42
        Forwarded to sklearn's ``train_test_split``; the default keeps the
        split reproducible.

    Returns
    -------
    (df_train, df_test) : tuple of pandas.DataFrame
        Same index labels as ``df`` (only those that received at least one row
        in the respective split), without the 'ingredients' column. Each cell
        is a list.

    Raises
    ------
    KeyError
        If ``df`` has no 'ingredients' column (raised by ``drop``).
    """
    from sklearn.model_selection import train_test_split

    # 'ingredients' is not part of the output. Dropping it before exploding
    # avoids repeating it on every exploded row and aggregating it into lists
    # that would be thrown away afterwards.
    ratings = df.drop('ingredients', axis=1)
    exploded = ratings.explode(['rated_recipes', 'rating_list'])

    train_rows, test_rows = train_test_split(
        exploded, test_size=test_size, random_state=random_state
    )

    # Regroup on the original index (level 0) to get one row per user again.
    df_train = train_rows.groupby(level=0).agg(list)
    df_test = test_rows.groupby(level=0).agg(list)
    return df_train, df_test

In [4]:
# Create or load training and test data
converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
train_fn = 'dataset/User_Data_Train.csv'
test_fn = 'dataset/User_Data_Test.csv'
# Reuse the cached split only when BOTH files are present. If just one of them
# exists (e.g. an interrupted earlier run), re-split and rewrite both so the
# pair always comes from the same split.
if os.path.exists(train_fn) and os.path.exists(test_fn):
    print('Loading train and test userdata')
    df_train = pd.read_csv(train_fn, converters=converters, index_col='user_id')
    df_test = pd.read_csv(test_fn, converters=converters, index_col='user_id')
else:
    print('Splitting userdata into training and test data ...')
    df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')
    df_train, df_test = get_train_and_test_data(df_userdata)
    df_train.to_csv(train_fn)
    df_test.to_csv(test_fn)
print('Done.')

Loading train and test userdata
Done.


#### Recommender Model

In [3]:
# Functions for performing jaccard similarity between recipes
def jaccard_similarity(s1, s2):
    """Return the Jaccard similarity |s1 ∩ s2| / |s1 ∪ s2| of two collections.

    Both arguments may be any iterable of hashable items (lists, tuples,
    sets, ...); duplicates within one collection are ignored.

    Returns a float in [0, 1]. When both collections are empty the union is
    empty and the ratio is undefined; 0.0 is returned in that case so that
    items carrying no information are never reported as similar.
    """
    set1, set2 = set(s1), set(s2)
    union = set1 | set2
    if not union:
        return 0.0
    return len(set1 & set2) / len(union)

# Score one target collection against every collection in `sets`
def jaccard_similarity_array(target_set, sets):
    """Return a list with the Jaccard similarity of `target_set` to each entry of `sets`.

    The result has one score per entry, in the same order as `sets`.
    """
    scores = []
    for candidate in sets:
        scores.append(jaccard_similarity(target_set, candidate))
    return scores

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Shared text-processing resources used by text_preprocessor below.
# NOTE(review): these presumably need the NLTK 'stopwords' and 'wordnet' data
# to be installed locally — confirm the environment setup.

# English stop words from the NLTK corpus, copied into a plain list
STOPWORDS = list(nltk.corpus.stopwords.words('english'))
# Snowball stemmer configured for English
STEMMER = nltk.stem.snowball.SnowballStemmer('english')
# WordNet lemmatizer (text_preprocessor calls it with pos="v")
LEMMATIZER = nltk.stem.WordNetLemmatizer()

def text_preprocessor(document):
    """Normalize a raw text document into a space-separated string of tokens.

    Steps, in order:
      1. lower-case the document and split it into sentences, then words;
      2. keep only purely alphabetic words that are not in STOPWORDS;
      3. stem each kept word with STEMMER, then lemmatize the stem as a verb
         with LEMMATIZER.

    Parameters
    ----------
    document : str
        Raw text.

    Returns
    -------
    str
        The processed tokens joined by single spaces ('' if nothing is kept).
    """
    # Membership tests against the STOPWORDS list are linear in its length and
    # happen once per token. A set built once per call makes them O(1) while
    # still reflecting any later change made to STOPWORDS.
    stopword_set = set(STOPWORDS)

    tokens = []
    for sentence in sent_tokenize(document.lower()):
        for word in word_tokenize(sentence):
            if word.isalpha() and word not in stopword_set:
                tokens.append(LEMMATIZER.lemmatize(STEMMER.stem(word), pos="v"))
    return ' '.join(tokens)

In [5]:
# Get similar items functions
def get_similar_items_tags(tags_corpus, ingredients_corpus, target_i, top_n=5):
    """Rank items by the mean of their tag and ingredient Jaccard similarity to a target.

    Parameters
    ----------
    tags_corpus, ingredients_corpus : sequence
        One collection of tags / ingredients per item; position i in both
        sequences refers to the same item.
    target_i : int
        Position of the target item in the corpora.
    top_n : int, default 5
        Maximum number of results.

    Returns
    -------
    list of (int, float)
        Up to `top_n` pairs (item position, similarity), most similar first.
        The target itself is never included.
    """
    if target_i < 0:
        # Normalize a negative position so the exclusion test below matches.
        target_i += len(tags_corpus)

    sims_tags = jaccard_similarity_array(tags_corpus[target_i], tags_corpus)
    sims_ingredients = jaccard_similarity_array(ingredients_corpus[target_i], ingredients_corpus)

    # Exclude the target by position rather than dropping whatever sorts first:
    # another item can tie with the target (e.g. identical tags and
    # ingredients) and, being earlier in the corpus, sort ahead of it.
    sims_items = [
        (i, (sim_tags + sim_ingredients) / 2)
        for i, (sim_tags, sim_ingredients) in enumerate(zip(sims_tags, sims_ingredients))
        if i != target_i
    ]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    return sims_items[:top_n]

def get_similar_items_TFIDF(matrix, target_i, top_n=10):
    """Rank the rows of `matrix` by cosine similarity to row `target_i`.

    Parameters
    ----------
    matrix : 2-D matrix with one row per item
        Presumably the sparse TF-IDF matrix returned by
        TfidfVectorizer.fit_transform — `matrix[target_i]` must yield a 2-D
        single-row slice accepted by `cosine_similarity`.
    target_i : int
        Row position of the target item.
    top_n : int, default 10
        Maximum number of results.

    Returns
    -------
    list of (int, float)
        Up to `top_n` pairs (row position, cosine similarity), most similar
        first. The target row itself is never included.
    """
    if target_i < 0:
        # Normalize a negative position so the exclusion test below matches.
        target_i += matrix.shape[0]

    cosine_sims = cosine_similarity(matrix[target_i], matrix)[0]

    # Exclude the target by position instead of assuming it sorts first: a row
    # with no terms (empty text) is not maximally similar to itself, and exact
    # duplicates of the target tie with it.
    sims_items = [ (i, sim) for i, sim in enumerate(cosine_sims) if i != target_i ]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    return sims_items[:top_n]

In [6]:
class RecipeRecommenderSystem:
    """Content-based recipe recommender.

    Similar recipes are found from three signals: tag/ingredient overlap
    (Jaccard), TF-IDF cosine similarity of the recipe descriptions, and TF-IDF
    cosine similarity of the recipe reviews. Recipes are addressed internally
    by row position ("index") and externally by recipe id (the dataframe index
    label).
    """

    def __init__(self):
        # recipe id -> row position, and the inverse mapping
        self.id_to_index = None
        self.index_to_id = None
        # Per-recipe tag / ingredient collections, in row order
        self.tags_corpus = None
        self.ingredients_corpus = None
        # TF-IDF matrices with one row per recipe, in row order
        self.desc_tfidf_matrix = None
        self.reviews_tfidf_matrix = None

    # train
    def train(self, recipes_dataframe, recipe_reviews_dataframe, n=None):
        """Build the id mappings, corpora and TF-IDF matrices.

        Parameters
        ----------
        recipes_dataframe : pandas.DataFrame
            Indexed by recipe id, with columns 'tags', 'ingredients' and
            'description'.
        recipe_reviews_dataframe : pandas.DataFrame
            With a 'reviews' text column. Assumed to be indexed by the same
            recipe ids as `recipes_dataframe` — TODO confirm; rows are matched
            to recipes by index label.
        n : int or None, default None
            Only use the first `n` recipes; None uses the whole dataframe.
        """
        recipes = recipes_dataframe.head(n)

        # Derive the mappings from the frame that was passed in (not from a
        # notebook global) and from the same rows as the corpora, so that
        # positions, ids and matrix rows always agree.
        self.id_to_index = { id_: i for i, id_ in enumerate(recipes.index) }
        self.index_to_id = { i: id_ for i, id_ in enumerate(recipes.index) }

        self.tags_corpus =        list(recipes['tags'].values)
        self.ingredients_corpus = list(recipes['ingredients'].values)

        # Train TFIDF model on recipe descriptions
        print('Training descriptions TFIDF model ...')
        descriptions = list(recipes['description'].fillna('').values)
        desc_tfidf = TfidfVectorizer(preprocessor=text_preprocessor, ngram_range=(1, 2))
        self.desc_tfidf_matrix = desc_tfidf.fit_transform(descriptions)

        # Train TFIDF model on recipe reviews
        print('Training reviews TFIDF model ...')
        reviews_series = recipe_reviews_dataframe['reviews']
        if not reviews_series.index.equals(recipes.index):
            # Row i of the reviews matrix must describe the same recipe as row
            # i of the other corpora; recipes without a review row get ''.
            reviews_series = reviews_series.reindex(recipes.index)
        reviews = list(reviews_series.fillna('').values)
        reviews_tfidf = TfidfVectorizer(preprocessor=text_preprocessor, ngram_range=(1, 1))
        self.reviews_tfidf_matrix = reviews_tfidf.fit_transform(reviews)


    # Given the index of a recipe, get similar recipes based on tags, description and reviews
    def get_similar_recipes(self, target_i, top_n=5):
        """Return up to `top_n` (row position, similarity) pairs, most similar first.

        The top results of the three signals are pooled. A recipe found by
        more than one signal is reported once, with its highest similarity,
        and the target recipe itself is never returned.
        """
        candidates = (
            get_similar_items_tags(self.tags_corpus, self.ingredients_corpus, target_i, top_n=top_n)
            + get_similar_items_TFIDF(self.desc_tfidf_matrix, target_i, top_n=top_n)
            + get_similar_items_TFIDF(self.reviews_tfidf_matrix, target_i, top_n=top_n)
        )

        best_sim = {}
        for i, sim in candidates:
            if i == target_i:
                continue
            if i not in best_sim or sim > best_sim[i]:
                best_sim[i] = sim

        return sorted(best_sim.items(), reverse=True, key=lambda item: item[1])[:top_n]


    # Given a list of recipes and their ratings (assumed to represent a user), get list of
    # k recommendations based on recipe ratings
    def get_recommendations_from_recipe_ratings(self, recipe_ratings, ratings_list, k=10,
                                                rating_thresh=2, max_rating=5):
        """Recommend recipes from a user's rated recipes.

        For every rated recipe the `k` most similar recipes are looked up and
        each one's similarity, multiplied by a weight derived from the rating,
        is added to that recipe's score.

        Parameters
        ----------
        recipe_ratings : iterable
            Recipe ids the user rated (despite the name, these are ids).
        ratings_list : iterable
            The rating given to each recipe, in the same order.
        k : int, default 10
            Number of similar recipes fetched per rated recipe and maximum
            number of recommendations returned.
        rating_thresh : number, default 2
            Rating that maps to weight 0; lower ratings get a negative weight.
        max_rating : number, default 5
            Rating that maps to weight 1.

        Returns
        -------
        list of (recipe id, score)
            At most `k` pairs, highest score first; [] when there are no
            ratings.

        Raises
        ------
        ValueError
            If `max_rating` is not greater than `rating_thresh`.
        KeyError
            If a rated recipe id is unknown to the trained model.
        """
        if max_rating <= rating_thresh:
            raise ValueError('max_rating must be greater than rating_thresh')

        scores = {}
        for recipe_id, rating in zip(recipe_ratings, ratings_list):
            # rating of max_rating -> 1, rating of rating_thresh -> 0, below -> negative
            weight = (rating - rating_thresh) / (max_rating - rating_thresh)
            recipe_index = self.id_to_index[recipe_id]
            for i, sim in self.get_similar_recipes(recipe_index, top_n=k):
                result_id = self.index_to_id[i]
                scores[result_id] = scores.get(result_id, 0) + sim * weight

        # With no ratings the loop never runs and this is simply [].
        return sorted(scores.items(), reverse=True, key=lambda x: x[1])[:k]

In [None]:
# Init model
# Stand-alone (module-level) version of the corpora and TF-IDF matrices.
n = None # Set to None to load whole dataframe
tags_corpus = df_recipes.head(n)['tags'].tolist()
ingredients_corpus = df_recipes.head(n)['ingredients'].tolist()

# Fit a TF-IDF model (unigrams and bigrams) on the recipe descriptions;
# missing descriptions are treated as empty text.
print('Training descriptions TFIDF model ...')
descriptions = df_recipes.head(n)['description'].fillna('').tolist()
desc_tfidf = TfidfVectorizer(ngram_range=(1, 2), preprocessor=text_preprocessor)
desc_tfidf_matrix = desc_tfidf.fit_transform(descriptions)

# Fit a TF-IDF model (unigrams only) on the recipe reviews;
# missing reviews are treated as empty text.
print('Training reviews TFIDF model ...')
reviews = df_recipe_reviews.head(n)['reviews'].fillna('').tolist()
reviews_tfidf = TfidfVectorizer(ngram_range=(1, 1), preprocessor=text_preprocessor)
reviews_tfidf_matrix = reviews_tfidf.fit_transform(reviews)

In [None]:
# Build the recommender and fit it on the recipe and review frames loaded earlier.
# train() prints a progress line before fitting each TF-IDF model.
recommender = RecipeRecommenderSystem()
recommender.train(df_recipes, df_recipe_reviews)

In [None]:
# Pretty-print a list of (recipe id, score) recommendations, one per line:
# rank, score, recipe id and title-cased recipe name.
# NOTE(review): recommendations produced by
# RecipeRecommenderSystem.get_recommendations_from_recipe_ratings carry recipe
# ids (index labels), so rows are looked up with .loc. If this list is instead
# filled with row positions (as returned by get_similar_recipes), use .iloc.
recommendations = []
for rank, (recipe_id, sim) in enumerate(recommendations, start=1):
    recipe = df_recipes.loc[recipe_id]
    print('{:>2}: [{:.3f}] ({}) "{}"'.format(rank, sim, recipe_id, recipe['name'].title()))

In [None]:
# Inspect one training user (the fifth row of df_train): pair every rated
# recipe with its rating and list the pairs from lowest to highest rating.
user_id = df_train.index[4]
user_data = df_train.loc[user_id]
user_ratings = sorted(
    zip(user_data['rated_recipes'], user_data['rating_list']),
    key=lambda pair: pair[1],
)
print(user_ratings)

In [None]:

# Generate recommendations for those of the first 100 training users that also
# appear in the test split. Results are kept per user in
# `recommendations_by_user`; `recommendations` holds the most recent result.
recommendations_by_user = {}
for user_id in df_train.index[:100]:
    if user_id in df_test.index:
        train_row = df_train.loc[user_id]
        recommendations = recommender.get_recommendations_from_recipe_ratings(
            train_row['rated_recipes'], train_row['rating_list']
        )
        recommendations_by_user[user_id] = recommendations

#### Testing

In [None]:
# 
...