In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
from ast import literal_eval

pd.set_option('display.width', 1400)

#### Helper Functions

In [86]:
def format_text(text, line_width=120):
    lines = []
    line = ''
    for word in text.split():
        if len(line) == 0:
            line = word
        elif len(line + ' ' + word) > line_width:
            lines.append(line)
            line = ''
        else:
            line += ' ' + word
    lines.append(line)
    return '\n'.join(lines)

#### Load Data

In [2]:
# Load necessary dataframes
converters = { k: literal_eval for k in ['tags', 'ingredients', 'steps', 'nutrition'] } # for evaluating strings as arrays (eg. tags)

df_recipes = pd.read_csv('dataset/RAW_recipes.csv', converters=converters, index_col='id')
df_recipe_reviews = pd.read_csv('dataset/Recipe_Reviews.csv', index_col='id')

# df_interact = pd.read_csv('dataset/RAW_interactions.csv', dtype={'review': str})
# converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
# df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')

#### Create Training and Test Data

In [109]:
def get_train_and_test_data(df):
    from sklearn.model_selection import train_test_split
    df_exploded = df.explode(['rated_recipes', 'rating_list'])
    df_train_exploded, df_test_exploded = train_test_split(df_exploded, test_size=0.2, random_state=42)
    df_train = df_train_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    df_test = df_test_exploded.groupby(level=0).agg(list).drop('ingredients', axis=1)
    all_user_ids = df.index
    df_train = df_train.reindex(all_user_ids, fill_value=[]) # Re-index to ensure all user_ids are included
    df_test = df_test.reindex(all_user_ids, fill_value=[])
    return df_train, df_test

In [113]:
# Create or load training and test data
converters = { k: literal_eval for k in ['rated_recipes', 'ingredients', 'rating_list'] }
train_fn = 'dataset/User_Data_Train.csv'
test_fn = 'dataset/User_Data_Test.csv'
if os.path.exists(train_fn):
    print('Loading train and test userdata ...')
    df_train = pd.read_csv(train_fn, converters=converters, index_col='user_id')
    df_test =  pd.read_csv(test_fn, converters=converters, index_col='user_id')
else:
    print('Reading userdata dataframe ...')
    df_userdata = pd.read_csv('dataset/User_Data.csv', converters=converters, index_col='user_id')
    print('Splitting userdata into training and test data ...')
    df_train, df_test = get_train_and_test_data(df_userdata.head(None).copy())
    df_train.to_csv(train_fn)
    df_test.to_csv(test_fn)
print('Done.')

Reading userdata dataframe ...
Splitting userdata into training and test data ...
Done.


#### TF-IDF Models

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

STOPWORDS = list(nltk.corpus.stopwords.words('english'))
STEMMER = nltk.stem.snowball.SnowballStemmer('english')
LEMMATIZER = nltk.stem.WordNetLemmatizer()

def text_preprocessor(document):
    tokens = []
    for sentence in sent_tokenize(document.lower()):
        words = word_tokenize(sentence)
        words = [ word for word in words if (word.isalpha() and word not in STOPWORDS) ]
        words = [ STEMMER.stem(word) for word in words ]
        words = [ LEMMATIZER.lemmatize(word, pos="v") for word in words ]
        tokens.extend(words)
    return ' '.join(tokens)

In [8]:
def get_similar_items_TFIDF(matrix, target_i, top_n=10):
    target_vector = matrix[target_i]
    cosine_sims = cosine_similarity(target_vector, matrix)[0]
    sims_items = [ (i, sim) for i, sim in enumerate(cosine_sims) ]
    sims_items.sort(reverse=True, key=lambda item: item[1])
    return sims_items[1:top_n+1]

In [196]:
from scipy.sparse import hstack, vstack, csr_matrix, save_npz, load_npz

In [None]:
# Init model
n = 50000 # Set to None to load whole dataframe
recipes_dataframe = df_recipes.head(n).copy()
recipe_reviews_dataframe = df_recipe_reviews.head(n).copy()

id_to_index_RECIPES = { id_: i for i, id_ in enumerate(df_recipes.index) }
index_to_id_RECIPES = { i: id_ for i, id_ in enumerate(df_recipes.index) }

id_to_index_USERS = { id_: i for i, id_ in enumerate(df_train.index) }
index_to_id_USERS = { i: id_ for i, id_ in enumerate(df_train.index) }

print('Generating keywords TFIDF model ...')
tags_corpus = [ ' '.join(x).replace('-', '') for x in recipes_dataframe['tags'].values ]
ingr_corpus = [ ' '.join(x)                  for x in recipes_dataframe['ingredients'].values ]
keywords_corpus = [ f'{tag} {ing}' for tag, ing in zip(tags_corpus, ingr_corpus) ]
keywords_tfidf = TfidfVectorizer()
keywords_tfidf_matrix = keywords_tfidf.fit_transform(keywords_corpus)

# Train TFIDF model on recipe descriptions
print('Generating descriptions TFIDF model ...')
descriptions_corpus = list(recipes_dataframe['description'].fillna('').values)
desc_tfidf = TfidfVectorizer(preprocessor=text_preprocessor, ngram_range=(1, 2))
desc_tfidf_matrix = desc_tfidf.fit_transform(descriptions_corpus)

# Train TFIDF model on recipe reviews
print('Generating reviews TFIDF model ...')
reviews_corpus = list(recipe_reviews_dataframe['reviews'].fillna('').values)
reviews_tfidf = TfidfVectorizer(preprocessor=text_preprocessor) # Only unigrams for lost computational cost
reviews_tfidf_matrix = reviews_tfidf.fit_transform(reviews_corpus)

# Combine to one matrix which represents all item profiles
tfidf_matrix = hstack([keywords_tfidf_matrix, desc_tfidf_matrix, reviews_tfidf_matrix], format='csr')

In [None]:
# Show results for similar items
target_i = 425
target_recipe = df_recipes.iloc[target_i]
top_results = get_similar_items_TFIDF(tfidf_matrix, target_i, top_n=5)
print('TARGET RECIPE:')
print('"{}" (index: {:_}):'.format(target_recipe['name'].replace('  ', ' - ').title(), target_i))
print(format_text(target_recipe['description']))
print('\nRECOMMENDATIONS:')
for j, (i, sim) in enumerate(top_results):
    recipe = df_recipes.iloc[i]
    print('\n  {:>2}: SIM: {:.3f}   NAME: "{}"  (index: {:_})'.format( j+1, sim, recipe['name'].replace('  ', ' - ').title(), i))
    print(format_text(recipe['description']))

#### Generating user profiles as weighted average of item profiles

In [98]:
# 
user_id = 1533
userdata_train = df_train.loc[user_id]

rated_recipes = userdata_train['rated_recipes']
rating_list = userdata_train['rating_list']
rated_recipes_indices = [ id_to_index_RECIPES[id_] for id_ in rated_recipes if id_ in id_to_index_RECIPES ]

print('Found {} rated recipes'.format(len(rated_recipes)))

print('Extracting and scaling user items ...')
user_items = [ tfidf_matrix[idx].multiply(rating-2) for idx, rating in zip(rated_recipes_indices, rating_list) if rating > 2 ]

print('Calculating user profile ...')
stacked_item_profiles = vstack(user_items) # Stack vertically in sparce matrix
user_profile = stacked_item_profiles.mean(axis=0) # Get row-wise mean

Found 102 rated recipes
Extracting and scaling user items ...
Calculating user profile ...


In [186]:
# Generate user profile matrix
user_profiles_fn = 'dataset/user_profile_matrix.npz'

user_profile_matrix = []

IDX = 0
for i, row in df_train.iterrows():
    IDX += 1
    print('\rGenerating user profile for user {:_}/{:_}'.format(IDX, len(df_train)), end='')
    rated_recipes = row['rated_recipes']
    rating_list = row['rating_list']
    rated_recipes_indices = [ id_to_index_RECIPES[id_] for id_ in rated_recipes if id_ in id_to_index_RECIPES ]
    
    user_items = [ csr_matrix(tfidf_matrix[idx].multiply(rating-2)) for idx, rating in zip(rated_recipes_indices, rating_list) if rating > 2 ]

    if user_items != []: # if scaled item profiles found
        stacked_item_profiles = vstack(user_items) # Stack vertically in sparce matrix
        user_profile = stacked_item_profiles.mean(axis=0) # Get row-wise mean
        user_profile_matrix.append(csr_matrix(user_profile))
    else:
        user_profile_matrix.append(csr_matrix((1, tfidf_matrix.shape[1])))
    
    if IDX >= 1000: break
print()

user_profile_matrix = vstack(user_profile_matrix)
user_profile_matrix.shape

(1000, 1586424)

In [None]:
# SAVE Matrix
save_npz(user_profiles_fn, user_profile_matrix)

In [None]:
# LOAD Matrix
user_profile_matrix = load_npz(user_profiles_fn)

In [200]:
matrix = user_profile_matrix
target_i = 3
top_n = 5

target_vector = matrix[target_i]
cosine_sims = cosine_similarity(target_vector, matrix)[0]
sims_items = [ (i, sim) for i, sim in enumerate(cosine_sims) ]
sims_items.sort(reverse=True, key=lambda item: item[1])
sims_items[1:top_n+1]

[(193, 0.33051352195026495),
 (234, 0.3261000913703869),
 (953, 0.3260481101308946),
 (653, 0.3247498214395159),
 (562, 0.31402298085117164)]

In [201]:
# 
i = 0
userdata_test = df_test.iloc[i]
rated_recipes_test = userdata_test['rated_recipes']
rating_list_test = userdata_test['rating_list']

for recipe_id, rating in zip(rated_recipes_test, rating_list_test):
    similar_users_who_rated = [ idx for idx, row in df_train.iterrows() if recipe_id in row['rated_recipes'] ]
    print(similar_users_who_rated)
    break

[48136]


In [None]:
# Create csr matrix

data, rows, cols = [], [], []

for i, (user_id, row) in enumerate(df_train.iterrows()):
    print('\r{:_}/{:_}'.format(i+1, len(df_train)), end='')
    for recipe_id, rating in zip(row['rated_recipes'], row['rating_list']):
        user_IDX = id_to_index_USERS[user_id]
        recipe_IDX = id_to_index_RECIPES[recipe_id]
        data.append(rating)
        rows.append(user_IDX)
        cols.append(recipe_IDX)
print('\nDone.')

user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(df_train), len(df_recipes)))
print('Matrix made!')

In [223]:
user_item_matrix[0]
np.corrcoef([1,2,4,3], [1,2,2,3])[0][1]
i = 0
user_vector = user_item_matrix[i].toarray()
ccs = []
for j, sp_vector in enumerate(user_item_matrix):
    # vector = user_item_matrix[j].toarray()
    cc = np.corrcoef(user_vector, sp_vector.toarray())[0][1]
    ccs.append(cc)
ccs

  c /= stddev[:, None]
  c /= stddev[None, :]


[0.9999999999999999,
 0.003441840612631216,
 -4.290861793564e-05,
 -0.00026602326526232865,
 -0.00021891404012567395,
 -6.0310929533657284e-05,
 -7.38843220806153e-05,
 -0.00021112048252240192,
 -0.00020959667704185126,
 nan,
 -0.0001633959107577994,
 -4.290861793564236e-05,
 -4.290861793564022e-05,
 -4.290861793564105e-05,
 -9.918793228616558e-05,
 -4.290861793563934e-05,
 -9.594741496091209e-05,
 -4.290861793564248e-05,
 -4.2908617935641265e-05,
 -4.2908617935640696e-05,
 -0.00013543798607321627,
 nan,
 -0.00025085144793214197,
 nan,
 -4.2908617935641936e-05,
 -4.2908617935643426e-05,
 -0.0007777137527597544,
 -0.0005268757677901661,
 -9.012678843347741e-05,
 -4.290861793564273e-05,
 nan,
 -4.290861793563983e-05,
 -4.290861793564147e-05,
 nan,
 -4.290861793564078e-05,
 -4.290861793564332e-05,
 -6.068208041326429e-05,
 -0.0003029814836321278,
 -0.0001349275114899074,
 nan,
 -5.8870250256097946e-05,
 -4.290861793564084e-05,
 -0.00020682812339477066,
 nan,
 -4.2908617935640804e-05,
 -4.

In [222]:
for i, vector in enumerate(user_item_matrix):
    print(vector.toarray())
    if i > 5: break

[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
[[0 0 0 ... 0 0 0]]
