In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import defaultdict
import random
import string
import sklearn
import statsmodels
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from gensim.models import Word2Vec

In [2]:
raw_recipes = pd.read_csv('RecipesDataset/RAW_recipes.csv', delimiter=",", low_memory=False)
raw_interactions = pd.read_csv('RecipesDataset/RAW_interactions.csv', delimiter=",", low_memory=False)

In [3]:
# Filter out recipes with invalid minute times or take weeks of preparation
filtered_recipes = raw_recipes[raw_recipes['minutes'] < 1000]

In [4]:
# Filter out interactions that include recipes that were filtered out
PP_interactions = raw_interactions[raw_interactions['recipe_id'].isin(filtered_recipes['id'])]

In [5]:
# GOAL:
# Create a DataFrame off of interactions with the following columns:
# user_id
# recipe_id
# rating
# month - 1: January, 2: February, ..., 12: December
# season - 0: winter, 1: spring, 2: summer, 3: fall
# is_holiday_month - 1: yes, 0: no
# pm_rating_avg - rating average of the recipe in the previous month
# pm_review_count - number of reviews of the recipe in the previous month

In [6]:
# GOAL:
# Create a DataFrame off of recipes with the following columns:
# recipe_id
# minutes - time to prepare the recipe
# rating_avg - (0-5)
# review_count - number of reviews
# has_seasonal_tag - 1: yes, 0: no
# age - Time since the recipe was created to 2018-12

In [7]:
print(raw_recipes.columns)
print(raw_recipes.head())

Index(['name', 'id', 'minutes', 'contributor_id', 'submitted', 'tags',
       'nutrition', 'n_steps', 'steps', 'description', 'ingredients',
       'n_ingredients'],
      dtype='object')
                                         name      id  minutes  \
0  arriba   baked winter squash mexican style  137739       55   
1            a bit different  breakfast pizza   31490       30   
2                   all in the kitchen  chili  112140      130   
3                          alouette  potatoes   59389       45   
4          amish  tomato ketchup  for canning   44061      190   

   contributor_id   submitted  \
0           47892  2005-09-16   
1           26278  2002-06-17   
2          196586  2005-02-25   
3           68585  2003-04-14   
4           41706  2002-10-25   

                                                tags  \
0  ['60-minutes-or-less', 'time-to-make', 'course...   
1  ['30-minutes-or-less', 'time-to-make', 'course...   
2  ['time-to-make', 'course', 'preparation', 'ma

In [8]:
# From the columns, id and minutes are already in the DataFrame
# Let's add the age and has_seasonal_tag columns
PP_recipes = filtered_recipes.drop(columns=['name', 'contributor_id', 'nutrition', 'n_steps', 'steps', 'description', 'ingredients', 'n_ingredients'])
PP_recipes['age'] = 2018 - pd.to_datetime(PP_recipes['submitted']).dt.year

seasonal_tags = ['holiday-event', 'seasonal', 'christmas', 'summer', 'fall', 'winter', 'spring', 'thanksgiving']

PP_recipes['has_seasonal_tag'] = [1 if tag in seasonal_tags else 0 for tag in PP_recipes['tags']]

print(PP_recipes.columns)
print(PP_recipes.head())

Index(['id', 'minutes', 'submitted', 'tags', 'age', 'has_seasonal_tag'], dtype='object')
       id  minutes   submitted  \
0  137739       55  2005-09-16   
1   31490       30  2002-06-17   
2  112140      130  2005-02-25   
3   59389       45  2003-04-14   
4   44061      190  2002-10-25   

                                                tags  age  has_seasonal_tag  
0  ['60-minutes-or-less', 'time-to-make', 'course...   13                 0  
1  ['30-minutes-or-less', 'time-to-make', 'course...   16                 0  
2  ['time-to-make', 'course', 'preparation', 'mai...   13                 0  
3  ['60-minutes-or-less', 'time-to-make', 'course...   15                 0  
4  ['weeknight', 'time-to-make', 'course', 'main-...   16                 0  


In [9]:
# Let's add rating average and review count
# Create a dictionary of recipe_id to a tuple of (rating_sum, review_count)
# For each review in interactions, add the rating to the sum and increment the review count

rating_dict = defaultdict(tuple)

for index, row in PP_interactions.iterrows():
    recipe_id = row['recipe_id']
    rating = row['rating']
    if recipe_id in rating_dict:
        rating_dict[recipe_id] = (rating_dict[recipe_id][0] + rating, rating_dict[recipe_id][1] + 1)
    else:
        rating_dict[recipe_id] = (rating, 1)

In [10]:
# Add the rating average and review count to the DataFrame
PP_recipes['rating_avg'] = [round(rating_dict[recipe_id][0] / rating_dict[recipe_id][1], 1) for recipe_id in PP_recipes['id']]
PP_recipes['review_count'] = [rating_dict[recipe_id][1] for recipe_id in PP_recipes['id']]

In [11]:
# Drop the submitted and tags columns.
if 'submitted' in PP_recipes.columns and 'tags' in PP_recipes.columns:
    PP_recipes = PP_recipes.drop(columns=['submitted', 'tags'])

print(PP_recipes.columns)
print(PP_recipes.head())

Index(['id', 'minutes', 'age', 'has_seasonal_tag', 'rating_avg',
       'review_count'],
      dtype='object')
       id  minutes  age  has_seasonal_tag  rating_avg  review_count
0  137739       55   13                 0         5.0             3
1   31490       30   16                 0         3.5             4
2  112140      130   13                 0         4.0             1
3   59389       45   15                 0         4.5             2
4   44061      190   16                 0         5.0             1


In [12]:
# Now let's do interactions.
# GOAL:
# Create a DataFrame off of interactions with the following columns:
# user_id
# recipe_id
# rating
# month - 1: January, 2: February, ..., 12: December
# season - 0: winter, 1: spring, 2: summer, 3: fall
# is_holiday_month - 1: yes, 0: no
print(PP_interactions.columns)

Index(['user_id', 'recipe_id', 'date', 'rating', 'review'], dtype='object')


In [13]:
# Drop the review column
if 'review' in PP_interactions.columns:
    PP_interactions = PP_interactions.drop(columns=['review'])
print(PP_interactions.columns)

Index(['user_id', 'recipe_id', 'date', 'rating'], dtype='object')


In [14]:
# Create month, season, is_holiday_month columns off of date
PP_interactions['date'] = pd.to_datetime(PP_interactions['date'])
PP_interactions['month'] = PP_interactions['date'].dt.month
PP_interactions['season'] = [0 if month in [12, 1, 2] else 1 if month in [3, 4, 5] else 2 if month in [6, 7, 8] else 3 for month in PP_interactions['month']]
PP_interactions['is_holiday_month'] = [1 if month in [11, 12, 1] else 0 for month in PP_interactions['month']]

print(PP_interactions.columns)

Index(['user_id', 'recipe_id', 'date', 'rating', 'month', 'season',
       'is_holiday_month'],
      dtype='object')


In [15]:
print(PP_interactions.head())
print(PP_recipes.head())

   user_id  recipe_id       date  rating  month  season  is_holiday_month
0    38094      40893 2003-02-17       4      2       0                 0
1  1293707      40893 2011-12-21       5     12       0                 1
2     8937      44394 2002-12-01       4     12       0                 1
3   126440      85009 2010-02-27       5      2       0                 0
4    57222      85009 2011-10-01       5     10       3                 0
       id  minutes  age  has_seasonal_tag  rating_avg  review_count
0  137739       55   13                 0         5.0             3
1   31490       30   16                 0         3.5             4
2  112140      130   13                 0         4.0             1
3   59389       45   15                 0         4.5             2
4   44061      190   16                 0         5.0             1


In [16]:
# Create DataFrames that filter out all recipes with under 5 interactions
PP_recipes_5Rev = PP_recipes[PP_recipes['review_count'] >= 5]
PP_interactions_5Rev = PP_interactions[PP_interactions['recipe_id'].isin(PP_recipes_5Rev['id'])]

In [17]:
print(PP_interactions_5Rev.head())
print(PP_recipes_5Rev.head())

print(len(PP_interactions), len(PP_interactions_5Rev))
print(len(PP_recipes), len(PP_recipes_5Rev))

    user_id  recipe_id       date  rating  month  season  is_holiday_month
8     76535     134728 2005-09-02       4      9       3                 0
9    273745     134728 2005-12-22       5     12       0                 1
10   353911     134728 2006-09-26       5      9       3                 0
11   190375     134728 2007-03-09       5      3       1                 0
12   468945     134728 2008-02-20       0      2       0                 0
       id  minutes  age  has_seasonal_tag  rating_avg  review_count
9   75452       70   15                 0         4.4             5
15  63986      500   15                 0         4.4            19
16  43026       45   16                 0         4.0            22
17  23933       15   16                 0         4.8            12
33  54100       26   15                 0         4.4            14
1121540 803903
229396 53150


In [18]:
new_data = raw_interactions[raw_interactions['recipe_id'].isin(filtered_recipes['id'])]
new_data = new_data[new_data['recipe_id'].isin(PP_recipes_5Rev['id'])]
new_data['date'] = pd.to_datetime(new_data['date'])
new_data = new_data.sort_values(by='date')

In [19]:
new_data_train = new_data[new_data['date'].dt.year < 2013]
new_data_valid = new_data[(new_data['date'].dt.year >= 2013) & (new_data['date'].dt.year <= 2015)]
new_data_test = new_data[new_data['date'].dt.year > 2015]

In [20]:
valid_MSEs = []
"""
for l in range(100, 101):
    model = AutoReg(new_data_train['rating'], lags=l)
    model_fit = model.fit()

    valid_predictions = model_fit.predict(start=len(new_data_train), end=len(new_data_train) + len(new_data_valid) - 1, dynamic=False)

    valid_ratings = new_data_valid['rating'].values
    valid_predictions = valid_predictions[:len(valid_ratings)]

    valid_mse = mean_squared_error(new_data_valid['rating'], valid_predictions)
    valid_MSEs.append((valid_mse, l))
"""

"\nfor l in range(100, 101):\n    model = AutoReg(new_data_train['rating'], lags=l)\n    model_fit = model.fit()\n\n    valid_predictions = model_fit.predict(start=len(new_data_train), end=len(new_data_train) + len(new_data_valid) - 1, dynamic=False)\n\n    valid_ratings = new_data_valid['rating'].values\n    valid_predictions = valid_predictions[:len(valid_ratings)]\n\n    valid_mse = mean_squared_error(new_data_valid['rating'], valid_predictions)\n    valid_MSEs.append((valid_mse, l))\n"

In [21]:
# (2.8264165802308145, 1)
# (2.8264120072829284, 40)
# (2.826362389377399, 60)
# (2.8262528827609636, 100)
print(valid_MSEs)

[]


In [None]:
"""
valid_ratings = new_data_valid['rating'].values
valid_predictions = valid_predictions[:len(valid_ratings)]
"""

In [None]:
"""
valid_mse = mean_squared_error(new_data_valid['rating'], valid_predictions)
print(f'Validation MSE: {valid_mse}')
"""

Validation MSE: 2.8264165802308145


In [None]:
"""
test_predictions = model_fit.predict(start=len(new_data_train) + len(new_data_valid), end=len(new_data_train) + len(new_data_valid) + len(new_data_test) - 1, dynamic=False)

test_ratings = new_data_test['rating'].values
test_predictions = test_predictions[:len(test_ratings)]

test_mse = mean_squared_error(test_ratings, test_predictions)
print(f'Test MSE: {test_mse}')
"""

  return get_prediction_index(
  return get_prediction_index(
  fcast_index = self._extend_index(index, steps, forecast_index)


Test MSE: 5.07475858639176


In [27]:
PP_interactions_5Rev = PP_interactions_5Rev.set_index('date')
PP_interactions_5Rev = PP_interactions_5Rev.sort_index()

KeyError: "None of ['date'] are in the columns"

In [29]:
print(PP_interactions_5Rev.head())
print(PP_interactions_5Rev.index)

            user_id  recipe_id  rating  month  season  is_holiday_month
date                                                                   
2000-01-25     2008       3603       4      1       0                 1
2000-01-25     2008        992       5      1       0                 1
2000-02-25     2046        517       5      2       0                 0
2000-02-25     2046       4684       5      2       0                 0
2000-02-25     2046       4523       2      2       0                 0
DatetimeIndex(['2000-01-25', '2000-01-25', '2000-02-25', '2000-02-25',
               '2000-02-25', '2000-03-13', '2000-03-13', '2000-04-07',
               '2000-05-21', '2000-09-05',
               ...
               '2018-12-19', '2018-12-19', '2018-12-19', '2018-12-19',
               '2018-12-19', '2018-12-19', '2018-12-19', '2018-12-19',
               '2018-12-19', '2018-12-20'],
              dtype='datetime64[ns]', name='date', length=803903, freq=None)


In [31]:
# Set dataTest to the most recent data, from 2016-2018.
# Set dataValid to the second most recent data, from 2013-2015.
# Leave the remainder for dataTrain.
dataTrain = PP_interactions_5Rev[PP_interactions_5Rev.index.year < 2013]
dataValid = PP_interactions_5Rev[(PP_interactions_5Rev.index.year >= 2013) & (PP_interactions_5Rev.index.year <= 2015)]
dataTest = PP_interactions_5Rev[(PP_interactions_5Rev.index.year > 2015)]
print(len(dataTrain), len(dataValid), len(dataTest))

676445 73521 53937


In [41]:
exog_features = ['season', 'is_holiday_month']
model = AutoReg(dataTrain['rating'], lags=100, exog=dataTrain[exog_features])
model_fit = model.fit()

  self._init_dates(dates, freq)


In [47]:
print(dataTest.head())

               user_id  recipe_id  rating  month  season  is_holiday_month
date                                                                      
2016-01-01  2000697576     150863       5      1       0                 1
2016-01-01     1360126     297251       0      1       0                 1
2016-01-01  2000791565      10744       0      1       0                 1
2016-01-01  2000622331       2886       5      1       0                 1
2016-01-01      223656      87881       5      1       0                 1


In [61]:
valid_predictions = model_fit.predict(start=len(dataTrain), end=len(dataTrain)+len(dataValid)-1, exog_oos=dataValid[exog_features], dynamic=False)

  return get_prediction_index(
  return get_prediction_index(
  fcast_index = self._extend_index(index, steps, forecast_index)


In [62]:
valid_ratings = dataValid['rating'].values
valid_predictions = valid_predictions[:len(valid_ratings)]

In [None]:
#                 2.8264165802308145
# Validation MSE: 2.826118619758396, lags=100
valid_mse = mean_squared_error(valid_ratings, valid_predictions)
print(f'Validation MSE: {valid_mse}')

Validation MSE: 2.826118619758396
Validation MSE: 2.826118619758396


In [56]:
start = len(dataTrain)+len(dataValid)
end = len(dataTrain)+len(dataValid)+len(dataTest)-1

print(start, end)
print(end-start)


749966 803902
53936


In [66]:
test_predictions = model_fit.predict(start=len(dataTrain)+len(dataValid), end=len(dataTrain)+len(dataValid)+len(dataTest)-1, exog_oos=dataValid[exog_features]+dataTest[exog_features], dynamic=False)

  return get_prediction_index(
  return get_prediction_index(
  fcast_index = self._extend_index(index, steps, forecast_index)


In [67]:
test_ratings = new_data_test['rating'].values
test_predictions = test_predictions[:len(test_ratings)]

In [68]:
test_mse = mean_squared_error(test_ratings, test_predictions)
print(f'Test MSE: {test_mse}')

ValueError: Input contains NaN.