## Setup

In [1]:
import os
# Give Google Drive the required permission and mount it into the Colab
# filesystem at /content/drive so the notebook can read and write Drive files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create a folder in your Google Drive and put its name in the path below.
# For example, for the code below to run correctly, you need a folder named
# FoodRecSys in 'My Drive'.
# That folder becomes the working directory, so data can be saved to and read
# from it. NOTE(review): the next cell changes the working directory again,
# so this setting only lasts until that cell runs.

os.chdir("/content/drive/MyDrive/FoodRecSys/")
os.getcwd()

'/content/drive/MyDrive/FoodRecSys'

In [3]:
# Switch the working directory to the project's Code_Files folder (this
# replaces the directory set by the previous cell) and echo it to confirm.
os.chdir("/content/drive/MyDrive/food_recsys_project/Code_Files/")
os.getcwd()

'/content/drive/MyDrive/food_recsys_project/Code_Files'

In [4]:
try:
  import pyspark
except:
  !pip install pyspark==3.1.2
  import pyspark

Collecting pyspark==3.1.2
  Downloading pyspark-3.1.2.tar.gz (212.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.4/212.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py4j==0.10.9 (from pyspark==3.1.2)
  Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m198.6/198.6 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.2-py2.py3-none-any.whl size=212880745 sha256=bebbade96cb9a06eb73d0f4dfc65fde475adc02ec3c909ac977f319ce5f242e9
  Stored in directory: /root/.cache/pip/wheels/ef/70/50/7882e1bcb5693225f7cc86698f10953201b48b3f36317c2d18
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0

In [5]:
from pyspark.sql import SparkSession
from pyspark.context import SparkContext

In [6]:
# Build (or reuse) a Spark session running in local mode, with the Spark UI
# configured on port 4050.
spark = (
    SparkSession.builder
    .master("local")
    .config('spark.ui.port', '4050')
    .getOrCreate()
)

In [7]:
# Display the session object to confirm that Spark started.
spark

## Imports

In [8]:
import pandas as pd
import numpy as np

from pyspark.sql import functions as F
# Import for typecasting columns
from pyspark.sql.types import IntegerType,BooleanType,DateType,FloatType,StringType, ArrayType

## Read the data

In [9]:
# Load the user-recipe ratings CSV. The first line holds the column names and
# Spark infers each column's type from the data.
raw_ratings_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/raw_ratings_small.csv")  # modify the path to read the data
)

In [10]:
# Preview the ratings data as a text table to sanity-check the load.
raw_ratings_df.show()

+-------+---------+------+--------------------+-----------+
|user_id|recipe_id|rating|              review|review_date|
+-------+---------+------+--------------------+-----------+
| 483827|   306785|     5|Being a  huge fan...| 2008-07-15|
|   6258|    20930|     5|Jan  what an inte...| 2002-07-09|
| 102602|    20930|     5|Jan  we love your...| 2003-10-26|
| 296027|   182985|     4|Very nice fresh s...| 2007-05-19|
|   9580|   208980|     5|I saw this on 30 ...| 2007-02-05|
| 462571|   208980|     5|These were a snap...| 2007-07-05|
| 376098|   208980|     5|This was great! M...| 2007-08-01|
| 222139|   208980|     5|I chose this reci...| 2007-09-08|
| 674484|   208980|     5|Since this is one...| 2008-09-15|
|1251627|   208980|     5|This recipie was ...| 2009-06-23|
| 456221|   208980|     5|This is a favorit...| 2010-02-20|
| 229619|   208980|     5|I was flipping th...| 2011-06-26|
| 280271|   219118|     4|This is so yummy....| 2008-11-09|
| 305531|   219118|     4|Really good ca

In [11]:
# Load the recipe metadata CSV. The first line holds the column names and
# Spark infers each column's type from the data.
raw_recipes_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/content/raw_recipies_small.csv")  # modify the path to read the data
)

In [12]:
# Preview the recipes data as a text table to sanity-check the load.
raw_recipes_df.show()

+--------------------+------+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|                name|    id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|         description|         ingredients|n_ingredients|year_of_review|
+--------------------+------+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|beat this  banana...| 75452|     70|         15892|2003-11-04|['weeknight', 'ti...|[2669.3, 160.0, 9...|     12|['preheat oven to...|  from ann hodgman's|['sugar', 'unsalt...|            9|          2003|
|      chinese  candy| 23933|     15|         35268|2002-03-29|['15-minutes-or-l...|[232.7, 21.0, 77....|      4|['melt butterscot...|a little differen...|['butterscotch ch...|

In [13]:
# Sanity checks: both raw datasets must have the expected row and column counts.
assert raw_recipes_df.count() == 20340 and len(raw_recipes_df.columns) == 13
assert raw_ratings_df.count() == 93357 and len(raw_ratings_df.columns) == 5

#### Decide a split date based on the ratings dataframe.

In [14]:
# Count the data points (rows) in the interaction (ratings) dataset.
# count() returns a plain Python integer; the train/test split sizes in the
# following cells are computed from it.

num_review_int = raw_ratings_df.count()
num_review_int

93357

# Task 01 - Train Test Split

Divide the data into train and test based on the 80 - 20 split using the approach discussed. You will have to save the data in a parquet file.

In [15]:
# 80% of all reviews, rounded to the nearest whole review, go to training.
train_num_reviews_int = round(0.8 * num_review_int)
train_num_reviews_int

74686

In [16]:
# The remaining 20% of reviews, rounded to the nearest whole review, go to testing.
test_num_reviews_int = round(0.2 * num_review_int)
test_num_reviews_int

18671

In [17]:
# Order the interactions from newest to oldest review date, then keep only the
# `test_num_reviews_int` most recent reviews. The oldest date in this slice is
# used as the train/test split date.

newest_first_df = raw_ratings_df.orderBy(raw_ratings_df["review_date"].desc())
temp_ratings_df = newest_first_df.limit(test_num_reviews_int)

In [18]:
# The most-recent slice must contain exactly the expected number of test reviews.
assert temp_ratings_df.count()  == 18671

# Verify that the slice really is ordered newest-first: a later row must not
# have a more recent review_date (column index 4) than the row before it.
# This check previously read rows from raw_recipes_df, which is unrelated to
# the sort above and so validated nothing about it. The rows are collected once
# instead of once per operand.
sorted_rows = temp_ratings_df.collect()
assert sorted_rows[11][4] <= sorted_rows[10][4]

In [19]:
# The last row of the newest-first slice holds its oldest review_date
# (column index 4); that date is used as the train/test split date.
temp_ratings_df.collect()[-1][4]

'2011-07-17'

Split the data into two parts: reviews dated before 2011-07-17 (train) and reviews dated on or after 2011-07-17 (test).

- All reviews in the ratings data dated on or after 2011-07-17 will not exist in the training set.
- For all future predictions the date will be set at 2011-07-18.   

In [20]:
# Join raw_ratings with raw_recipes so every interaction carries its recipe's
# metadata. The key is ratings.recipe_id == recipes.id.
# The result must keep ALL rows from raw_ratings, so the join type is an
# explicit left join. The default (inner) join would silently drop any rating
# whose recipe_id has no match in raw_recipes.

interaction_level_df = raw_ratings_df.join(
    raw_recipes_df,
    on=raw_ratings_df.recipe_id == raw_recipes_df.id,
    how="left",
)

In [21]:
# Preview the joined interaction-level data (rating columns followed by recipe columns).
interaction_level_df.show()

+----------+---------+------+--------------------+-----------+--------------------+-----+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|   user_id|recipe_id|rating|              review|review_date|                name|   id|minutes|contributor_id| submitted|                tags|           nutrition|n_steps|               steps|         description|         ingredients|n_ingredients|year_of_review|
+----------+---------+------+--------------------+-----------+--------------------+-----+-------+--------------+----------+--------------------+--------------------+-------+--------------------+--------------------+--------------------+-------------+--------------+
|    152552|    29601|     5|Excellent. It was...| 2004-08-20|zippy cold spicy ...|29601|    270|         30367|2002-05-29|['time-to-make', ...|[277.2, 15.0, 13....|     12|['in a heavy 10-i...|shrimp d

In [22]:
# Training set: every interaction reviewed strictly BEFORE the split date
# '2011-07-17'.

train_interaction_level_df = interaction_level_df.where(
    interaction_level_df["review_date"] < '2011-07-17'
)

In [23]:
# Test set: every interaction reviewed ON OR AFTER the split date '2011-07-17'.

test_interaction_level_all_recipies_df = interaction_level_df.where(
    interaction_level_df["review_date"] >= '2011-07-17'
)

In [24]:
# Sanity checks: each split must have the expected row count and keep all 18
# joined columns.
assert (
    test_interaction_level_all_recipies_df.count() == 18684
    and len(test_interaction_level_all_recipies_df.columns) == 18
)
assert (
    train_interaction_level_df.count() == 74673
    and len(train_interaction_level_df.columns) == 18
)

In [25]:
# Create the data files for modeling: each split is collapsed to one partition
# and written as parquet, replacing any output left by a previous run.
# Change the file names and file paths below as needed.

train_interaction_level_df.coalesce(1).write.parquet(
    'train_interaction_level_df.parquet',
    mode='overwrite',
)

test_interaction_level_all_recipies_df.coalesce(1).write.parquet(
    'test_interaction_level_all_recipies_df.parquet',
    mode='overwrite',
)