In [23]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re
import json
sns.set()

import warnings
# Silence every warning raised from here on (no category or module filter is
# given), which also hides pandas warnings such as SettingWithCopyWarning.
warnings.filterwarnings("ignore")

#### Import and clean recipes

In [40]:
# Read the "url" field from every line of the JSON-lines recipe dump.
with open('allrecipes-recipes.json') as g:
    records = (json.loads(raw) for raw in g)
    urls = [record["url"] for record in records]

# The recipe id is the text captured after the first "Recipe/" up to the next "/",
# converted to an integer.
recipe_IDs = pd.Series(urls).str.findall(r"Recipe/([^/]+)").str.get(0).map(int)

In [44]:
# Collect the "ingredients" field of every line of the dump into a Series.
with open('allrecipes-recipes.json') as g:
    records = (json.loads(raw) for raw in g)
    ingredients = pd.Series([record["ingredients"] for record in records])

In [31]:
# Collect the "title" field of every line of the dump into a Series.
with open('allrecipes-recipes.json') as g:
    records = (json.loads(raw) for raw in g)
    titles = pd.Series([record["title"] for record in records])

In [45]:
# Collect the "total_time_minutes" field of every line of the dump.
with open('allrecipes-recipes.json') as g:
    records = (json.loads(raw) for raw in g)
    prepare_time = [record["total_time_minutes"] for record in records]

# Store the values as integers in a Series.
prepare_time = pd.Series(prepare_time).map(int)

In [46]:
# Collect the "instructions" field of every line of the dump into a Series.
with open('allrecipes-recipes.json') as g:
    records = (json.loads(raw) for raw in g)
    instructions = pd.Series([record["instructions"] for record in records])

In [98]:
# Assemble the per-recipe Series into a single frame, one named column per field.
# All of them were read from the same file, so they share the same positional index.
recipes = pd.DataFrame({
    "Recipe_ID": recipe_IDs,
    "Recipe_name": titles,
    "Prepare_time": prepare_time,
    "Ingredients": ingredients,
    "Instructions": instructions,
})

#### Import and clean reviews

In [3]:
#import and clean reviews
# Opening the file checks that it exists and is readable. No cell visible in this
# notebook reads from `f`, so the handle is closed right away instead of being
# left open for the lifetime of the kernel.
with open("Data/reviews.csv") as f:
    pass

In [4]:
#Read the raw recipe/review dump line by line
import codecs

# Bytes that are not valid UTF-8 are dropped (errors='ignore') rather than raising.
with codecs.open("data/raw-data_recipe.csv", 'r', encoding='utf-8', errors='ignore') as fdata:
    lines = list(fdata)

# Keep only the lines that are longer than 10 characters.
lines_cut = [row for row in lines if len(row) > 10]

In [5]:
#Extract Recipe IDs
# The id is the text before the first comma of each line; the first kept line is
# skipped. Values are converted to integers.
recipe_ID = pd.Series(lines_cut[1:]).str.split(",").str.get(0).map(int)

In [6]:
#Extract Recipe Ratings
# For every line, collect the single character that follows each "'rating': "
# marker. The captured values are strings, not numbers.
rating_pattern = re.compile(r"\'rating\': (.)")
ratings = [rating_pattern.findall(row) for row in lines_cut[1:]]

In [7]:
#Extract Reviewer ID
# For every line, collect the word that directly precedes each ": {'rating':" marker.
reviewer_pattern = re.compile(r"(\w+): {\'rating\':")
reviewer_ID = [reviewer_pattern.findall(row) for row in lines_cut[1:]]

In [8]:
#Extract Recipe Reviews
# For every line, capture the text between each "'text': u" marker (plus the one
# character that follows it) and the next occurrence of "follow".
reviews = pd.Series(lines_cut[1:]).str.findall(r"\'text\': u.(.+?)follow")

# Remove double quotes, single quotes, commas and backslashes from each captured text.
unwanted_chars = re.compile(r"[\"\',\\]")
reviews = [[unwanted_chars.sub(r"", text) for text in per_line] for per_line in reviews]

In [9]:
#Group Reviewer ID, Rating and Review into DataFrame
# One row per parsed line, indexed by recipe id; each cell holds the list of
# values extracted from that line.
per_line_columns = {
    "Reviewer_ID": reviewer_ID,
    "Review": reviews,
    "Rating": ratings,
}
df = pd.DataFrame(per_line_columns, index=recipe_ID)

In [10]:
# Dump the intermediate frame to disk. With index=False the index (the recipe ids
# passed as `index=recipe_ID` when `df` was built) is not written to the file.
df.to_csv("df_temp.csv",index=False)

In [11]:
#Unroll reviews for each recipe
# Repeat each recipe id once per reviewer found on its line and flatten the
# per-line reviewer lists, giving one row per (recipe, reviewer) pair.
reviews_per_recipe = df["Reviewer_ID"].str.len()
flat_reviewers = np.concatenate(df["Reviewer_ID"].values)
df2 = pd.DataFrame({
    'Recipe_ID': np.repeat(df.index, reviews_per_recipe),
    'Reviewer_ID': flat_reviewers,
})

In [16]:
# Flatten the per-line rating and review lists in row order and attach them as
# columns next to the already unrolled recipe/reviewer ids.
df2 = df2.assign(
    Rating=np.concatenate(df["Rating"].values),
    Review=np.concatenate(df["Review"].values),
)

In [17]:
# From here on `reviews` refers to the unrolled frame `df2`, replacing the list of
# per-line review texts that was built earlier under the same name.
reviews=df2

#### Keep only recipes that are featured in both the reviews and recipes dataframes

In [99]:
# Keep only the reviews whose recipe also appears in `recipes`. The explicit copy
# makes the result an independent frame, so the column assignments done on
# `reviews` in later cells do not operate on a slice of another frame (the
# situation pandas reports as SettingWithCopyWarning).
reviews = reviews[reviews["Recipe_ID"].isin(recipes["Recipe_ID"])].copy()

In [102]:
# Keep only the recipes that have at least one row in `reviews`.
has_reviews = recipes["Recipe_ID"].isin(reviews["Recipe_ID"])
recipes = recipes[has_reviews]

#### Simple preprocessing

In [143]:
#Add column with ratings
# The ratings were captured by a regex as one-character strings, so convert them
# to numbers before averaging; a value that is not numeric raises here instead of
# silently distorting the mean.
ratings = pd.to_numeric(reviews["Rating"]).groupby(reviews["Recipe_ID"]).mean()

# Index the recipes by id so the per-recipe means align on assignment. The guard
# allows this cell to be re-run after the index has already been set.
if recipes.index.name != "Recipe_ID":
    recipes = recipes.set_index("Recipe_ID")
recipes["Rating"] = ratings

# Fix the column order; copy so that later cells assign into an independent frame.
recipes = recipes[['Recipe_name', 'Rating', 'Prepare_time', 'Ingredients', 'Instructions']].copy()

In [152]:
#lower  recipes
def _lower_text(value):
    """Lower-case a string, or every string inside a list; anything else becomes NaN.

    `Series.str.lower()` returns NaN for every element that is not a string, so a
    column whose cells are lists would be replaced entirely by NaN.
    NOTE(review): the "ingredients"/"instructions" JSON fields look like they may
    be lists of strings -- confirm against the data file.
    """
    if isinstance(value, str):
        return value.lower()
    if isinstance(value, list):
        return [item.lower() if isinstance(item, str) else item for item in value]
    # Same result `.str.lower()` gives for missing or non-text values.
    return np.nan


recipes["Ingredients"] = recipes["Ingredients"].map(_lower_text)
recipes["Instructions"] = recipes["Instructions"].map(_lower_text)

In [156]:
# Lower-case the review texts.
reviews = reviews.assign(Review=reviews["Review"].str.lower())

In [17]:
#Lemmatize reviews
import gensim
from gensim.utils import simple_preprocess
import nltk

lemma = nltk.wordnet.WordNetLemmatizer()


def _lemmatize_review(text):
    """Tokenize `text` with gensim's simple_preprocess and lemmatize each token."""
    return [lemma.lemmatize(token) for token in simple_preprocess(text)]


# Every review is converted to `str` first, then replaced by its list of lemmatized tokens.
reviews["Review"] = [_lemmatize_review(text) for text in reviews["Review"].apply(str)]

#### Save cleaned reviews and recipes

In [27]:
# Write the cleaned reviews to disk without the row index; Recipe_ID and
# Reviewer_ID are regular columns of this frame, so they are kept in the file.
reviews.to_csv("reviews_clean.csv",index=False)

In [158]:
# Recipe_ID was moved into the index when the rating column was added, so the
# index has to be written out; with index=False the saved file would contain no
# recipe ids and could not be matched against reviews_clean.csv.
recipes.to_csv("recipes_clean.csv", index=True, index_label="Recipe_ID")