In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from collections import Counter
import numpy as np
from scipy import stats

In [2]:
data = pd.read_json('../../data/processed/data.json')

## Preprocessing  

In [3]:
nutrition_df = data[['nutrition','rating','tags']].reset_index()

In [4]:
tags_df1 = pd.DataFrame(data['tags']).reset_index()

In [5]:
ingredients_df1 = pd.DataFrame(data['ingredient_annotations']).reset_index()

In [6]:
nutrition_df = nutrition_df.set_index(['index','rating'])

In [7]:
nutrition_df = nutrition_df['nutrition'].apply(pd.Series)

In [8]:
nutrition_df = nutrition_df.rename(columns = {0:'Calories',1:'Fat',2:'Carbs',3:'Fiber',4:'Sugar',5:'Protein'})

In [9]:
for i in nutrition_df.columns:
    nutrition_df[i] = nutrition_df[i].str.extract('(\d+)')[0]

In [10]:
calories = [4,4,9,4,2]
nutrition_df.iloc[:,1:] = nutrition_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')
nutrition_df.iloc[:,1:] = nutrition_df.iloc[:,1:]*calories #converting nutritional values grams to their equivalent in calories
nutrition_df = nutrition_df.reset_index()
nutrition_df = nutrition_df.dropna()
nutrition_df = nutrition_df[nutrition_df['rating'] != 'NA']


  nutrition_df.iloc[:,1:] = nutrition_df.iloc[:,1:].apply(pd.to_numeric, errors='coerce')


In [11]:
nutrition_df

Unnamed: 0,index,rating,Calories,Fat,Carbs,Fiber,Sugar,Protein
0,https://tasty.co/recipe/nicks-jambalaya,98,787,196.0,80.0,36.0,32.0,136.0
2,https://tasty.co/recipe/lasagna-skillet-pasta,96,837,172.0,260.0,36.0,52.0,118.0
5,https://tasty.co/recipe/ultimate-mac-n-cheese,92,567,108.0,236.0,9.0,40.0,52.0
6,https://tasty.co/recipe/crispy-chickpeas,94,463,56.0,276.0,144.0,44.0,34.0
7,https://tasty.co/recipe/instant-pot-beef-chili,100,599,148.0,120.0,36.0,36.0,66.0
...,...,...,...,...,...,...,...,...
2959,https://tasty.co/recipe/vegetarian-gumbo,80,1055,224.0,444.0,144.0,64.0,64.0
2961,https://tasty.co/recipe/beef-bean-burritos,96,480,88.0,136.0,27.0,4.0,62.0
2962,https://tasty.co/recipe/garlic-parmesan-grille...,97,353,92.0,132.0,27.0,24.0,16.0
2963,https://tasty.co/recipe/curry-noodle-soup,94,762,196.0,268.0,27.0,60.0,16.0


In [12]:
nutrition_tags = pd.merge(nutrition_df,tags_df1, on = 'index',how = 'inner')

In [13]:
nutrition_tags = pd.merge(nutrition_tags,ingredients_df1, on = 'index',how = 'inner')

In [14]:
nutrition_tags['Proteins/Calories'] = nutrition_tags['Protein']/nutrition_tags['Calories'].astype(int)
nutrition_tags['Fat/Calories'] = nutrition_tags['Fat']/nutrition_tags['Calories'].astype(float)
nutrition_tags['Carbs/Calories'] = nutrition_tags['Carbs']/nutrition_tags['Calories'].astype(float)
nutrition_tags['Fiber/Calories'] = nutrition_tags['Fiber']/nutrition_tags['Calories'].astype(float)
nutrition_tags['Sugar/Calories'] = nutrition_tags['Sugar']/nutrition_tags['Calories'].astype(float)

In [15]:
nutrition_tags['Tags Count'] = nutrition_tags['tags'].apply(len)

In [16]:
nutrition_tags['Ingredients Count'] = nutrition_tags['ingredient_annotations'].apply(len)

In [17]:
nutrition_tags = nutrition_tags[nutrition_tags['Tags Count'] <= 50]

In [18]:
dict1 = {}
for i,j in zip(nutrition_tags['Proteins/Calories'],nutrition_tags['tags']):
    for x in j:
        dict1.setdefault(x,[]).append(i)

In [19]:
dict_mean = {}
for i,j in dict1.items():
    dict_mean[i] = (np.mean(j),len(j))

In [20]:
mean_percentage_protein = pd.DataFrame.from_dict(dict_mean,orient='index')

In [21]:
no_comments = pd.DataFrame(data['number_of_comments'].str.strip('TIPS').astype(int)).reset_index()

In [22]:
no_comments['comments_rank'] = no_comments['number_of_comments'].rank(ascending=False,method = 'first')

In [23]:
df = pd.merge(nutrition_tags,no_comments, on = 'index',how = 'inner')

## EDA 

In [24]:
len(df) # number of recipes

1915

In [25]:
df['number_of_comments'].sum() # total number of comments

204710

In [26]:
df['number_of_comments'].mean() # average number of comments per recipe

106.8981723237598

In [27]:
sett = set()
for i in df['ingredient_annotations']:
    for j in i:
        sett.add(j)

In [28]:
len(sett) # total number of distinct ingredients

941

In [29]:
df['Ingredients Count'].mean() # average number of ingredients per recipe

13.855874673629243

In [30]:
df['rating'].astype(int).mean() # average rating per recipe

91.93577023498695

In [31]:
sett_2 = set()
for i in df['tags']:
    for j in i:
        sett_2.add(j)

In [32]:
len(sett_2)

199

In [33]:
df['Tags Count'].mean() # average number of tags per recipe

20.606788511749347

In [34]:
sett_2 = set()
for i in df['tags']:
    for j in i:
        sett_2.add(j)

In [35]:
len(sett_2)

199

In [36]:
df['Calories'].astype(int).mean() # average calories per recipe

583.8433420365535

In [37]:
df['Carbs/Calories'].mean() # average percentage of carbs in calories per recipe

0.3370095475520622

In [38]:
df['Fat/Calories'].mean() # average percentage of fat in calories per recipe

0.1942643095435878

In [39]:
df['Fiber/Calories'].mean() # average percentage of fiber in calories per recipe

0.13291731863246503

In [40]:
df['Proteins/Calories'].mean() # average percentage of protein in calories per recipe

0.10855446175082618

In [41]:
df['Sugar/Calories'].mean() # average percentage of sugar in calories per recipe

0.07642061161434271

## Separate DFs (for analysis of ingredients and tags in Tableau)

In [42]:
ingredients = [j for i in df['ingredient_annotations'] for j in i]

In [43]:
ingredients_df = pd.DataFrame.from_dict(dict(Counter(ingredients)),orient = 'index').rename(columns = {0:'Count'}).sort_values(by='Count',ascending = False)

In [44]:
tags = [j for i in df['tags'] for j in i]

In [45]:
tags_df = pd.DataFrame.from_dict(dict(Counter(tags)),orient = 'index').rename(columns = {0:'Count'}).sort_values(by='Count',ascending = False)

## Spearman Correlation Test

In [46]:

fat_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Fat/Calories'])
carbs_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Carbs/Calories'])
fiber_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Fiber/Calories'])
sugar_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Sugar/Calories'])
protein_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Proteins/Calories'])
calories_corr = stats.spearmanr(df['rating'].apply(pd.to_numeric,errors = 'coerce'),df['Calories'])

In [47]:
fat_corr

SignificanceResult(statistic=0.07351171807753841, pvalue=0.0012855735557481865)

In [48]:
carbs_corr

SignificanceResult(statistic=-0.14245883753707664, pvalue=3.8008952106756696e-10)

In [49]:
fiber_corr

SignificanceResult(statistic=-0.03526959149122147, pvalue=0.1228563208672871)

In [50]:
sugar_corr

SignificanceResult(statistic=-0.02819794788942805, pvalue=0.21742587582378411)

In [51]:
protein_corr

SignificanceResult(statistic=0.14280910151332282, pvalue=3.439534705107723e-10)

In [52]:
calories_corr

SignificanceResult(statistic=0.03896946193736873, pvalue=0.08821950478308285)

## Exports

In [53]:
#ingredients_df.to_csv('../../Data/vizualizations/ingredients_df_main.csv')
#tags_df.to_csv('../../Data/vizualizations/tags_df_main.csv')
#df.to_csv('../../Data/vizualizations/main_viz.csv',index = False)
#mean_percentage_protein.to_csv('../../Data/vizualizations/mean_percentage_protein.csv')