In [79]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
#import sweetviz as sv
#import shap
#from ydata_profiling import ProfileReport

In [80]:
# One seed shared by every stochastic step of the notebook.
seed = 2024

# Seed NumPy's legacy global generator: pandas, statsmodels, matplotlib and
# ydata_profiling all draw their random numbers from it.
np.random.seed(seed=seed)

Reading data and preprocessing

In [81]:
# Load the per-author diet table (columns: AuthorId, Diet, Age).
df_diet = pd.read_csv('data/diet.csv')

# Report how many values are missing in each column before cleaning.
print(df_diet.isna().sum())

# Show the distinct Diet labels (including NaN) that occur in the raw data.
print(df_diet["Diet"].unique())

# A missing Diet label is treated as "Omnivore"; the column is then stored
# as a categorical, since it only holds a handful of distinct labels.
df_diet["Diet"] = df_diet["Diet"].fillna("Omnivore").astype("category")

# Confirm that no missing values remain.
print(df_diet.isna().sum())

# Summarise the resulting dtypes and memory footprint.
df_diet.info()


AuthorId    0
Diet        1
Age         0
dtype: int64
['Vegetarian' 'Vegan' 'Omnivore' nan]
AuthorId    0
Diet        0
Age         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271907 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   AuthorId  271907 non-null  object  
 1   Diet      271907 non-null  category
 2   Age       271907 non-null  int64   
dtypes: category(1), int64(1), object(1)
memory usage: 4.4+ MB


In [82]:
# Load the recipe requests (one row per AuthorId / RecipeId request).
df_requests = pd.read_csv('data/requests.csv')

# Report how many values are missing in each column.
print(df_requests.isnull().sum())

# HighCalories, LowFat and HighFiber are stored as numeric 1/0 flags:
# cast them to bool (True = the property was requested).
df_requests = df_requests.astype(
    {'HighCalories': bool, 'LowFat': bool, 'HighFiber': bool}
)

# HighProtein and LowSugar are stored as text. True means the property was
# requested, False means the author was indifferent:
#   HighProtein: 'Yes' -> True, 'Indifferent' -> False
#   LowSugar:    '0'   -> True, 'Indifferent' -> False
text_flag_encodings = {
    'HighProtein': {'Indifferent': False, 'Yes': True},
    'LowSugar': {'Indifferent': False, '0': True},
}

# An explicit map + astype(bool) is used instead of Series.replace, because
# replace relied on implicit object -> bool downcasting, which is deprecated
# since pandas 2.2 and would leave these columns as object dtype.
for column, encoding in text_flag_encodings.items():
    # Fail loudly on labels the encoding does not cover, rather than letting
    # map() turn them into NaN (which astype(bool) would then read as True).
    unexpected = set(df_requests[column].unique()) - set(encoding)
    assert not unexpected, f"unexpected values in {column}: {unexpected}"
    df_requests[column] = df_requests[column].map(encoding).astype(bool)

# info() prints its report itself and returns None, so it is not wrapped in
# print() (that would append a stray "None" line to the output).
df_requests.info()


AuthorId        0
RecipeId        0
Time            0
HighCalories    0
HighProtein     0
LowFat          0
LowSugar        0
HighFiber       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  bool   
 4   HighProtein   140195 non-null  bool   
 5   LowFat        140195 non-null  bool   
 6   LowSugar      140195 non-null  bool   
 7   HighFiber     140195 non-null  bool   
dtypes: bool(5), float64(1), int64(1), object(1)
memory usage: 3.9+ MB
None


In [83]:
# Left-join the requests onto the diet table by author: every diet row is
# kept, and authors without any request get NaN in the request columns.
merged_df = pd.merge(
    left=df_diet,
    right=df_requests,
    how='left',
    on='AuthorId',
)

# Column overview of the joined frame, followed by its first eight rows.
print(merged_df.info())

print(merged_df.head(n=8))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362754 entries, 0 to 362753
Data columns (total 10 columns):
 #   Column        Non-Null Count   Dtype   
---  ------        --------------   -----   
 0   AuthorId      362754 non-null  object  
 1   Diet          362754 non-null  category
 2   Age           362754 non-null  int64   
 3   RecipeId      140195 non-null  float64 
 4   Time          140195 non-null  float64 
 5   HighCalories  140195 non-null  object  
 6   HighProtein   140195 non-null  object  
 7   LowFat        140195 non-null  object  
 8   LowSugar      140195 non-null  object  
 9   HighFiber     140195 non-null  object  
dtypes: category(1), float64(2), int64(1), object(6)
memory usage: 25.3+ MB
None
    AuthorId        Diet  Age  RecipeId       Time HighCalories HighProtein   
0  10000120E  Vegetarian   46       NaN        NaN          NaN         NaN  \
1   1000014D       Vegan   18       NaN        NaN          NaN         NaN   
2   1000015A  Vegetarian   58  

Now clean the recipes data and merge it in

In [84]:
# Load the recipe table.
df_recipes = pd.read_csv('data/recipes.csv')

# Report how many values are missing in each column.
print(df_recipes.isna().sum())

# Rows with missing values are deliberately kept for now
# (dropping them is disabled):
# df_recipes = df_recipes.dropna()

# Store RecipeCategory as a categorical column.
df_recipes = df_recipes.astype({"RecipeCategory": "category"})

RecipeId                          0
Name                              0
CookTime                          0
PrepTime                          0
RecipeCategory                    0
RecipeIngredientQuantities        0
RecipeIngredientParts             0
Calories                          0
FatContent                        0
SaturatedFatContent               0
CholesterolContent                0
SodiumContent                     0
CarbohydrateContent               0
FiberContent                      0
SugarContent                      0
ProteinContent                    0
RecipeServings                26713
RecipeYield                   50295
dtype: int64


In [85]:
# Left-join the recipe attributes onto the combined table by RecipeId; rows
# without a RecipeId keep NaN in the recipe columns.
# NOTE: this cell rebinds merged_df to its own merge result, so it should be
# run only once per kernel session (a second run merges the recipes again).
merged_df = pd.merge(
    left=merged_df,
    right=df_recipes,
    how='left',
    on='RecipeId',
)

print(merged_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 362754 entries, 0 to 362753
Data columns (total 27 columns):
 #   Column                      Non-Null Count   Dtype   
---  ------                      --------------   -----   
 0   AuthorId                    362754 non-null  object  
 1   Diet                        362754 non-null  category
 2   Age                         362754 non-null  int64   
 3   RecipeId                    140195 non-null  float64 
 4   Time                        140195 non-null  float64 
 5   HighCalories                140195 non-null  object  
 6   HighProtein                 140195 non-null  object  
 7   LowFat                      140195 non-null  object  
 8   LowSugar                    140195 non-null  object  
 9   HighFiber                   140195 non-null  object  
 10  Name                        140195 non-null  object  
 11  CookTime                    140195 non-null  float64 
 12  PrepTime                    140195 non-null  float64 
 13 

In [92]:
# Load the reviews (AuthorId, RecipeId, Rating, Like, TestSetId).
df_reviews = pd.read_csv('data/reviews.csv')

# Overview of the raw table. info() prints its report itself and returns
# None, so it is not wrapped in print().
df_reviews.info()

# Rating is not used and is dropped. Only rows WITHOUT a TestSetId are kept;
# rows that carry a TestSetId are removed.
# .copy() makes the filtered frame independent, so the column assignment
# below does not trigger a SettingWithCopyWarning.
df_reviews = (
    df_reviews
    .drop(columns='Rating')
    .loc[lambda frame: frame['TestSetId'].isna()]
    .copy()
)

# Like is converted to bool only AFTER filtering: astype(bool) turns a
# missing value into True, so converting first would silently mark every
# row with a missing Like as liked. The assertion guards against that.
assert df_reviews['Like'].notna().all(), "missing Like values in the kept rows"
df_reviews['Like'] = df_reviews['Like'].astype(bool)

# Overview of the cleaned table.
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   AuthorId   140195 non-null  object 
 1   RecipeId   140195 non-null  int64  
 2   Rating     77108 non-null   float64
 3   Like       97381 non-null   object 
 4   TestSetId  42814 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.3+ MB
None
AuthorId         0
RecipeId         0
Like         42814
TestSetId    97381
dtype: int64
<class 'pandas.core.frame.DataFrame'>
Index: 97381 entries, 42814 to 140194
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   AuthorId   97381 non-null  object 
 1   RecipeId   97381 non-null  int64  
 2   Like       97381 non-null  bool   
 3   TestSetId  0 non-null      float64
dtypes: bool(1), float64(1), int64(1), object(1)
memory usage: 3.1+ MB
None


  df_reviews = pd.read_csv('data/reviews.csv')


In [88]:
# Keep only the rows that have no TestSetId, then display the result.
df_reviews = df_reviews.loc[df_reviews['TestSetId'].isna(), :]
df_reviews

Unnamed: 0,AuthorId,RecipeId,Rating,Like,TestSetId
42814,1000036C,320576,,False,
42815,1000216B,189335,,False,
42816,1000221A,133043,2.0,False,
42817,1000221A,90537,2.0,False,
42818,1000221A,334314,2.0,False,
...,...,...,...,...,...
140190,999595E,338070,2.0,False,
140191,999774A,29002,2.0,False,
140192,999774A,159252,,False,
140193,999774A,1171,2.0,True,
