# ANAC


#### imports

In [750]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

#### set seed

In [751]:
# Fix the seed once so later draws from NumPy's global RNG are repeatable.
seed = 2024
np.random.seed(seed)

#### read in data

In [752]:
# Load the four raw tables from the working directory.
# read_csv already returns a fresh DataFrame, so no extra .copy() is needed.
diet_csv = pd.read_csv("diet.csv")
recipes_csv = pd.read_csv("recipes.csv")
requests_csv = pd.read_csv("requests.csv")
# NOTE(review): the saved output of this cell echoes the reviews.csv read the
# way a pandas parser warning does (presumably a DtypeWarning about mixed-type
# columns -- confirm). low_memory=False lets pandas infer each column's dtype
# from the whole file instead of chunk by chunk, which avoids mixed inference.
reviews_csv = pd.read_csv("reviews.csv", low_memory=False)

  reviews_csv = pd.read_csv("reviews.csv").copy()


In [753]:
# Print column names, non-null counts and dtypes for each raw table.
# Row counts noted when this was run:
#   diet 271907, requests 140195, recipes 75604, reviews 140195
# --> not all customers gave a review!
for raw_table in (diet_csv, requests_csv, recipes_csv, reviews_csv):
    raw_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271907 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AuthorId  271907 non-null  object
 1   Diet      271906 non-null  object
 2   Age       271907 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  float64
 4   HighProtein   140195 non-null  object 
 5   LowFat        140195 non-null  int64  
 6   LowSugar      140195 non-null  object 
 7   HighFiber     140195 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.6+ MB
<class 'pandas.core.frame.DataFra

### Data Cleaning
#### diet_csv

In [754]:
# Use "CustomerId" as the name of the customer key column.
diet_csv = diet_csv.rename(columns={"AuthorId": "CustomerId"})
# Treat Diet as categorical, then expand it into one indicator column per diet.
diet_csv = diet_csv.astype({"Diet": "category"})
diet_csv = pd.get_dummies(diet_csv, columns=["Diet"], prefix="Diet")
# Store the three indicator columns as integers.
diet_indicator_cols = ["Diet_Omnivore", "Diet_Vegan", "Diet_Vegetarian"]
diet_csv = diet_csv.astype({col: "int" for col in diet_indicator_cols})

#### recipes_csv

In [755]:
# Store the recipe category as a pandas categorical.
recipes_csv = recipes_csv.astype({"RecipeCategory": "category"})


In [756]:
# Total time per recipe = cooking time + preparation time.
recipes_csv = recipes_csv.assign(
    TotalTimeNeeded=recipes_csv["CookTime"] + recipes_csv["PrepTime"]
)

# Columns that are not carried forward: servings/yield, the extra nutrient
# columns, the two time components (now summed into TotalTimeNeeded) and the
# raw ingredient lists.
unused_recipe_cols = [
    "RecipeServings",
    "RecipeYield",
    "SaturatedFatContent",
    "CholesterolContent",
    "SodiumContent",
    "CarbohydrateContent",
    "PrepTime",
    "CookTime",
    "RecipeIngredientQuantities",
    "RecipeIngredientParts",
]
recipes_csv = recipes_csv.drop(columns=unused_recipe_cols)

In [757]:
# Display the recipes table after the unused columns were dropped.
recipes_csv

Unnamed: 0,RecipeId,Name,RecipeCategory,Calories,FatContent,FiberContent,SugarContent,ProteinContent,TotalTimeNeeded
0,73440,Bow Ties With Broccoli Pesto,Other,241.3,10.1,2.3,1.4,6.7,1800
1,365718,Cashew-chutney Rice,Other,370.8,17.5,1.6,2.2,9.4,4200
2,141757,Copycat Taco Bell Nacho Fries BellGrande,Other,377.6,20.9,3.8,6.1,12.9,6300
3,280351,Slow Cooker Jalapeno Cheddar Cheese Soup,Other,282.8,16.5,2.3,2.7,11.7,19800
4,180505,Cool & Crisp Citrus Chiffon Pie,Other,257.5,8.6,0.4,30.2,6.3,5400
...,...,...,...,...,...,...,...,...,...
75599,253577,Frijoles Negros- Crock Pot Mexican Black Beans,Other,121.5,0.5,7.8,0.6,7.9,72000
75600,267827,Moose Moussaka,Other,652.2,25.8,7.5,7.2,50.1,6300
75601,266983,Cantonese Pepper Steak for Two (Or More),Other,223.9,9.2,1.1,1.7,26.7,2700
75602,253739,Coconut Cream Cooler,Other,2229.8,80.3,15.7,317.9,26.7,420


#### requests_csv

In [758]:
# Use "CustomerId" as the name of the customer key column.
requests_csv = requests_csv.rename(columns={"AuthorId": "CustomerId"})

# One shared encoding for both textual preference flags, so HighProtein and
# LowSugar accept exactly the same set of raw values.
REQUEST_FLAG_ENCODING = {"Indifferent": 0, "0": 0, "1": 1, "Yes": 1}


def encode_request_flag(flags: pd.Series) -> pd.Series:
    """Encode a textual request flag column as integers 0/1.

    Values are compared as strings, so a column that already holds the
    integers 0/1 (e.g. when this cell is re-run) is encoded to itself.

    Args:
        flags: column whose values are keys of REQUEST_FLAG_ENCODING.

    Returns:
        Integer Series with the same index as ``flags``.

    Raises:
        ValueError: if a value has no entry in REQUEST_FLAG_ENCODING. Without
            this check Series.map would silently turn such values into NaN.
    """
    as_text = flags.astype(str)
    encoded = as_text.map(REQUEST_FLAG_ENCODING)
    unknown = as_text[encoded.isna()].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped values in {flags.name!r}: {sorted(unknown)}")
    return encoded.astype("int")


requests_csv["HighProtein"] = encode_request_flag(requests_csv["HighProtein"])
# NOTE(review): "Indifferent" and "0" both encode to 0. If LowSugar only ever
# contains these two values the column becomes constant (the displayed rows
# are all 0) -- confirm the intended meaning of "0" for LowSugar.
requests_csv["LowSugar"] = encode_request_flag(requests_csv["LowSugar"])


In [759]:
# Store every request flag as an integer column.
request_flag_cols = ["HighCalories", "LowFat", "HighFiber", "HighProtein", "LowSugar"]
requests_csv = requests_csv.astype({col: "int" for col in request_flag_cols})

# Negative requested times are replaced by the recipe's own total time,
# looked up through RecipeId.
request_with_recipe = requests_csv.merge(recipes_csv, on="RecipeId", how="left")
requested_time = request_with_recipe["Time"]
request_with_recipe["Time"] = (
    requested_time
    .mask(requested_time < 0, request_with_recipe["TotalTimeNeeded"])
    .round()
    .astype("int")
)
request_with_recipe = request_with_recipe.rename(columns={"Time": "MaxTime"})

# Keep only the request columns; the recipe columns were needed for the lookup only.
request_cols = [
    "CustomerId",
    "RecipeId",
    "MaxTime",
    "HighCalories",
    "HighProtein",
    "LowFat",
    "LowSugar",
    "HighFiber",
]
requests_csv = request_with_recipe[request_cols]

In [760]:
# Display the cleaned requests table (Time is now MaxTime).
requests_csv

Unnamed: 0,CustomerId,RecipeId,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1800,0,0,0,0,0
1,437641B,365718,4202,0,1,0,0,1
2,1803340263D,141757,6300,0,0,1,0,0
3,854048B,280351,19801,0,1,1,0,1
4,2277685E,180505,5400,0,0,0,0,0
...,...,...,...,...,...,...,...,...
140190,163793B,78171,1561,0,0,0,0,1
140191,33888B,333262,1502,1,0,1,0,0
140192,401942C,49200,5999,0,0,0,0,1
140193,346866B,214815,900,0,1,1,0,1


#### reviews_csv

In [761]:
# Rename the customer key, drop the Rating and TestSetId columns, and keep
# only the rows that have a Like value.
# note: this reduces the table to 97381 entries
reviews_csv = (
    reviews_csv
    .rename(columns={"AuthorId": "CustomerId"})
    .drop(columns="Rating")
    .dropna(subset=["Like"])
    .drop(columns="TestSetId")
)

In [762]:
# Store the Like target as an integer column.
reviews_csv = reviews_csv.astype({"Like": "int"})

In [763]:

# review + diet, joined on the customer (97381 entries)
review_with_diet = reviews_csv.merge(diet_csv, on="CustomerId", how="inner")
# review + diet + request, joined on customer and recipe (97381 entries)
review_diet_with_request = review_with_diet.merge(
    requests_csv,
    on=["CustomerId", "RecipeId"],
    how="inner",
)


In [764]:
# Display the merged review + diet + request table.
review_diet_with_request

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,1000036C,320576,0,50,0,0,1,119,0,0,0,0,1
1,1000216B,189335,0,78,0,0,1,1199,0,1,0,0,1
2,1000221A,133043,0,25,0,0,1,362,0,1,0,0,1
3,1000221A,90537,0,25,0,0,1,1199,0,1,0,0,1
4,1000221A,334314,0,25,0,0,1,5400,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,3899,0,0,1,0,0
97377,999774A,29002,0,57,0,0,1,2402,0,0,0,0,0
97378,999774A,159252,0,57,0,0,1,6000,0,1,0,0,0
97379,999774A,1171,1,57,0,0,1,480,1,1,0,0,0


In [765]:
# Expand RecipeCategory into one indicator column per category.
recipes_csv = pd.get_dummies(recipes_csv, columns=["RecipeCategory"], prefix="RecipeCategory")

# Cast the indicators to integers, give the one column whose name contains
# spaces an underscore name, and drop the free-text recipe name.
category_indicator_cols = [
    "RecipeCategory_Beverages",
    "RecipeCategory_Bread",
    "RecipeCategory_Breakfast",
    "RecipeCategory_Lunch",
    "RecipeCategory_One dish meal",
    "RecipeCategory_Soup",
    "RecipeCategory_Other",
]
recipes_csv = (
    recipes_csv
    .astype({col: "int" for col in category_indicator_cols})
    .rename(columns={"RecipeCategory_One dish meal": "RecipeCategory_One_dish_meal"})
    .drop(columns="Name")
)



In [766]:
# Display the recipes table with the one-hot encoded category columns.
recipes_csv

Unnamed: 0,RecipeId,Calories,FatContent,FiberContent,SugarContent,ProteinContent,TotalTimeNeeded,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup
0,73440,241.3,10.1,2.3,1.4,6.7,1800,0,0,0,0,0,1,0
1,365718,370.8,17.5,1.6,2.2,9.4,4200,0,0,0,0,0,1,0
2,141757,377.6,20.9,3.8,6.1,12.9,6300,0,0,0,0,0,1,0
3,280351,282.8,16.5,2.3,2.7,11.7,19800,0,0,0,0,0,1,0
4,180505,257.5,8.6,0.4,30.2,6.3,5400,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,253577,121.5,0.5,7.8,0.6,7.9,72000,0,0,0,0,0,1,0
75600,267827,652.2,25.8,7.5,7.2,50.1,6300,0,0,0,0,0,1,0
75601,266983,223.9,9.2,1.1,1.7,26.7,2700,0,0,0,0,0,1,0
75602,253739,2229.8,80.3,15.7,317.9,26.7,420,0,0,0,0,0,1,0


In [767]:
# Attach the recipe features to the review + diet + request table.
df = review_diet_with_request.merge(recipes_csv, on="RecipeId", how="left")
# Requested maximum time minus the recipe's total time; the two source
# columns are dropped once the difference is stored.
df = df.assign(TimeDeviation=df["MaxTime"] - df["TotalTimeNeeded"])
df = df.drop(columns=["TotalTimeNeeded", "MaxTime"])

In [768]:
# Display the final merged table.
df

# TODO: test the p-value for each variable
# TODO: fit model

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,HighCalories,HighProtein,LowFat,...,SugarContent,ProteinContent,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup,TimeDeviation
0,1000036C,320576,0,50,0,0,1,0,0,0,...,0.0,0.0,1,0,0,0,0,0,0,-1
1,1000216B,189335,0,78,0,0,1,0,1,0,...,4.3,16.3,0,0,0,0,0,1,0,-1
2,1000221A,133043,0,25,0,0,1,0,1,0,...,0.4,0.2,0,0,0,0,0,1,0,2
3,1000221A,90537,0,25,0,0,1,0,1,0,...,15.7,68.9,0,0,1,0,0,0,0,-1
4,1000221A,334314,0,25,0,0,1,1,0,0,...,71.1,18.1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,0,0,1,...,27.3,5.2,0,0,0,0,0,1,0,-1
97377,999774A,29002,0,57,0,0,1,0,0,0,...,13.5,32.0,0,0,0,0,0,1,0,2
97378,999774A,159252,0,57,0,0,1,0,1,0,...,229.8,1.1,0,0,0,0,0,1,0,0
97379,999774A,1171,1,57,0,0,1,1,1,0,...,57.8,18.2,1,0,0,0,0,0,0,0



#### 1. Logistic Regression
#### 2. Decision Trees
#### 3. Random Forest
#### 4. Naive Bayes
#### 5. Gradient Boosting

#### 1. Data Cleaning (missing values, merge tables)
#### 2. Split dataset into training & testing sets + download testing set
#### 3. modeling
#### 4. Model training: train model on training dataset
#### 5. Model evaluation (performance)
#### 6. prediction on the unseen data