# ANAC


In [1]:
import pandas as pd
import numpy as np

#### set seed

In [2]:
# Pin NumPy's global RNG so every stochastic step further down is repeatable.
seed = 2024
np.random.seed(seed=seed)

#### read in data

In [3]:
# Load the three tables that parse without warnings; reviews.csv is read separately.
diet_csv, recipes_csv, requests_csv = map(
    pd.read_csv,
    ("diet.csv", "recipes.csv", "requests.csv"),
)

In [4]:
# A plain pd.read_csv("reviews.csv") emits a DtypeWarning because one column in
# 'reviews' mixes numeric and non-numeric values. low_memory=False parses the file
# in a single pass, so each column's dtype is inferred from all rows at once
# instead of chunk by chunk.
reviews_csv = pd.read_csv("reviews.csv", low_memory=False)
# inspect the types of each column
reviews_csv.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   AuthorId   140195 non-null  object 
 1   RecipeId   140195 non-null  int64  
 2   Rating     77108 non-null   float64
 3   Like       97381 non-null   object 
 4   TestSetId  42814 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.3+ MB


  reviews_csv = pd.read_csv("reviews.csv")


### Data Cleaning
#### diet_csv

In [5]:
# Diet is a label column, so store it with the (unordered) categorical dtype.
diet_csv["Diet"] = diet_csv["Diet"].astype(pd.CategoricalDtype())

#### recipes_csv

In [6]:
# Open questions for the recipes table:
# TODO CookTime, PrepTime: minutes or seconds? Minutes make more sense, but the
#      numbers may simply be labels for time.
# TODO RecipeIngredientParts & RecipeIngredientQuantities: how to use them?
#      Should RecipeIngredientQuantities become a numeric data type?
# TODO Servings and Yield (the columns with missing values): maybe leave out at first?
# TODO recipes and requests have similar columns: LowCalories/Protein/Fat/Sugar/Fiber
#      vs. Fat-/Sugar-/etc. Content -> what is the cutoff value?
# TODO SaturatedFat, Sodium, Cholesterol, CarbohydrateContent -> leave out
# TODO check whether RecipeIngredientParts matches the diet
# Idea: look at the number of ingredients vs. Like.

# Print the schema first, then store the category labels as a categorical column.
recipes_csv.info()
recipes_csv["RecipeCategory"] = recipes_csv["RecipeCategory"].astype(pd.CategoricalDtype())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75604 entries, 0 to 75603
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    75604 non-null  int64  
 1   Name                        75604 non-null  object 
 2   CookTime                    75604 non-null  int64  
 3   PrepTime                    75604 non-null  int64  
 4   RecipeCategory              75604 non-null  object 
 5   RecipeIngredientQuantities  75604 non-null  object 
 6   RecipeIngredientParts       75604 non-null  object 
 7   Calories                    75604 non-null  float64
 8   FatContent                  75604 non-null  float64
 9   SaturatedFatContent         75604 non-null  float64
 10  CholesterolContent          75604 non-null  float64
 11  SodiumContent               75604 non-null  float64
 12  CarbohydrateContent         75604 non-null  float64
 13  FiberContent                756

#### requests_csv

In [7]:
requests_csv.info()
# TODO convert the calories, protein, low-fat, low-sugar and high-fiber flags to boolean

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  float64
 4   HighProtein   140195 non-null  object 
 5   LowFat        140195 non-null  int64  
 6   LowSugar      140195 non-null  object 
 7   HighFiber     140195 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.6+ MB


In [8]:
# TODO Time - is it in minutes or seconds?
# TODO Time = prep + cook time?
# TODO Time: round it so it lines up with PrepTime/CookTime in recipes?


def _flag_to_bool(column, label_to_flag):
    """Convert a column of raw preference labels into a boolean column.

    Parameters
    ----------
    column : pd.Series
        Raw labels, or a column that has already been converted to bool.
    label_to_flag : dict
        Maps every label that may occur in ``column`` to 0 or 1.

    Returns
    -------
    pd.Series
        The column with dtype bool.

    Raises
    ------
    ValueError
        If ``column`` holds a value that ``label_to_flag`` does not cover.
    """
    # Re-running the cell must not send an already converted column through the
    # label lookup again: no boolean matches a string key, so every row would
    # come back as NaN and then be cast to True.
    if pd.api.types.is_bool_dtype(column):
        return column

    flags = column.map(label_to_flag)

    # Series.map yields NaN for labels missing from the dict, and NaN casts to
    # True; fail loudly instead of silently inventing positive flags.
    unmapped = flags.isna()
    if unmapped.any():
        unexpected = column[unmapped].unique().tolist()
        raise ValueError(f"Unexpected values in column {column.name!r}: {unexpected}")

    return flags.astype("bool")


requests_csv["HighCalories"] = requests_csv["HighCalories"].astype("bool")
requests_csv["LowFat"] = requests_csv["LowFat"].astype("bool")
requests_csv["HighFiber"] = requests_csv["HighFiber"].astype("bool")

requests_csv["HighProtein"] = _flag_to_bool(
    requests_csv["HighProtein"],
    {"Indifferent": 0, "0": 0, "1": 1, "Yes": 1},
)

# NOTE(review): "Indifferent" and "0" both become False here - confirm that "0"
# really means "no preference" for LowSugar.
requests_csv["LowSugar"] = _flag_to_bool(
    requests_csv["LowSugar"],
    {"Indifferent": 0, "0": 0, "1": 1},
)


In [9]:
# Preview the first rows to check the flag columns after the boolean conversion.
requests_csv.head()

Unnamed: 0,AuthorId,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1799.950949,False,False,False,False,False
1,437641B,365718,4201.82098,False,True,False,False,True
2,1803340263D,141757,6299.861496,False,False,True,False,False
3,854048B,280351,19801.365796,False,True,True,False,True
4,2277685E,180505,5400.093457,False,False,False,False,False


#### reviews_csv

In [10]:
reviews_csv.info()
# TODO Rating appears to hold only the value 2 or NA -> drop the column? -> yes
# TODO Like has NA values that need fixing -> drop the rows with NA? -> yes
# TODO why are there missing values for TestSetId?
#      (presumably only the rows to be predicted carry a TestSetId - verify)
# To list the distinct ratings: reviews_csv["Rating"].unique()

# TODO Join the tables
# Candidate models:
# 1. Logistic Regression
# 2. Decision Trees
# 3. Random Forest
# 4. Naive Bayes
# 5. Gradient Boosting

# Planned workflow:
# 1. Data cleaning (missing values, merge tables)
# 2. Split the dataset into training & testing sets + download testing set
# 3. Modeling
# 4. Model training: train the model on the training dataset
# 5. Model evaluation (performance)
# 6. Prediction on the unseen data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   AuthorId   140195 non-null  object 
 1   RecipeId   140195 non-null  int64  
 2   Rating     77108 non-null   float64
 3   Like       97381 non-null   object 
 4   TestSetId  42814 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.3+ MB
