# ANAC


#### imports

In [2]:
import pandas as pd
import numpy as np

#### set seed

In [3]:
# Single source of truth for every random seed used in this notebook.
seed = 2024
# Seed NumPy's legacy global generator so np.random.* calls are repeatable.
np.random.seed(seed=seed)

#### read in data

In [4]:
# Load the four source tables from the working directory, one DataFrame each.
# The file order below matches the order of the names on the left-hand side.
diet_csv, recipes_csv, requests_csv, reviews_csv = map(
    pd.read_csv,
    ("diet.csv", "recipes.csv", "requests.csv", "reviews.csv"),
)

### Data Cleaning
#### diet_csv

In [None]:
# Print the column names, non-null counts and dtypes of diet_csv to see
# which columns need cleaning or a dtype change.
diet_csv.info()

In [6]:
# Store Diet with pandas' categorical dtype (categories are inferred from
# the values present in the column).
diet_csv["Diet"] = diet_csv["Diet"].astype(pd.CategoricalDtype())

#### recipes_csv

In [None]:
# Print the column names, non-null counts and dtypes of recipes_csv to see
# which columns need cleaning or a dtype change.
recipes_csv.info()

In [7]:
# Store RecipeCategory with pandas' categorical dtype (categories are
# inferred from the values present in the column).
recipes_csv["RecipeCategory"] = recipes_csv["RecipeCategory"].astype(
    pd.CategoricalDtype()
)
# Open idea: look at the amount of ingredients vs. Like.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75604 entries, 0 to 75603
Data columns (total 18 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   RecipeId                    75604 non-null  int64  
 1   Name                        75604 non-null  object 
 2   CookTime                    75604 non-null  int64  
 3   PrepTime                    75604 non-null  int64  
 4   RecipeCategory              75604 non-null  object 
 5   RecipeIngredientQuantities  75604 non-null  object 
 6   RecipeIngredientParts       75604 non-null  object 
 7   Calories                    75604 non-null  float64
 8   FatContent                  75604 non-null  float64
 9   SaturatedFatContent         75604 non-null  float64
 10  CholesterolContent          75604 non-null  float64
 11  SodiumContent               75604 non-null  float64
 12  CarbohydrateContent         75604 non-null  float64
 13  FiberContent                756

In [None]:
# Derive TotalTimeNeeded as the element-wise sum of CookTime and PrepTime.
cook_time = recipes_csv["CookTime"]
prep_time = recipes_csv["PrepTime"]
recipes_csv["TotalTimeNeeded"] = cook_time.add(prep_time)

#### requests_csv

In [8]:
# Print the column names, non-null counts and dtypes of requests_csv to see
# which columns need cleaning or a dtype change.
requests_csv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  float64
 4   HighProtein   140195 non-null  object 
 5   LowFat        140195 non-null  int64  
 6   LowSugar      140195 non-null  object 
 7   HighFiber     140195 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.6+ MB


In [9]:
# Map indifferent values for HighProtein and LowSugar
#
# Both columns are encoded as 0/1 integers with one shared lookup table, so
# the two columns can no longer drift apart (the LowSugar table used to lack
# "Yes"). "Indifferent" is encoded as 0, i.e. "not requested".
FLAG_ENCODING = {
    "Indifferent": 0,
    "0": 0,
    "1": 1,
    "Yes": 1,
    # String forms of already-converted boolean values, so that re-running
    # this cell after the boolean conversion does not wipe the column.
    "False": 0,
    "True": 1,
}

for flag_column in ("HighProtein", "LowSugar"):
    raw_values = requests_csv[flag_column]
    # Normalise to str first: after one run the column holds ints, which
    # would not match the string keys of the lookup table.
    encoded = raw_values.astype(str).map(FLAG_ENCODING)

    # Series.map yields NaN for values missing from the table, and a later
    # astype("bool") would silently read NaN as True. Fail loudly instead.
    unmapped = raw_values[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unexpected values in requests_csv[{flag_column!r}]: {list(unmapped)!r}"
        )

    requests_csv[flag_column] = encoded.astype("int64")


In [None]:
# Cast every request flag column to bool.
# NOTE: astype("bool") turns NaN into True, so these columns must not
# contain missing values at this point (HighProtein and LowSugar must have
# been mapped to 0/1 first).
for flag_column in ("HighCalories", "LowFat", "HighFiber", "HighProtein", "LowSugar"):
    requests_csv[flag_column] = requests_csv[flag_column].astype("bool")

#### reviews_csv

In [17]:
# Print the column names, non-null counts and dtypes of reviews_csv.
reviews_csv.info()
# TODO why are there missing values for TestSetId?
# NOTE(review): the printed counts add up (97381 non-null Like + 42814
# non-null TestSetId = 140195 rows), so presumably TestSetId is only set on
# the rows whose Like has to be predicted — confirm against the task description.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   AuthorId   140195 non-null  object 
 1   RecipeId   140195 non-null  int64  
 2   Rating     77108 non-null   float64
 3   Like       97381 non-null   object 
 4   TestSetId  42814 non-null   float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.3+ MB


In [None]:
# Remove NA rows and Rating column
#
# errors="ignore" keeps this cell re-runnable: on a second run Rating is
# already gone and a plain drop would raise KeyError.
reviews_csv = reviews_csv.drop(columns="Rating", errors="ignore")

# Rows without a Like label cannot be used for training, but they still
# carry TestSetId, so keep them in reviews_test instead of discarding them.
# NOTE(review): presumably these are the rows to predict on — confirm.
unlabelled_mask = reviews_csv["Like"].isna()
if unlabelled_mask.any():
    # Skipped on a re-run (no unlabelled rows are left by then), so the
    # rows saved by the first run are not overwritten with an empty frame.
    reviews_test = reviews_csv.loc[unlabelled_mask].copy()

# reviews_csv keeps only the labelled rows, exactly as dropna(subset=["Like"]) did.
reviews_csv = reviews_csv.loc[~unlabelled_mask].copy()

In [24]:
# Re-type Like from object to pandas' nullable "boolean" dtype.
like_flags = reviews_csv["Like"].astype("boolean")
reviews_csv["Like"] = like_flags


# TODO: Join the tables
# 1. Logistic Regression
# 2. Decision Trees
# 3. Random Forest
# 4. Naive Bayes
# 5. Gradient Boosting

# 1. Data cleaning (missing values, merge tables)
# 2. Split the dataset into training & testing sets + download testing set
# 3. Modeling
# 4. Model training: train the model on the training dataset
# 5. Model evaluation (performance)
# 6. Prediction on the unseen data