# ANAC


#### imports

In [2384]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

#### set seed

In [2385]:
seed = 2024
np.random.seed(seed)

#### read in data

In [2386]:
diet_csv = pd.read_csv("diet.csv").copy()
recipes_csv = pd.read_csv("recipes.csv").copy()
requests_csv = pd.read_csv("requests.csv").copy()
reviews_csv = pd.read_csv("reviews.csv").copy()

  reviews_csv = pd.read_csv("reviews.csv").copy()


In [2387]:
diet_csv.info() # 271907 entries
requests_csv.info() # 140195 entries
recipes_csv.info() #75604 entries
reviews_csv.info() # 140195 entries
# --> not all customers gave a review!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271907 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AuthorId  271907 non-null  object
 1   Diet      271906 non-null  object
 2   Age       271907 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  float64
 4   HighProtein   140195 non-null  object 
 5   LowFat        140195 non-null  int64  
 6   LowSugar      140195 non-null  object 
 7   HighFiber     140195 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.6+ MB
<class 'pandas.core.frame.DataFra

### Data Cleaning
#### diet_csv

In [2388]:
#rename column
diet_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Change type of Diet into category
diet_csv["Diet"] = diet_csv["Diet"].astype("category")
diet_csv.dropna(subset=["Diet"], inplace=True)
#one hot coding  diet
diet_csv = pd.get_dummies(diet_csv, columns=['Diet'], prefix='Diet')
diet_csv['Diet_Omnivore'] = diet_csv['Diet_Omnivore'].astype('int')
diet_csv['Diet_Vegan'] = diet_csv['Diet_Vegan'].astype('int')
diet_csv['Diet_Vegetarian'] = diet_csv['Diet_Vegetarian'].astype('int')

#### recipes_csv

In [2389]:
# Change type for Recipe Category
recipes_csv["RecipeCategory"] = recipes_csv["RecipeCategory"].astype("category")


In [2390]:
# Add new column TotalTimeNeeded
recipes_csv["TotalTimeNeeded"] = recipes_csv["CookTime"] + recipes_csv["PrepTime"]

#drop recipeservings and yields and saturated + cholesterol + sodium + carbohydratecontent 
recipes_csv = recipes_csv.drop("RecipeServings", axis=1)
recipes_csv = recipes_csv.drop("RecipeYield", axis=1)
#recipes_csv = recipes_csv.drop("SaturatedFatContent", axis=1)
#recipes_csv = recipes_csv.drop("CholesterolContent", axis=1)
#recipes_csv = recipes_csv.drop("SodiumContent", axis=1)
#recipes_csv = recipes_csv.drop("CarbohydrateContent", axis=1)
#also drop cook n preptime n quantities n parts
#recipes_csv = recipes_csv.drop("PrepTime", axis=1)
#recipes_csv = recipes_csv.drop("CookTime", axis=1)
recipes_csv = recipes_csv.drop("RecipeIngredientQuantities", axis=1)
recipes_csv = recipes_csv.drop("RecipeIngredientParts", axis=1)

##### requests_csv

In [2391]:
#rename column
requests_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Map indifferent values for HighProtein and LowSugar
requests_csv["HighProtein"] = requests_csv["HighProtein"].map({
    "Indifferent": 0,
    "0": 0,
    "1": 1,
    "Yes": 1
})

requests_csv["LowSugar"] = requests_csv["LowSugar"].map({
    "Indifferent": 1,
    "0": 0,
    "1": 1
})


In [2392]:
# Change types to boolean
requests_csv["HighCalories"] = requests_csv["HighCalories"].astype("int")
requests_csv["LowFat"] = requests_csv["LowFat"].astype("int")
requests_csv["HighFiber"] = requests_csv["HighFiber"].astype("int")
requests_csv["HighProtein"] = requests_csv["HighProtein"].astype("int")
requests_csv["LowSugar"] = requests_csv["LowSugar"].astype("int")

#TODO: how to handle negative time values? -> here: take the total time value from recipes by recipeId!
request_with_recipe = pd.merge(requests_csv, recipes_csv, on="RecipeId", how="left")
request_with_recipe['Time'] = np.where(request_with_recipe['Time'] < 0, request_with_recipe['TotalTimeNeeded'], request_with_recipe['Time'])
request_with_recipe['Time'] = request_with_recipe['Time'].round().astype('int')
request_with_recipe.rename(columns={'Time':'MaxTime'}, inplace=True)
requests_csv = request_with_recipe[['CustomerId', 'RecipeId', 'MaxTime', 'HighCalories','HighProtein', 'LowFat', 'LowSugar', 'HighFiber']]
#request_with_recipe = request_with_recipe[['CustomerId','RecipeId', 'Time','TotalTimeNeeded']]

In [2393]:
requests_csv

Unnamed: 0,CustomerId,RecipeId,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1800,0,0,0,0,0
1,437641B,365718,4202,0,1,0,1,1
2,1803340263D,141757,6300,0,0,1,1,0
3,854048B,280351,19801,0,1,1,0,1
4,2277685E,180505,5400,0,0,0,0,0
...,...,...,...,...,...,...,...,...
140190,163793B,78171,1561,0,0,0,0,1
140191,33888B,333262,1502,1,0,1,0,0
140192,401942C,49200,5999,0,0,0,0,1
140193,346866B,214815,900,0,1,1,1,1


#### reviews_csv

In [2394]:
#rename column
reviews_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Remove NA rows and Rating column and TestSetId
reviews_csv = reviews_csv.drop("Rating", axis=1)
reviews_csv.dropna(subset=["Like"], inplace=True) # note: now the entries are reduced to 97381 entries
reviews_csv = reviews_csv.drop("TestSetId", axis=1)

In [2395]:
# convert Like type to boolean
reviews_csv["Like"] = reviews_csv["Like"].astype("int")

In [2396]:
#merge diet + review
review_with_diet = pd.merge(reviews_csv, diet_csv, on="CustomerId", how="inner") # 97381 entries
#merge diet + review + request
review_diet_with_request = pd.merge(review_with_diet, requests_csv, on=["CustomerId", "RecipeId"], how="inner") # 97381 entries


In [2397]:
review_diet_with_request

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,1000036C,320576,0,50,0,0,1,119,0,0,0,1,1
1,1000216B,189335,0,78,0,0,1,1199,0,1,0,0,1
2,1000221A,133043,0,25,0,0,1,362,0,1,0,1,1
3,1000221A,90537,0,25,0,0,1,1199,0,1,0,0,1
4,1000221A,334314,0,25,0,0,1,5400,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,3899,0,0,1,1,0
97377,999774A,29002,0,57,0,0,1,2402,0,0,0,1,0
97378,999774A,159252,0,57,0,0,1,6000,0,1,0,0,0
97379,999774A,1171,1,57,0,0,1,480,1,1,0,0,0


In [2398]:
#one-hot encoding recipes
recipes_csv = pd.get_dummies(recipes_csv, columns=['RecipeCategory'], prefix='RecipeCategory')
recipes_csv['RecipeCategory_Beverages'] = recipes_csv['RecipeCategory_Beverages'].astype('int')
recipes_csv['RecipeCategory_Bread'] = recipes_csv['RecipeCategory_Bread'].astype('int')
recipes_csv['RecipeCategory_Breakfast'] = recipes_csv['RecipeCategory_Breakfast'].astype('int')
recipes_csv['RecipeCategory_Lunch'] = recipes_csv['RecipeCategory_Lunch'].astype('int')
recipes_csv['RecipeCategory_One dish meal'] = recipes_csv['RecipeCategory_One dish meal'].astype('int')
recipes_csv.rename(columns={'RecipeCategory_One dish meal': 'RecipeCategory_One_dish_meal'}, inplace=True)
recipes_csv['RecipeCategory_Soup'] = recipes_csv['RecipeCategory_Soup'].astype('int')
recipes_csv['RecipeCategory_Other'] = recipes_csv['RecipeCategory_Other'].astype('int')
recipes_csv.drop('Name', axis=1, inplace=True)



In [2399]:
recipes_csv

Unnamed: 0,RecipeId,CookTime,PrepTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,TotalTimeNeeded,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup
0,73440,0,1800,241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,1800,0,0,0,0,0,1,0
1,365718,3600,600,370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,4200,0,0,0,0,0,1,0
2,141757,3600,2700,377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,6300,0,0,0,0,0,1,0
3,280351,18000,1800,282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,19800,0,0,0,0,0,1,0
4,180505,3600,1800,257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,5400,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,253577,43200,28800,121.5,0.5,0.1,0.0,1175.1,22.2,7.8,0.6,7.9,72000,0,0,0,0,0,1,0
75600,267827,3600,2700,652.2,25.8,10.7,197.9,435.5,51.9,7.5,7.2,50.1,6300,0,0,0,0,0,1,0
75601,266983,1800,900,223.9,9.2,3.6,78.3,725.9,7.3,1.1,1.7,26.7,2700,0,0,0,0,0,1,0
75602,253739,300,120,2229.8,80.3,69.3,0.0,294.7,369.0,15.7,317.9,26.7,420,0,0,0,0,0,1,0


In [2400]:
# merge 'review_with_diet' with 'recipes_csv'
df = pd.merge(review_diet_with_request, recipes_csv, on='RecipeId', how='left')
df['TimeDeviation'] = df['MaxTime'] - df['TotalTimeNeeded']
#df.drop(['TotalTimeNeeded', 'MaxTime'], axis=1, inplace=True)

In [2401]:
df
#TODO test p value for each variable
#TODO fit model

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,MaxTime,HighCalories,HighProtein,...,ProteinContent,TotalTimeNeeded,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup,TimeDeviation
0,1000036C,320576,0,50,0,0,1,119,0,0,...,0.0,120,1,0,0,0,0,0,0,-1
1,1000216B,189335,0,78,0,0,1,1199,0,1,...,16.3,1200,0,0,0,0,0,1,0,-1
2,1000221A,133043,0,25,0,0,1,362,0,1,...,0.2,360,0,0,0,0,0,1,0,2
3,1000221A,90537,0,25,0,0,1,1199,0,1,...,68.9,1200,0,0,1,0,0,0,0,-1
4,1000221A,334314,0,25,0,0,1,5400,1,0,...,18.1,5400,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,3899,0,0,...,5.2,3900,0,0,0,0,0,1,0,-1
97377,999774A,29002,0,57,0,0,1,2402,0,0,...,32.0,2400,0,0,0,0,0,1,0,2
97378,999774A,159252,0,57,0,0,1,6000,0,1,...,1.1,6000,0,0,0,0,0,1,0,0
97379,999774A,1171,1,57,0,0,1,480,1,1,...,18.2,480,1,0,0,0,0,0,0,0


In [2402]:
import statsmodels.api as sm

X = df[["Diet_Vegetarian", "Diet_Vegan", "Diet_Omnivore", "Age","MaxTime", "HighCalories", "HighFiber", "HighProtein", "LowFat", "LowSugar", "CookTime", "PrepTime", "Calories", "FatContent", "SaturatedFatContent", "CholesterolContent", "SodiumContent", "CarbohydrateContent", "FiberContent", "SugarContent", "ProteinContent", "TotalTimeNeeded", "TimeDeviation", "RecipeCategory_Other", "RecipeCategory_Soup", "RecipeCategory_Lunch", "RecipeCategory_Breakfast", "RecipeCategory_Bread", "RecipeCategory_Beverages", "RecipeCategory_One_dish_meal" ]].copy()
y = df['Like']

In [2403]:
#dropping
X.drop(columns=['Diet_Vegetarian','MaxTime', 'CookTime', 'RecipeCategory_Other'], inplace=True)

In [2404]:
#Test for multicollinearity using VIF test
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

for index, variable_name in enumerate(X.columns):
    if variable_name == "const": 
        continue
    print(f"VIF for variable {variable_name} is {vif(X, index)}")
    

VIF for variable Diet_Vegan is 1.0663657899314225
VIF for variable Diet_Omnivore is 1.136247950757977
VIF for variable Age is 3.265543956751271
VIF for variable HighCalories is 1.5773884126411808
VIF for variable HighFiber is 1.5767912961223876
VIF for variable HighProtein is 1.564406492371117
VIF for variable LowFat is 1.375640469797117
VIF for variable LowSugar is 1.3766175430893879
VIF for variable PrepTime is 1.497483123392879
VIF for variable Calories is 568.4598067722246
VIF for variable FatContent is 295.40010008883075
VIF for variable SaturatedFatContent is 34.188007529219256
VIF for variable CholesterolContent is 6.371348177432757
VIF for variable SodiumContent is 1.1148413639658523
VIF for variable CarbohydrateContent is 66.51178788581674
VIF for variable FiberContent is 2.6418593211767103
VIF for variable SugarContent is 7.909130429504921
VIF for variable ProteinContent is 35.11043162387945
VIF for variable TotalTimeNeeded is 1.5232223825159916
VIF for variable TimeDeviation

In [2405]:
# split data into learning and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
  train_test_split(X, y,
                   test_size=0.3, 
                   shuffle=True,
                   random_state=3)

In [2406]:
#fit model
model_sm = sm.Logit(y_train, sm.add_constant(X_train))
results = model_sm.fit()
print(results.summary())
# McFadden Ratio
print("McFadden Ratio",results.prsquared)

Optimization terminated successfully.
         Current function value: 0.344729
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   Like   No. Observations:                68166
Model:                          Logit   Df Residuals:                    68139
Method:                           MLE   Df Model:                           26
Date:                Sat, 06 Jan 2024   Pseudo R-squ.:                  0.1191
Time:                        16:33:35   Log-Likelihood:                -23499.
converged:                       True   LL-Null:                       -26676.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           -4.6918      0.051    -91.998      0.000

In [2407]:
# Extract the p-values for each variable
p_values = results.pvalues

# Filter variables with p-values greater than 0.05
variables_to_print = p_values[p_values > 0.05].index

# Print the variables with p-values greater than 0.05
print("Variables with p-values > 0.05:")
print(variables_to_print)

Variables with p-values > 0.05:
Index(['Diet_Vegan', 'PrepTime', 'Calories', 'SaturatedFatContent',
       'CholesterolContent', 'SodiumContent', 'TotalTimeNeeded',
       'TimeDeviation', 'RecipeCategory_Lunch'],
      dtype='object')


In [2408]:
# Specify the restriction for PrepTime (assuming "PrepTime" is the column name in X)
wald_test_result_prep_time = results.wald_test("(ProteinContent = 0)", scalar=True)

# Print results for PrepTime
print(f"Test statistic (chi^2_{int(wald_test_result_prep_time.df_denom)}-distributed): {wald_test_result_prep_time.statistic}")
print(f"P-value of the statistic: {wald_test_result_prep_time.pvalue}")

Test statistic (chi^2_1-distributed): 4.873230358169181
P-value of the statistic: 0.027276395081885393


In [2409]:
# Predict on the test set
X_test_with_const = sm.add_constant(X_test)  # Add constant to the test set
predictions = results.predict(X_test_with_const)
y_pred_test = (predictions > 0.5).astype(int)

# Compare predictions to actual labels
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})

# Print the comparison
print(comparison_df)


       Actual  Predicted
20653       0          0
82063       0          0
85333       0          0
96197       0          0
83016       0          0
...       ...        ...
47865       0          0
22491       1          0
10052       0          0
44580       0          0
4127        0          0

[29215 rows x 2 columns]


In [2410]:
# build confusion matrix
confusion_matrix = pd.crosstab(y_test, y_pred_test)
confusion_matrix

col_0,0,1
Like,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25244,133
1,3715,123


In [2411]:
correct_predictions = (y_test == y_pred_test).sum()
total_predictions = len(y_test)
accuracy = correct_predictions / total_predictions

print("Accuracy =", "{:.3f}".format(accuracy))

Accuracy = 0.868



#### 1. Logistic Regression
#### 2. Decision Trees
#### 3. Random Forest
#### 4. Naive bayes
#### 5. Gradient Boosting

#### 1. Data Cleaning (missing values, merge tables)
#### 2. Set dataset into training & testing sets + download testing set
#### 3. modeling
#### 4. Model training: train model on training dataset
#### 5. Model evaluation (performance)
#### 6. prediction on the unseen data