# ANAC


#### imports

In [2127]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix

#### set seed

In [2128]:
seed = 2024
np.random.seed(seed)

#### read in data

In [2129]:
diet_csv = pd.read_csv("diet.csv").copy()
recipes_csv = pd.read_csv("recipes.csv").copy()
requests_csv = pd.read_csv("requests.csv").copy()
reviews_csv = pd.read_csv("reviews.csv").copy()

  reviews_csv = pd.read_csv("reviews.csv").copy()


In [2130]:
diet_csv.info() # 271907 entries
requests_csv.info() # 140195 entries
recipes_csv.info() #75604 entries
reviews_csv.info() # 140195 entries
# --> not all customers gave a review!

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271907 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   AuthorId  271907 non-null  object
 1   Diet      271906 non-null  object
 2   Age       271907 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 6.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   AuthorId      140195 non-null  object 
 1   RecipeId      140195 non-null  int64  
 2   Time          140195 non-null  float64
 3   HighCalories  140195 non-null  float64
 4   HighProtein   140195 non-null  object 
 5   LowFat        140195 non-null  int64  
 6   LowSugar      140195 non-null  object 
 7   HighFiber     140195 non-null  int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 8.6+ MB
<class 'pandas.core.frame.DataFra

### Data Cleaning
#### diet_csv

In [2131]:
#rename column
diet_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Change type of Diet into category
diet_csv["Diet"] = diet_csv["Diet"].astype("category")
diet_csv.dropna(subset=["Diet"], inplace=True)
#one hot coding  diet
diet_csv = pd.get_dummies(diet_csv, columns=['Diet'], prefix='Diet')
diet_csv['Diet_Omnivore'] = diet_csv['Diet_Omnivore'].astype('int')
diet_csv['Diet_Vegan'] = diet_csv['Diet_Vegan'].astype('int')
diet_csv['Diet_Vegetarian'] = diet_csv['Diet_Vegetarian'].astype('int')

#### recipes_csv

In [2132]:
# Change type for Recipe Category
recipes_csv["RecipeCategory"] = recipes_csv["RecipeCategory"].astype("category")


In [2133]:
# Add new column TotalTimeNeeded
recipes_csv["TotalTimeNeeded"] = recipes_csv["CookTime"] + recipes_csv["PrepTime"]

#drop recipeservings and yields and saturated + cholesterol + sodium + carbohydratecontent 
recipes_csv = recipes_csv.drop("RecipeServings", axis=1)
recipes_csv = recipes_csv.drop("RecipeYield", axis=1)
#recipes_csv = recipes_csv.drop("SaturatedFatContent", axis=1)
#recipes_csv = recipes_csv.drop("CholesterolContent", axis=1)
#recipes_csv = recipes_csv.drop("SodiumContent", axis=1)
#recipes_csv = recipes_csv.drop("CarbohydrateContent", axis=1)
#also drop cook n preptime n quantities n parts
#recipes_csv = recipes_csv.drop("PrepTime", axis=1)
#recipes_csv = recipes_csv.drop("CookTime", axis=1)
recipes_csv = recipes_csv.drop("RecipeIngredientQuantities", axis=1)
recipes_csv = recipes_csv.drop("RecipeIngredientParts", axis=1)

##### requests_csv

In [2134]:
#rename column
requests_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Map indifferent values for HighProtein and LowSugar
#requests_csv["HighProtein"] = requests_csv["HighProtein"].map({
#    "Indifferent": 0,
#    "0": 0,
#    "1": 1,
#    "Yes": 1
#})
from sklearn.preprocessing import LabelEncoder

mapping = {'Yes': 1, 'No': 0, 'Indifferent': -1}
requests_csv['HighProtein'] = requests_csv['HighProtein'].map(mapping)

#requests_csv["LowSugar"] = requests_csv["LowSugar"].map({
#    "Indifferent": 1,
#    "0": 0,
#   "1": 1
#})

mapping = {'Yes': 1, '0': 0, 'Indifferent': -1}
requests_csv['LowSugar'] = requests_csv['LowSugar'].map(mapping)


In [2135]:
requests_csv

Unnamed: 0,CustomerId,RecipeId,Time,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1799.950949,0.0,-1,0,0,0
1,437641B,365718,4201.820980,0.0,1,0,-1,1
2,1803340263D,141757,6299.861496,0.0,-1,1,-1,0
3,854048B,280351,19801.365796,0.0,1,1,0,1
4,2277685E,180505,5400.093457,0.0,-1,0,0,0
...,...,...,...,...,...,...,...,...
140190,163793B,78171,1560.649725,0.0,-1,0,0,1
140191,33888B,333262,1502.011466,1.0,-1,1,0,0
140192,401942C,49200,5999.274269,0.0,-1,0,0,1
140193,346866B,214815,899.523513,0.0,1,1,-1,1


In [2136]:
# Change types to boolean
requests_csv["HighCalories"] = requests_csv["HighCalories"].astype("int")
requests_csv["LowFat"] = requests_csv["LowFat"].astype("int")
requests_csv["HighFiber"] = requests_csv["HighFiber"].astype("int")
requests_csv["HighProtein"] = requests_csv["HighProtein"].astype("int")
requests_csv["LowSugar"] = requests_csv["LowSugar"].astype("int")

#TODO: how to handle negative time values? -> here: take the total time value from recipes by recipeId!
request_with_recipe = pd.merge(requests_csv, recipes_csv, on="RecipeId", how="left")
request_with_recipe['Time'] = np.where(request_with_recipe['Time'] < 0, request_with_recipe['TotalTimeNeeded'], request_with_recipe['Time'])
#request_with_recipe['Time'] = np.where(request_with_recipe['Time'] < 0, 0, request_with_recipe['Time'])
request_with_recipe['Time'] = request_with_recipe['Time'].round().astype('int')
request_with_recipe.rename(columns={'Time':'MaxTime'}, inplace=True)
requests_csv = request_with_recipe[['CustomerId', 'RecipeId', 'MaxTime', 'HighCalories','HighProtein', 'LowFat', 'LowSugar', 'HighFiber']]
#request_with_recipe = request_with_recipe[['CustomerId','RecipeId', 'Time','TotalTimeNeeded']]

In [2137]:
requests_csv

Unnamed: 0,CustomerId,RecipeId,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,2001012259B,73440,1800,0,-1,0,0,0
1,437641B,365718,4202,0,1,0,-1,1
2,1803340263D,141757,6300,0,-1,1,-1,0
3,854048B,280351,19801,0,1,1,0,1
4,2277685E,180505,5400,0,-1,0,0,0
...,...,...,...,...,...,...,...,...
140190,163793B,78171,1561,0,-1,0,0,1
140191,33888B,333262,1502,1,-1,1,0,0
140192,401942C,49200,5999,0,-1,0,0,1
140193,346866B,214815,900,0,1,1,-1,1


#### reviews_csv

In [2138]:
#rename column
reviews_csv.rename(columns= {"AuthorId" : "CustomerId"}, inplace=True)
# Remove NA rows and Rating column and TestSetId
reviews_csv = reviews_csv.drop("Rating", axis=1)
reviews_csv.dropna(subset=["Like"], inplace=True) # note: now the entries are reduced to 97381 entries
reviews_csv = reviews_csv.drop("TestSetId", axis=1)

In [2139]:
# convert Like type to boolean
reviews_csv["Like"] = reviews_csv["Like"].astype("int")

In [2140]:
#merge diet + review
review_with_diet = pd.merge(reviews_csv, diet_csv, on="CustomerId", how="inner") # 97381 entries
#merge diet + review + request
review_diet_with_request = pd.merge(review_with_diet, requests_csv, on=["CustomerId", "RecipeId"], how="inner") # 97381 entries


In [2141]:
review_diet_with_request

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,MaxTime,HighCalories,HighProtein,LowFat,LowSugar,HighFiber
0,1000036C,320576,0,50,0,0,1,119,0,-1,0,-1,1
1,1000216B,189335,0,78,0,0,1,1199,0,1,0,0,1
2,1000221A,133043,0,25,0,0,1,362,0,1,0,-1,1
3,1000221A,90537,0,25,0,0,1,1199,0,1,0,0,1
4,1000221A,334314,0,25,0,0,1,5400,1,-1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,3899,0,-1,1,-1,0
97377,999774A,29002,0,57,0,0,1,2402,0,-1,0,-1,0
97378,999774A,159252,0,57,0,0,1,6000,0,1,0,0,0
97379,999774A,1171,1,57,0,0,1,480,1,1,0,0,0


In [2142]:
#one-hot encoding recipes
recipes_csv = pd.get_dummies(recipes_csv, columns=['RecipeCategory'], prefix='RecipeCategory')
recipes_csv['RecipeCategory_Beverages'] = recipes_csv['RecipeCategory_Beverages'].astype('int')
recipes_csv['RecipeCategory_Bread'] = recipes_csv['RecipeCategory_Bread'].astype('int')
recipes_csv['RecipeCategory_Breakfast'] = recipes_csv['RecipeCategory_Breakfast'].astype('int')
recipes_csv['RecipeCategory_Lunch'] = recipes_csv['RecipeCategory_Lunch'].astype('int')
recipes_csv['RecipeCategory_One dish meal'] = recipes_csv['RecipeCategory_One dish meal'].astype('int')
recipes_csv.rename(columns={'RecipeCategory_One dish meal': 'RecipeCategory_One_dish_meal'}, inplace=True)
recipes_csv['RecipeCategory_Soup'] = recipes_csv['RecipeCategory_Soup'].astype('int')
recipes_csv['RecipeCategory_Other'] = recipes_csv['RecipeCategory_Other'].astype('int')
recipes_csv.drop('Name', axis=1, inplace=True)



In [2143]:
recipes_csv

Unnamed: 0,RecipeId,CookTime,PrepTime,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,TotalTimeNeeded,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup
0,73440,0,1800,241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,1800,0,0,0,0,0,1,0
1,365718,3600,600,370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,4200,0,0,0,0,0,1,0
2,141757,3600,2700,377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,6300,0,0,0,0,0,1,0
3,280351,18000,1800,282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,19800,0,0,0,0,0,1,0
4,180505,3600,1800,257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,5400,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75599,253577,43200,28800,121.5,0.5,0.1,0.0,1175.1,22.2,7.8,0.6,7.9,72000,0,0,0,0,0,1,0
75600,267827,3600,2700,652.2,25.8,10.7,197.9,435.5,51.9,7.5,7.2,50.1,6300,0,0,0,0,0,1,0
75601,266983,1800,900,223.9,9.2,3.6,78.3,725.9,7.3,1.1,1.7,26.7,2700,0,0,0,0,0,1,0
75602,253739,300,120,2229.8,80.3,69.3,0.0,294.7,369.0,15.7,317.9,26.7,420,0,0,0,0,0,1,0


In [2144]:
# merge 'review_with_diet' with 'recipes_csv'
df = pd.merge(review_diet_with_request, recipes_csv, on='RecipeId', how='left')
df['TimeDeviation'] = df['MaxTime'] - df['TotalTimeNeeded']
#df.drop(['TotalTimeNeeded', 'MaxTime'], axis=1, inplace=True)

In [2145]:
df
#TODO test p value for each variable
#TODO fit model

Unnamed: 0,CustomerId,RecipeId,Like,Age,Diet_Omnivore,Diet_Vegan,Diet_Vegetarian,MaxTime,HighCalories,HighProtein,...,ProteinContent,TotalTimeNeeded,RecipeCategory_Beverages,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One_dish_meal,RecipeCategory_Other,RecipeCategory_Soup,TimeDeviation
0,1000036C,320576,0,50,0,0,1,119,0,-1,...,0.0,120,1,0,0,0,0,0,0,-1
1,1000216B,189335,0,78,0,0,1,1199,0,1,...,16.3,1200,0,0,0,0,0,1,0,-1
2,1000221A,133043,0,25,0,0,1,362,0,1,...,0.2,360,0,0,0,0,0,1,0,2
3,1000221A,90537,0,25,0,0,1,1199,0,1,...,68.9,1200,0,0,1,0,0,0,0,-1
4,1000221A,334314,0,25,0,0,1,5400,1,-1,...,18.1,5400,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,0,31,0,0,1,3899,0,-1,...,5.2,3900,0,0,0,0,0,1,0,-1
97377,999774A,29002,0,57,0,0,1,2402,0,-1,...,32.0,2400,0,0,0,0,0,1,0,2
97378,999774A,159252,0,57,0,0,1,6000,0,1,...,1.1,6000,0,0,0,0,0,1,0,0
97379,999774A,1171,1,57,0,0,1,480,1,1,...,18.2,480,1,0,0,0,0,0,0,0


In [2146]:
import statsmodels.api as sm

X = df.drop(columns=["Like", "CustomerId", "RecipeId", "Diet_Vegan", "RecipeCategory_Other", "PrepTime", "CarbohydrateContent", "Calories", "SaturatedFatContent", "CookTime", "MaxTime"]).copy()
y = df['Like']

In [2147]:
#Test for multicollinearity using VIF test
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

for index, variable_name in enumerate(X.columns):
    if variable_name == "const": 
        continue
    print(f"VIF for variable {variable_name} is {vif(X, index)}")
    

VIF for variable Age is 5.958577953126548
VIF for variable Diet_Omnivore is 1.7398038960247015
VIF for variable Diet_Vegetarian is 6.425025997528694
VIF for variable HighCalories is 1.637488511137945
VIF for variable HighProtein is 1.0422147430311672
VIF for variable LowFat is 1.4081753155345331
VIF for variable LowSugar is 1.4093301772666844
VIF for variable HighFiber is 1.6361456675928467
VIF for variable FatContent is 8.46952296038486
VIF for variable CholesterolContent is 6.065253098004526
VIF for variable SodiumContent is 1.1086606301606432
VIF for variable FiberContent is 1.5461414271685197
VIF for variable SugarContent is 1.5169636418476735
VIF for variable ProteinContent is 9.818465156647411
VIF for variable TotalTimeNeeded is 1.012796106458233
VIF for variable RecipeCategory_Beverages is 1.0461882442778034
VIF for variable RecipeCategory_Bread is 1.0769005152389732
VIF for variable RecipeCategory_Breakfast is 1.0676623233454483
VIF for variable RecipeCategory_Lunch is 1.085349

In [2148]:
# split data into learning and test sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = \
  train_test_split(X, y,
                   test_size=0.3, 
                   shuffle=True,
                   random_state=2024)

In [2149]:
#fit model
model_sm = sm.Logit(y_train, sm.add_constant(X_train))
results = model_sm.fit()
print(results.summary())
# McFadden Ratio
print("McFadden Ratio",results.prsquared)

Optimization terminated successfully.
         Current function value: 0.342637
         Iterations 7
                           Logit Regression Results                           
Dep. Variable:                   Like   No. Observations:                68166
Model:                          Logit   Df Residuals:                    68143
Method:                           MLE   Df Model:                           22
Date:                Sat, 06 Jan 2024   Pseudo R-squ.:                  0.1198
Time:                        23:01:50   Log-Likelihood:                -23356.
converged:                       True   LL-Null:                       -26536.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                           -4.7195      0.071    -66.114      0.000

In [2150]:
# Extract the p-values for each variable
p_values = results.pvalues

# Filter variables with p-values greater than 0.05
variables_to_print = p_values[p_values > 0.1].index

# Print the variables with p-values greater than 0.05
print("Variables with p-values > 0.05:")
print(variables_to_print)

Variables with p-values > 0.05:
Index(['Diet_Vegetarian', 'CholesterolContent', 'SodiumContent',
       'TotalTimeNeeded', 'RecipeCategory_Soup', 'TimeDeviation'],
      dtype='object')


In [2151]:
# Specify the restriction for PrepTime (assuming "PrepTime" is the column name in X)
wald_test_result_prep_time = results.wald_test("(ProteinContent = 0)", scalar=True)

# Print results for PrepTime
print(f"Test statistic (chi^2_{int(wald_test_result_prep_time.df_denom)}-distributed): {wald_test_result_prep_time.statistic}")
print(f"P-value of the statistic: {wald_test_result_prep_time.pvalue}")

Test statistic (chi^2_1-distributed): 17.33216737414434
P-value of the statistic: 3.138293011589294e-05


In [2152]:
# Predict on the test set
X_test_with_const = sm.add_constant(X_test)  # Add constant to the test set
predictions = results.predict(X_test_with_const)
y_pred_test = (predictions > 0.5).astype(int)

# Compare predictions to actual labels
comparison_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_test})

# Print the comparison
print(comparison_df)


       Actual  Predicted
14781       0          0
2909        0          0
43665       0          0
45082       0          0
22571       0          0
...       ...        ...
86061       0          0
37320       0          0
15654       0          0
72934       0          0
52863       0          0

[29215 rows x 2 columns]


In [2153]:
from sklearn.metrics import confusion_matrix, balanced_accuracy_score

# build confusion matrix
confusion_matrix = confusion_matrix(y_test, y_pred_test)
correct_predictions = (y_test == y_pred_test).sum()
total_predictions = len(y_test)
accuracy = correct_predictions / total_predictions

# Calculate Sensitivity and Specificity
sensitivity = confusion_matrix[1, 1] / (confusion_matrix[1, 1] + confusion_matrix[1, 0])
specificity = confusion_matrix[0, 0] / (confusion_matrix[0, 0] + confusion_matrix[0, 1])



# Alternatively, you can use scikit-learn's balanced_accuracy_score function
balanced_accuracy_sklearn = balanced_accuracy_score(y_test, y_pred_test)

print("Accuracy =", "{:.3f}".format(accuracy))
print("Balanced Accuracy:", balanced_accuracy_sklearn)

Accuracy = 0.866
Balanced Accuracy: 0.5103426582296998


In [2154]:
# build confusion matrix
confusion_matrix = pd.crosstab(y_test, y_pred_test)
confusion_matrix

col_0,0,1
Like,Unnamed: 1_level_1,Unnamed: 2_level_1
0,25199,104
1,3815,97



#### 1. Logistic Regression
#### 2. Decision Trees
#### 3. Random Forest
#### 4. Naive bayes
#### 5. Gradient Boosting

#### 1. Data Cleaning (missing values, merge tables)
#### 2. Set dataset into training & testing sets + download testing set
#### 3. modeling
#### 4. Model training: train model on training dataset
#### 5. Model evaluation (performance)
#### 6. prediction on the unseen data