# ANAC

# data read-in

In [27]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import skew, kurtosis, probplot

In [28]:
# Fixed seed: applied to NumPy's global RNG here and reused further down as the
# random_state of the gradient boosting classifier.
seed = 2024
np.random.seed(seed)

In [29]:
# Load the four raw tables from the working directory; each frame is an
# independent copy of what was parsed from disk.
diet_csv, recipes_csv, requests_csv, reviews_csv = (
    pd.read_csv(csv_path).copy()
    for csv_path in ("diet.csv", "recipes.csv", "requests.csv", "reviews.csv")
)

  reviews_csv = pd.read_csv("reviews.csv").copy()


# Basics

In [30]:
# Give every table the same key name ("CustomerId") so they can be joined later.
author_to_customer = {"AuthorId": "CustomerId"}

diet_csv = diet_csv.rename(columns=author_to_customer)
reviews_csv = reviews_csv.rename(columns=author_to_customer)
# The requests table additionally gets "Time" renamed to "MaxTime".
requests_csv = requests_csv.rename(columns={**author_to_customer, "Time": "MaxTime"})

# Store the label-like text columns as pandas categoricals.
diet_csv["Diet"] = diet_csv["Diet"].astype("category")
recipes_csv["RecipeCategory"] = recipes_csv["RecipeCategory"].astype("category")

# Handling missing values

In [31]:
# Missing diets are filled with 'Vegetarian'.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the column selection: that chained in-place form is deprecated in pandas 2.x
# and leaves the frame unchanged when Copy-on-Write is enabled.
diet_csv['Diet'] = diet_csv['Diet'].fillna('Vegetarian')

# Missing serving counts are filled with the mean of the observed values.
recipesServings_mean = recipes_csv['RecipeServings'].mean()
recipes_csv['RecipeServings'] = recipes_csv['RecipeServings'].fillna(recipesServings_mean)

# Normalise the request flags and store them as categoricals.
# Series.map with a dict turns any value that is not a key into NaN.
mapping_cal = {1: 1, 0.0: 0}
requests_csv['HighCalories'] = requests_csv['HighCalories'].map(mapping_cal).astype('category')

mapping_protein = {'Yes': 'Yes', 'Indifferent': 'Indifferent', 'No': 'No', }
requests_csv['HighProtein'] = requests_csv['HighProtein'].map(mapping_protein).astype('category')

requests_csv['LowFat'] = requests_csv['LowFat'].astype('category')

# LowSugar is recoded from '1' / '0' to 'Yes' / 'No'; 'Indifferent' is kept.
mapping_sugar = {'1': 'Yes', 'Indifferent': 'Indifferent', '0': 'No', }
requests_csv['LowSugar'] = requests_csv['LowSugar'].map(mapping_sugar).astype('category')

requests_csv['HighFiber'] = requests_csv['HighFiber'].astype('category')

# Merge tables


In [32]:
# Build the modelling table: labelled reviews joined with diet, request and
# recipe information, then one-hot encoded.

# merge diet + review
reviews_csv["Like"] = reviews_csv["Like"].astype("category")
# Keep only reviews that carry a label and discard the columns not used below.
# errors="ignore" keeps the cell re-runnable: reviews_csv is rebound here, so on
# a second execution "Rating" / "TestSetId" are already gone and a plain drop
# would raise KeyError.
reviews_csv = (
    reviews_csv
    .drop(columns=["Rating", "TestSetId"], errors="ignore")
    .dropna(subset=["Like"])  # note: now the entries are reduced to 97381 entries
)
review_with_diet = pd.merge(reviews_csv, diet_csv, on="CustomerId", how="inner") # 97381 entries

# merge diet + review + request
review_diet_with_request = pd.merge(review_with_diet, requests_csv, on=["CustomerId", "RecipeId"], how="inner") # 97381 entries

# Left join so every review row is kept even if its recipe has no match.
df = pd.merge(review_diet_with_request, recipes_csv, on='RecipeId', how='left')

# One-hot encode the categorical columns; drop_first removes the first level of each.
df = pd.get_dummies(df, columns=['Diet', 'RecipeCategory', 'HighProtein', 'LowSugar'], drop_first=True)

df

Unnamed: 0,CustomerId,RecipeId,Like,Age,MaxTime,HighCalories,LowFat,HighFiber,Name,CookTime,...,Diet_Vegan,Diet_Vegetarian,RecipeCategory_Bread,RecipeCategory_Breakfast,RecipeCategory_Lunch,RecipeCategory_One dish meal,RecipeCategory_Other,RecipeCategory_Soup,HighProtein_Yes,LowSugar_No
0,1000036C,320576,False,50,119.024930,0,0,1,Downeaster,0,...,False,True,False,False,False,False,False,False,False,False
1,1000216B,189335,False,78,1199.386790,0,0,1,Thai Rice Soup (Kao Tome Gai),600,...,False,True,False,False,False,False,True,False,True,True
2,1000221A,133043,False,25,362.152341,0,0,1,Lemon and Thyme Marinade for Poultry,60,...,False,True,False,False,False,False,True,False,True,False
3,1000221A,90537,False,25,1198.957497,0,0,1,Black Bean Salsa,0,...,False,True,False,True,False,False,False,False,True,True
4,1000221A,334314,False,25,5400.036634,1,0,0,Irish Soda Bread,3600,...,False,True,True,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97376,999595E,338070,False,31,3899.421310,0,1,0,Pumpkin Cake Mix Dessert,3000,...,False,True,False,False,False,False,True,False,False,False
97377,999774A,29002,False,57,2402.372535,0,0,0,Summer Corkscrew Pasta,1200,...,False,True,False,False,False,False,True,False,False,False
97378,999774A,159252,False,57,5999.598903,0,0,0,"Chili, Kaffir Lime and Lemongrass Jelly",4800,...,False,True,False,False,False,False,True,False,True,True
97379,999774A,1171,True,57,480.233207,1,0,0,Kahlua Hot Chocolate,360,...,False,True,False,False,False,False,False,False,True,True


# Plot continuous variables vs Like

## Handling outliers

In [33]:

def _column_max(frame, column):
    """Mask of rows whose value in `column` equals the column maximum."""
    return frame[column] >= frame[column].max()


def _above(cutoff):
    """Build a rule flagging rows whose value in the column exceeds `cutoff`."""
    return lambda frame, column: frame[column] > cutoff


def _liked_above(cutoff):
    """Build a rule flagging rows with Like == True and a column value above `cutoff`."""
    return lambda frame, column: (frame['Like'] == True) & (frame[column] > cutoff)


def _replace_with_median(frame, column, *rules):
    """Overwrite flagged values of `column` in place with a median.

    The median is taken once, over the rows NOT flagged by the first rule and
    before anything is overwritten. Each later rule is evaluated on the already
    updated frame and reuses that same median.
    """
    fill_value = None
    for position, rule in enumerate(rules):
        flagged = rule(frame, column)
        if position == 0:
            fill_value = frame.loc[~flagged, column].median()
        frame.loc[flagged, column] = fill_value


# NOTE(review): several rules below depend on `Like`, which is used as the
# prediction target later on - confirm this is intended, since such rules
# cannot be applied to rows without a label.

# Time columns: the column maximum itself is treated as the outlier.
_replace_with_median(df, 'MaxTime', _column_max)
_replace_with_median(df, 'PrepTime', _column_max, _liked_above(3000000))

# Nutrition columns and servings: fixed cutoffs.
_replace_with_median(df, 'Calories', _liked_above(30000))
_replace_with_median(df, 'FatContent', _above(25000), _liked_above(2500))
_replace_with_median(df, 'SaturatedFatContent', _above(12000))
_replace_with_median(df, 'CholesterolContent', _above(35000), _liked_above(10000))
_replace_with_median(df, 'CarbohydrateContent', _liked_above(4000))
_replace_with_median(df, 'FiberContent', _liked_above(400))
_replace_with_median(df, 'SugarContent', _liked_above(4000))
_replace_with_median(df, 'ProteinContent', _above(17500), _liked_above(3000))
_replace_with_median(df, 'RecipeServings', _above(30000), _liked_above(400))


# Data standardization and dimensional reduction

# Fit Model

In [34]:
# Columns that are not used as model inputs; `Like` is the target.
non_feature_columns = [
    'CustomerId',
    'RecipeId',
    'Like',
    'Name',
    'RecipeIngredientQuantities',
    'RecipeIngredientParts',
    'RecipeYield',
    'RecipeCategory_Other',
]

y = df['Like']
X = df.drop(columns=non_feature_columns)

In [35]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Shuffled hold-out split: 30% of the rows go to the test set; the fixed
# random_state makes the partition repeatable.
split_options = dict(test_size=0.3, shuffle=True, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [36]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

# Tune a gradient boosting classifier with an exhaustive grid search
# (3 x 3 x 3 = 27 parameter combinations, 5-fold cross-validation each),
# selecting by balanced accuracy.
gradient_boosting = GradientBoostingClassifier(random_state=seed)

param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.2, 0.4, 0.6],
    'max_depth': [3, 4, 5],
}

# n_jobs=-1 runs the fits on all available cores.
grid_search = GridSearchCV(estimator=gradient_boosting, param_grid=param_grid, scoring='balanced_accuracy', cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
# best_estimator_ is the winning configuration refit on the whole training split.
best_model = grid_search.best_estimator_

# Evaluate once on the held-out test split.
y_pred = best_model.predict(X_test)

balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

print("Best Parameters:", best_params)
# best_score_ is the mean cross-validated score of the best configuration, not
# a score measured on the training data, so the label says so.
print("Best Mean Cross-Validated Balanced Accuracy:", grid_search.best_score_)
print("Balanced Accuracy on Test Data:", balanced_accuracy)




Best Parameters: {'learning_rate': 0.4, 'max_depth': 5, 'n_estimators': 300}
Best Balanced Accuracy on Training Data: 0.7053452009134731
Balanced Accuracy on Test Data: 0.713417094909135
Balanced Accuracy with bagging: 0.713417094909135
