In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
#import sweetviz as sv
#import shap
#from ydata_profiling import ProfileReport


seed = 2024  #seed = 2024: train model as stated in example_crisp_dm_pipeline.ipynb
np.random.seed(seed)

# Pre Data Cleaning: Readin data and preprocessing individual table

##### Recipes

In [2]:
df_recipes = pd.read_csv('data/recipes.csv')

# Consolidated Non-Vegetarian Keywords
non_vegetarian_keywords = list(set([
    'flounder', 'lobsters', 'lump', 'rack', 'shank', 'steak', 'scallops', 'alligator', 
    'livers', 'roe', 'ham', 'turkey', 'chicken', 'duck', 'bacon', 'tuna', 'swordfish', 
    'lobster', 'meatballs', 'salmon', 'sweetbreads', 'breasts', 'chicken-flavored', 
    'ducklings', 'drumstick', 'liver', 'shanks', 'rabbit', 'poultry', 'herring', 
    'mussels', 'clams', 'squid', 'pork', 'veal', 'haddock', 'chorizo', 'chihuahua', 
    'eel', 'stuffing', 'cod', 'gelatin', 'sausage', 'curd', 'thighs', 'lox', 'cabbage', 
    'wonton', 'bone', 'giblets', 'pheasant', 'quail', 'shrimp', 'fish', 'sole', 
    'gizzard', 'Canadian', 'pesto', 'truffles', 'anchovies', 'venison', 'pheasants', 
    'tenderloin', 'meats', 'tripe', 'breast', 'wings', 'ribs', 'sausages', 'trout', 
    'oysters', 'octopus', 'crab', 'prawns', 'catfish', 'sardines', 'mahi', 'halibut', 
    'bass', 'perch', 'tilapia', 'grouper'
]))

# Consolidated Non-Vegan Keywords
non_vegan_keywords = list(set([
    'milk', 'cheese', 'butter', 'egg', 'honey', 'mozzarella-cheddar', 'cream', 'whip', 
    'jarlsberg', 'fontina', 'ham', 'cheesecake', 'hollandaise', 'caviar', 'creamRegular', 
    'custard', 'yogurt', 'gouda', 'margarine', 'beef', 'salmon', 'sour', 'bisquick', 
    'carton', 'cotija', 'creme', 'buttercream', 'buttermilk', 'ricotta', 'cottage', 
    'eggs', 'mayonnaise', 'eggshells', 'lactose-free', 'skim', 'ghee', 'mascarpone', 
    'alfredo', 'whey', 'casein', 'lactose', 'albumin', 'bechamel', 'sour cream', 
    'cream cheese', 'feta', 'gorgonzola', 'parmesan', 'mozzarella', 'cheddar', 'brie', 
    'camembert', 'roquefort', 'stilton', 'blue cheese', 'colby', 'monterey jack', 
    'swiss cheese', 'provolone', 'edam', 'havarti', 'pecorino', 'asiago', 'emmental', 
    'gruyere', 'halloumi', 'manchego', 'paneer', 'queso fresco', 'ricotta salata', 
    'romano', 'taleggio', 'vacherin', 'milk chocolate', 'whey protein', 'casein protein', 
    'egg noodles', 'egg whites', 'egg yolks', 'hollandaise sauce', 'aioli', 'flan', 
    'quiche', 'meringue', 'pavlova', 'egg wash', 'frittata', 'omelette', 'scrambled eggs', 
    'poached eggs', 'hard-boiled eggs', 'deviled eggs', 'eggnog', 'brioche', 'challah', 
    'pound cake', 'sponge cake', 'angel food cake', 'ladyfingers', 'mousse', 'souffle', 
    'creme brulee', 'panna cotta', 'tiramisu', 'yorkshire pudding', 'beef broth', 
    'chicken broth', 'fish sauce', 'oyster sauce', 'worcestershire sauce', 'caesar dressing', 
    'carbonara sauce', 'béarnaise sauce', 'gravlax', 'smoked salmon', 'caviar', 'anchovy paste', 
    'fish stock'
]))


# Function to check if a RecipeIngredientParts is vegetarian
def is_vegetarian(ingredient):
    for keyword in non_vegetarian_keywords:
        if keyword in ingredient.lower():
            return False
    return True

# Function to check if a RecipeIngredientParts is vegan
def is_vegan(ingredient):
    for keyword in non_vegan_keywords:
        if keyword in ingredient.lower():
            return False
    return True

# Apply the is_vegetarian function to the RecipeIngredientParts column
df_recipes['is_vegetarian'] = df_recipes['RecipeIngredientParts'].apply(is_vegetarian)

# Apply the is_vegan function to the RecipeIngredientParts column
df_recipes['is_vegan'] = df_recipes['RecipeIngredientParts'].apply(is_vegan)

# Map the diet category based on the is_vegetarian and is_vegan columns
df_recipes['diet_category'] = df_recipes.apply(lambda row: 'Vegetarian' if row['is_vegetarian'] else 'Vegan' if row['is_vegan'] else 'Omnivore', axis=1)

# create TotalTime_Recipe column
df_recipes['TotalTime_Recipe'] = df_recipes['CookTime'] + df_recipes['PrepTime']

# drop columns
df_recipes = df_recipes.drop(columns=['Name', 'CookTime', 'PrepTime', 'RecipeIngredientParts', 'RecipeIngredientQuantities', 'RecipeYield', 'is_vegetarian', 'is_vegan'])
# dtype conversion
df_recipes["RecipeCategory"] = df_recipes["RecipeCategory"].astype("category")
df_recipes["diet_category"] = df_recipes["diet_category"].astype("category")
# rename columns
df_recipes = df_recipes.rename(columns={"diet_category": "recipe_diet_category"})

#df_recipes.info()
#df_recipes.head()

##### Diet

In [3]:
df_diet = pd.read_csv('data/diet.csv')

# chcek for missing values in the data
print(df_diet.isnull().sum())

# replace missing value in Diet with "Omnivore"
print(df_diet["Diet"].unique())
df_diet["Diet"] = df_diet["Diet"].fillna("Omnivore")

# check again
print(df_diet.isnull().sum())

# Change data type of Diet to category
df_diet["Diet"] = df_diet["Diet"].astype("category")

# rename the column Diet to diet_category
df_diet = df_diet.rename(columns={"Diet": "user_diet_category"})

#df_diet.info()
#df_diet.head()


AuthorId    0
Diet        1
Age         0
dtype: int64
['Vegetarian' 'Vegan' 'Omnivore' nan]
AuthorId    0
Diet        0
Age         0
dtype: int64


##### Request

In [4]:
df_requests = pd.read_csv('data/requests.csv')

# check for missing values
print(df_requests.isnull().sum())

#dtype
df_requests['HighCalories'] = df_requests['HighCalories'].astype('boolean')

df_requests['HighProtein'] = df_requests['HighProtein'].replace({'Indifferent': False, 'Yes': True})
df_requests['HighProtein'] = df_requests['HighProtein'].astype('boolean')

df_requests['LowFat'] = df_requests['LowFat'].astype('boolean')

df_requests['LowSugar'] = df_requests['LowSugar'].replace({'Indifferent': False, '0': True})
df_requests['LowSugar'] = df_requests['LowSugar'].astype('boolean')

df_requests['HighFiber'] = df_requests['HighFiber'].astype('boolean')

# rename columns
df_requests.rename(columns={'Time': 'TotalTime_Requested'}, inplace=True)
df_requests.rename(columns={'HighCalories': 'HighCalories_Requested'}, inplace=True)
df_requests.rename(columns={'HighProtein': 'HighProtein_Requested'}, inplace=True)
df_requests.rename(columns={'LowFat': 'LowFat_Requested'}, inplace=True)
df_requests.rename(columns={'LowSugar': 'LowSugar_Requested'}, inplace=True)
df_requests.rename(columns={'HighFiber': 'HighFiber_Requested'}, inplace=True)

#df_requests.info() 
#df_requests.head()


AuthorId        0
RecipeId        0
Time            0
HighCalories    0
HighProtein     0
LowFat          0
LowSugar        0
HighFiber       0
dtype: int64


##### Review

In [5]:
df_reviews = pd.read_csv('data/reviews.csv')

#sns.countplot(data=df_reviews, x='Rating')  # Rating is only 2 except 2 rows -> drop Rating column
df_reviews = df_reviews.drop('Rating', axis=1)

# check for missing values
# print(df_reviews.isnull().sum())

# dtype 
df_reviews['Like'] = df_reviews['Like'].astype('boolean')

#df_reviews.info()
#df_reviews.head()

  df_reviews = pd.read_csv('data/reviews.csv')


# Data aggregation (Merge the tables)

In [6]:
# keep all request, add info about custormers diet when exit -> df_diet right_join df_requests
merged_df_diet_request = df_diet.merge(df_requests, on='AuthorId', how='right')
#merged_df_diet_request.head(100)

# request without matched recipe, or recipe without request is useless  -> normal join 
merged_df_diet_request_recipes = merged_df_diet_request.merge(df_recipes, on='RecipeId')
#merged_df_diet_request_recipes.tail(100)

# review without request,recipes is useless -> left 
merged_df_diet_request_recipes_reviews = merged_df_diet_request_recipes.merge(df_reviews, on=['RecipeId', 'AuthorId'], how='left')
#merged_df_diet_request_recipes_reviews.info()

merged_df = merged_df_diet_request_recipes_reviews

# Post Data Cleaning (after merged) 

In [7]:
# https://ishanjainoffical.medium.com/choosing-the-right-correlation-pearson-vs-spearman-vs-kendalls-tau-02dc7d7dd01d
def plot_corr(df, title, is_like=True):
    if 'Like' in df:
        df = df[df['Like'] == 1]
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(25, 7))
    nutrients_corr = df.corr(method='kendall') 
    mask = np.triu(np.ones_like(nutrients_corr, dtype=bool))
    cmap = sns.color_palette("coolwarm", as_cmap=True)
    sns.heatmap(nutrients_corr, mask=mask, cmap=cmap, annot=True, fmt=".2f", ax=ax1, center=0)
    ax1.set_title(title + ' - kendall', fontsize=16)
    nutrients_corr = df.corr(method='pearson')
    mask = np.triu(np.ones_like(nutrients_corr, dtype=bool))
    cmap = sns.color_palette("coolwarm", as_cmap=True)
    sns.heatmap(nutrients_corr, mask=mask, cmap=cmap, annot=True, fmt=".2f", ax=ax2, center=0)
    ax2.set_title(title + ' - pearson', fontsize=16)
    plt.show()

In [8]:
# Filter 5% outliers for each recipes nutrient
df_nutrients = merged_df[['Calories', 'FatContent', 'SaturatedFatContent', 'CholesterolContent', 'SodiumContent', 'CarbohydrateContent', 'FiberContent', 'SugarContent', 'ProteinContent',
                          'HighCalories_Requested', 'HighProtein_Requested', 'LowFat_Requested', 'LowSugar_Requested', 'HighFiber_Requested', 'Like']]

threshold_map = {}
for col in df_nutrients.columns:
    if col in ['HighCalories_Requested', 'HighProtein_Requested', 'LowFat_Requested', 'LowSugar_Requested', 'HighFiber_Requested', 'Like']:
        continue
    for i in range(10000):
        threshold = i
        percent = (df_nutrients[df_nutrients[col] > threshold][col].count() / df_nutrients[col].count())*100
        if percent <= 5:
            threshold_map[col] = threshold
            break
print(threshold_map)

merged_df = merged_df[merged_df['Calories'] < threshold_map['Calories']]
merged_df = merged_df[merged_df['FatContent'] < threshold_map['FatContent']]
merged_df = merged_df[merged_df['SaturatedFatContent'] < threshold_map['SaturatedFatContent']]
merged_df = merged_df[merged_df['CholesterolContent'] < threshold_map['CholesterolContent']]
merged_df = merged_df[merged_df['SodiumContent'] < threshold_map['SodiumContent']]
merged_df = merged_df[merged_df['CarbohydrateContent'] < threshold_map['CarbohydrateContent']]
merged_df = merged_df[merged_df['FiberContent'] < threshold_map['FiberContent']]
merged_df = merged_df[merged_df['SugarContent'] < threshold_map['SugarContent']]
merged_df = merged_df[merged_df['ProteinContent'] < threshold_map['ProteinContent']]


# -> Drop unimportant columns 
merged_df = merged_df.drop(columns=['AuthorId', 'RecipeId', 'TotalTime_Requested', 'TotalTime_Recipe', 'RecipeServings', 'RecipeCategory', 'SaturatedFatContent', 'CholesterolContent', 'FiberContent', 'SugarContent', 'LowSugar_Requested', 'HighFiber_Requested', 'Age'])
# One-Hot_encoding for categorical columns
merged_df = pd.get_dummies(merged_df, columns=['user_diet_category', 'recipe_diet_category'])


{'Calories': 1304, 'FatContent': 76, 'SaturatedFatContent': 30, 'CholesterolContent': 290, 'SodiumContent': 2219, 'CarbohydrateContent': 138, 'FiberContent': 13, 'SugarContent': 72, 'ProteinContent': 54}


In [9]:
merged_df.info()
merged_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 115349 entries, 0 to 140194
Data columns (total 16 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   HighCalories_Requested           115349 non-null  boolean
 1   HighProtein_Requested            115349 non-null  boolean
 2   LowFat_Requested                 115349 non-null  boolean
 3   Calories                         115349 non-null  float64
 4   FatContent                       115349 non-null  float64
 5   SodiumContent                    115349 non-null  float64
 6   CarbohydrateContent              115349 non-null  float64
 7   ProteinContent                   115349 non-null  float64
 8   Like                             80145 non-null   boolean
 9   TestSetId                        35204 non-null   float64
 10  user_diet_category_Omnivore      115349 non-null  bool   
 11  user_diet_category_Vegan         115349 non-null  bool   
 12  user_di

Unnamed: 0,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,Calories,FatContent,SodiumContent,CarbohydrateContent,ProteinContent,Like,TestSetId,user_diet_category_Omnivore,user_diet_category_Vegan,user_diet_category_Vegetarian,recipe_diet_category_Omnivore,recipe_diet_category_Vegan,recipe_diet_category_Vegetarian
0,False,False,False,241.3,10.1,13.1,31.8,6.7,False,,False,False,True,False,False,True
1,True,True,True,241.3,10.1,13.1,31.8,6.7,False,,False,False,True,False,False,True
2,True,False,False,241.3,10.1,13.1,31.8,6.7,False,,False,False,True,False,False,True
3,True,True,True,241.3,10.1,13.1,31.8,6.7,False,,False,True,False,False,False,True
4,False,True,True,241.3,10.1,13.1,31.8,6.7,False,,False,True,False,False,False,True


# Data Spliting : Test - Train - Val 

<span style="color:red">

- randomly split with shuffle=True  (Note: remember the random_state number to be able to reproduce the split) 
- k-cross validation? 

</span>

In [10]:
from sklearn.model_selection import train_test_split

# TrainVal vs. Test split
test_dataframe = merged_df[merged_df['TestSetId'].notna()]
#test_dataframe.head(100)

# Train vs. Val split
train_val_dataframe = merged_df[merged_df['TestSetId'].isna()]

# Prepare train val for training 
train_val_dataframe = merged_df[merged_df['Like'].notna()]
train_val_dataframe = train_val_dataframe.drop('TestSetId', axis=1)
# find duplicated rows in the dataframe and drop
print("remove ", train_val_dataframe[train_val_dataframe.duplicated()].count(), "duplicate rows")
train_val_dataframe = train_val_dataframe.drop_duplicates()

# put Target (Like column) at the end 
like_column = train_val_dataframe.pop('Like')
train_val_dataframe['Like'] = like_column
train_val_dataframe['Like'] = train_val_dataframe['Like'].astype(int)
#train_val_dataframe.head(100)

X_train, X_val, y_train, y_val = \
  train_test_split(train_val_dataframe.iloc[:, :-1], train_val_dataframe.iloc[:, -1:],
                   test_size=0.1, 
                   shuffle=True,
                   random_state=3)

X_train.info()
#X_val.head()
#y_train.info()
#y_val.head()

# count like of train
print('Train:')
print(y_train['Like'].value_counts())
print('Val:')
print(y_val['Like'].value_counts())


remove  HighCalories_Requested             10157
HighProtein_Requested              10157
LowFat_Requested                   10157
Calories                           10157
FatContent                         10157
SodiumContent                      10157
CarbohydrateContent                10157
ProteinContent                     10157
Like                               10157
user_diet_category_Omnivore        10157
user_diet_category_Vegan           10157
user_diet_category_Vegetarian      10157
recipe_diet_category_Omnivore      10157
recipe_diet_category_Vegan         10157
recipe_diet_category_Vegetarian    10157
dtype: int64 duplicate rows
<class 'pandas.core.frame.DataFrame'>
Index: 62989 entries, 54607 to 134615
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   HighCalories_Requested           62989 non-null  boolean
 1   HighProtein_Requested            62989 non-null  

# Training

Änderung: 
Bei meta_parameter_grid wurde hinzugefügt:
- parameter_grid_gaussianNB
- parameter_grid_linearSVC

In [11]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.naive_bayes import GaussianNB , MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

model_logistic_regression = LogisticRegression(max_iter=30)
model_random_forest = RandomForestClassifier()
model_gradient_boosting = GradientBoostingClassifier()
model_gauusianNB = GaussianNB()
model_linearSVC = LinearSVC()

# data scaling
transform_scaler = StandardScaler()

# dimensionality reduction
transform_pca = PCA()

# train the models
pipeline = Pipeline(steps=[("scaler", transform_scaler), 
                           ("pca", transform_pca),
                           ("model", None)])

parameter_grid_preprocessing = {
  "pca__n_components" : [13, 14],
}

parameter_grid_gaussianNB = {
  "model" : [model_gauusianNB],
  "model__var_smoothing": [1e-9, 1e-8, 1e-7]
}

# parameter_grid_linearSVC = {
#  "model" : [model_linearSVC],
#  "model__C": [0.1, 1, 10],  # Regularization parameter
#  "model__kernels": ['linear', 'rbf'],  # Kernel type
#  "model__gamma": [0.1, 1, 10]  # Kernel coefficient for 'rbf'
# }
parameter_grid_linearSVC = {
    "model": [model_linearSVC],
    "model__C": [0.1, 1, 10]  # Regularization parameter
}


parameter_grid_logistic_regression = {
  "model" : [model_logistic_regression],
  "model__C" : [0.1, 1, 10],  # inverse regularization strength
}

parameter_grid_gradient_boosting = {
  "model" : [model_gradient_boosting],
  "model__n_estimators" : [10, 20, 30]
}

parameter_grid_random_forest = {
  "model" : [model_random_forest],
  "model__n_estimators" : [10, 20, 50],  # number of max trees in the forest
  "model__max_depth" : [12, 13],
}

meta_parameter_grid = [parameter_grid_logistic_regression,
                       parameter_grid_random_forest,
                       parameter_grid_gradient_boosting,
                       parameter_grid_gaussianNB,
                       parameter_grid_linearSVC
]

meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                       for model_grid in meta_parameter_grid]

search = GridSearchCV(pipeline,
                      meta_parameter_grid, 
                      scoring="balanced_accuracy",
                      n_jobs=2, 
                      cv=5,  # number of folds for cross-validation 
                      error_score="raise"
)

# here, the actual training and grid search happens
search.fit(X_train, y_train.values.ravel())

print("best parameter:", search.best_params_ ,"(CV score=%0.3f)" % search.best_score_)

KeyboardInterrupt: 

# Evaluation

In [None]:
# evaluate performance of model on test set
print("Score on test set:", search.score(X_val, y_val.values.ravel()))

# prediction and show contingency table
ct = pd.crosstab(search.best_estimator_.predict(X_val), y_val.values.ravel(),
                 rownames=["pred"], colnames=["true"])
print(ct)

Score on test set: 0.5739818241966528
true     0     1
pred            
0     8278  1074
1      170   217


In [None]:
# (optional, if you're curious) for a detailed look on the performance of the different models
def get_search_score_overview():
  for c,s in zip(search.cv_results_["params"],search.cv_results_["mean_test_score"]):
      print(c, s)

print(get_search_score_overview())

{'model': LogisticRegression(max_iter=30), 'model__C': 0.1, 'pca__n_components': 13} 0.501821998328961
{'model': LogisticRegression(max_iter=30), 'model__C': 0.1, 'pca__n_components': 14} 0.501821998328961
{'model': LogisticRegression(max_iter=30), 'model__C': 1, 'pca__n_components': 13} 0.50195140418955
{'model': LogisticRegression(max_iter=30), 'model__C': 1, 'pca__n_components': 14} 0.50195140418955
{'model': LogisticRegression(max_iter=30), 'model__C': 10, 'pca__n_components': 13} 0.50195140418955
{'model': LogisticRegression(max_iter=30), 'model__C': 10, 'pca__n_components': 14} 0.50195140418955
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'model__max_depth': 12, 'model__n_estimators': 10, 'pca__n_components': 13} 0.5609268322021761
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'model__max_depth': 12, 'model__n_estimators': 10, 'pca__n_components': 14} 0.5555012650925621
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'model_

In [None]:
# prepare test data for prediction
test_set_id = test_dataframe.pop('TestSetId')
test_dataframe = test_dataframe.drop('Like', axis=1)
test_dataframe.head()

Unnamed: 0,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,Calories,FatContent,SodiumContent,CarbohydrateContent,ProteinContent,user_diet_category_Omnivore,user_diet_category_Vegan,user_diet_category_Vegetarian,recipe_diet_category_Omnivore,recipe_diet_category_Vegan,recipe_diet_category_Vegetarian
5,False,True,False,241.3,10.1,13.1,31.8,6.7,False,True,False,False,False,True
7,False,False,False,241.3,10.1,13.1,31.8,6.7,False,False,True,False,False,True
8,False,False,False,241.3,10.1,13.1,31.8,6.7,False,False,True,False,False,True
14,False,True,False,241.3,10.1,13.1,31.8,6.7,False,True,False,False,False,True
15,False,True,False,241.3,10.1,13.1,31.8,6.7,True,False,False,False,False,True


In [None]:
# prediction
model = search.best_estimator_ 
test_dataframe["Like"] = model.predict(test_dataframe)

#TODO: 

# prediction := List if Like 
# test_set_id := List of test ID

# write to CSV file in the same order  (den Code unten anpassenm)
# 1.ID  1.Like 
# 2.ID  2.Like

output = pd.DataFrame(test_dataframe["Like"])
output["id"] = test_set_id.astype("int")

output = output.rename(columns={'Like': 'prediction'})
output = output.reindex(columns=["id", "prediction"])

output.to_csv('recipe_prediction.csv', index=False)

For deployment

In [None]:
# Let's assume that our id column is the index of the dataframe

# print(test_dataframe)
#output = pd.DataFrame(test_dataframe["Like"])
# output = output.reset_index(drop=True)
#output["id"] = output.index + 1
#output = output.rename(columns={'Like': 'prediction'})
#output = output.reindex(columns=["id", "prediction"])
# output length
#print(len(output))
#output.to_csv('recipe_prediction.csv', index=False)