In [86]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Note: The following do not work with Python 3.12
#import sweetviz as sv
#import shap
#from ydata_profiling import ProfileReport


seed = 2024  #seed = 2024: train model as stated in example_crisp_dm_pipeline.ipynb
np.random.seed(seed)

# Pre Data Cleaning: Readin data and preprocessing individual table

##### Recipes

In [87]:
df_recipes = pd.read_csv('data/recipes.csv')

# Consolidated Non-Vegetarian Keywords
non_vegetarian_keywords = list(set([
    'flounder', 'lobsters', 'lump', 'rack', 'shank', 'steak', 'scallops', 'alligator', 
    'livers', 'roe', 'ham', 'turkey', 'chicken', 'duck', 'bacon', 'tuna', 'swordfish', 
    'lobster', 'meatballs', 'salmon', 'sweetbreads', 'breasts', 'chicken-flavored', 
    'ducklings', 'drumstick', 'liver', 'shanks', 'rabbit', 'poultry', 'herring', 
    'mussels', 'clams', 'squid', 'pork', 'veal', 'haddock', 'chorizo', 'chihuahua', 
    'eel', 'stuffing', 'cod', 'gelatin', 'sausage', 'curd', 'thighs', 'lox', 'cabbage', 
    'wonton', 'bone', 'giblets', 'pheasant', 'quail', 'shrimp', 'fish', 'sole', 
    'gizzard', 'Canadian', 'pesto', 'truffles', 'anchovies', 'venison', 'pheasants', 
    'tenderloin', 'meats', 'tripe', 'breast', 'wings', 'ribs', 'sausages', 'trout', 
    'oysters', 'octopus', 'crab', 'prawns', 'catfish', 'sardines', 'mahi', 'halibut', 
    'bass', 'perch', 'tilapia', 'grouper'
]))

# Consolidated Non-Vegan Keywords
non_vegan_keywords = list(set([
    'milk', 'cheese', 'butter', 'egg', 'honey', 'mozzarella-cheddar', 'cream', 'whip', 
    'jarlsberg', 'fontina', 'ham', 'cheesecake', 'hollandaise', 'caviar', 'creamRegular', 
    'custard', 'yogurt', 'gouda', 'margarine', 'beef', 'salmon', 'sour', 'bisquick', 
    'carton', 'cotija', 'creme', 'buttercream', 'buttermilk', 'ricotta', 'cottage', 
    'eggs', 'mayonnaise', 'eggshells', 'lactose-free', 'skim', 'ghee', 'mascarpone', 
    'alfredo', 'whey', 'casein', 'lactose', 'albumin', 'bechamel', 'sour cream', 
    'cream cheese', 'feta', 'gorgonzola', 'parmesan', 'mozzarella', 'cheddar', 'brie', 
    'camembert', 'roquefort', 'stilton', 'blue cheese', 'colby', 'monterey jack', 
    'swiss cheese', 'provolone', 'edam', 'havarti', 'pecorino', 'asiago', 'emmental', 
    'gruyere', 'halloumi', 'manchego', 'paneer', 'queso fresco', 'ricotta salata', 
    'romano', 'taleggio', 'vacherin', 'milk chocolate', 'whey protein', 'casein protein', 
    'egg noodles', 'egg whites', 'egg yolks', 'hollandaise sauce', 'aioli', 'flan', 
    'quiche', 'meringue', 'pavlova', 'egg wash', 'frittata', 'omelette', 'scrambled eggs', 
    'poached eggs', 'hard-boiled eggs', 'deviled eggs', 'eggnog', 'brioche', 'challah', 
    'pound cake', 'sponge cake', 'angel food cake', 'ladyfingers', 'mousse', 'souffle', 
    'creme brulee', 'panna cotta', 'tiramisu', 'yorkshire pudding', 'beef broth', 
    'chicken broth', 'fish sauce', 'oyster sauce', 'worcestershire sauce', 'caesar dressing', 
    'carbonara sauce', 'béarnaise sauce', 'gravlax', 'smoked salmon', 'caviar', 'anchovy paste', 
    'fish stock'
]))


# Function to check if a RecipeIngredientParts is vegetarian
def is_vegetarian(ingredient):
    for keyword in non_vegetarian_keywords:
        if keyword in ingredient.lower():
            return False
    return True

# Function to check if a RecipeIngredientParts is vegan
def is_vegan(ingredient):
    for keyword in non_vegan_keywords:
        if keyword in ingredient.lower():
            return False
    return True

# Apply the is_vegetarian function to the RecipeIngredientParts column
df_recipes['is_vegetarian'] = df_recipes['RecipeIngredientParts'].apply(is_vegetarian)

# Apply the is_vegan function to the RecipeIngredientParts column
df_recipes['is_vegan'] = df_recipes['RecipeIngredientParts'].apply(is_vegan)

# Map the diet category based on the is_vegetarian and is_vegan columns
df_recipes['diet_category'] = df_recipes.apply(lambda row: 'Vegetarian' if row['is_vegetarian'] else 'Vegan' if row['is_vegan'] else 'Omnivore', axis=1)

# create TotalTime_Recipe column
df_recipes['TotalTime_Recipe'] = df_recipes['CookTime'] + df_recipes['PrepTime']

# drop columns
df_recipes = df_recipes.drop(columns=['Name', 'CookTime', 'PrepTime', 'RecipeIngredientParts', 'RecipeIngredientQuantities', 'RecipeYield', 'is_vegetarian', 'is_vegan'])
# dtype conversion
df_recipes["RecipeCategory"] = df_recipes["RecipeCategory"].astype("category")
df_recipes["diet_category"] = df_recipes["diet_category"].astype("category")
# rename columns
df_recipes = df_recipes.rename(columns={"diet_category": "recipe_diet_category"})

df_recipes.info()
df_recipes.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75604 entries, 0 to 75603
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype   
---  ------                --------------  -----   
 0   RecipeId              75604 non-null  int64   
 1   RecipeCategory        75604 non-null  category
 2   Calories              75604 non-null  float64 
 3   FatContent            75604 non-null  float64 
 4   SaturatedFatContent   75604 non-null  float64 
 5   CholesterolContent    75604 non-null  float64 
 6   SodiumContent         75604 non-null  float64 
 7   CarbohydrateContent   75604 non-null  float64 
 8   FiberContent          75604 non-null  float64 
 9   SugarContent          75604 non-null  float64 
 10  ProteinContent        75604 non-null  float64 
 11  RecipeServings        48891 non-null  float64 
 12  recipe_diet_category  75604 non-null  category
 13  TotalTime_Recipe      75604 non-null  int64   
dtypes: category(2), float64(10), int64(2)
memory usage: 7.

Unnamed: 0,RecipeId,RecipeCategory,Calories,FatContent,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,recipe_diet_category,TotalTime_Recipe
0,73440,Other,241.3,10.1,1.2,0.0,13.1,31.8,2.3,1.4,6.7,9.0,Vegetarian,1800
1,365718,Other,370.8,17.5,7.2,22.9,553.3,44.3,1.6,2.2,9.4,8.0,Omnivore,4200
2,141757,Other,377.6,20.9,10.5,45.7,1501.8,36.6,3.8,6.1,12.9,8.0,Vegetarian,6300
3,280351,Other,282.8,16.5,10.3,50.5,630.2,22.8,2.3,2.7,11.7,6.0,Omnivore,19800
4,180505,Other,257.5,8.6,2.4,110.7,160.9,39.8,0.4,30.2,6.3,6.0,Vegan,5400


##### Diet

In [88]:
df_diet = pd.read_csv('data/diet.csv')

# chcek for missing values in the data
print(df_diet.isnull().sum())

# replace missing value in Diet with "Omnivore"
print(df_diet["Diet"].unique())
df_diet["Diet"] = df_diet["Diet"].fillna("Omnivore")

# check again
print(df_diet.isnull().sum())

# Change data type of Diet to category
df_diet["Diet"] = df_diet["Diet"].astype("category")

# rename the column Diet to diet_category
df_diet = df_diet.rename(columns={"Diet": "user_diet_category"})

df_diet.info()
df_diet.head()


AuthorId    0
Diet        1
Age         0
dtype: int64
['Vegetarian' 'Vegan' 'Omnivore' nan]
AuthorId    0
Diet        0
Age         0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271907 entries, 0 to 271906
Data columns (total 3 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   AuthorId            271907 non-null  object  
 1   user_diet_category  271907 non-null  category
 2   Age                 271907 non-null  int64   
dtypes: category(1), int64(1), object(1)
memory usage: 4.4+ MB


Unnamed: 0,AuthorId,user_diet_category,Age
0,10000120E,Vegetarian,46
1,1000014D,Vegan,18
2,1000015A,Vegetarian,58
3,1000016E,Vegetarian,32
4,1000027E,Vegan,61


##### Request

In [89]:
df_requests = pd.read_csv('data/requests.csv')

# check for missing values
print(df_requests.isnull().sum())

#dtype
df_requests['HighCalories'] = df_requests['HighCalories'].astype('boolean')

df_requests['HighProtein'] = df_requests['HighProtein'].replace({'Indifferent': False, 'Yes': True})
df_requests['HighProtein'] = df_requests['HighProtein'].astype('boolean')

df_requests['LowFat'] = df_requests['LowFat'].astype('boolean')

df_requests['LowSugar'] = df_requests['LowSugar'].replace({'Indifferent': False, '0': True})
df_requests['LowSugar'] = df_requests['LowSugar'].astype('boolean')

df_requests['HighFiber'] = df_requests['HighFiber'].astype('boolean')

# rename columns
df_requests.rename(columns={'Time': 'TotalTime_Requested'}, inplace=True)
df_requests.rename(columns={'HighCalories': 'HighCalories_Requested'}, inplace=True)
df_requests.rename(columns={'HighProtein': 'HighProtein_Requested'}, inplace=True)
df_requests.rename(columns={'LowFat': 'LowFat_Requested'}, inplace=True)
df_requests.rename(columns={'LowSugar': 'LowSugar_Requested'}, inplace=True)
df_requests.rename(columns={'HighFiber': 'HighFiber_Requested'}, inplace=True)

df_requests.info() 
df_requests.head()


AuthorId        0
RecipeId        0
Time            0
HighCalories    0
HighProtein     0
LowFat          0
LowSugar        0
HighFiber       0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 8 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   AuthorId                140195 non-null  object 
 1   RecipeId                140195 non-null  int64  
 2   TotalTime_Requested     140195 non-null  float64
 3   HighCalories_Requested  140195 non-null  boolean
 4   HighProtein_Requested   140195 non-null  boolean
 5   LowFat_Requested        140195 non-null  boolean
 6   LowSugar_Requested      140195 non-null  boolean
 7   HighFiber_Requested     140195 non-null  boolean
dtypes: boolean(5), float64(1), int64(1), object(1)
memory usage: 4.5+ MB


Unnamed: 0,AuthorId,RecipeId,TotalTime_Requested,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,LowSugar_Requested,HighFiber_Requested
0,2001012259B,73440,1799.950949,False,False,False,True,False
1,437641B,365718,4201.82098,False,True,False,False,True
2,1803340263D,141757,6299.861496,False,False,True,False,False
3,854048B,280351,19801.365796,False,True,True,True,True
4,2277685E,180505,5400.093457,False,False,False,True,False


##### Review

In [90]:
df_reviews = pd.read_csv('data/reviews.csv')

#sns.countplot(data=df_reviews, x='Rating')  # Rating is only 2 except 2 rows -> drop Rating column
df_reviews = df_reviews.drop('Rating', axis=1)

# check for missing values
# print(df_reviews.isnull().sum())

# dtype 
df_reviews['Like'] = df_reviews['Like'].astype('boolean')

df_reviews.info()
df_reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   AuthorId   140195 non-null  object 
 1   RecipeId   140195 non-null  int64  
 2   Like       97381 non-null   boolean
 3   TestSetId  42814 non-null   float64
dtypes: boolean(1), float64(1), int64(1), object(1)
memory usage: 3.5+ MB


  df_reviews = pd.read_csv('data/reviews.csv')


Unnamed: 0,AuthorId,RecipeId,Like,TestSetId
0,2492191A,33671,,1.0
1,2002019979A,92647,,2.0
2,408594E,161770,,3.0
3,2001625557E,108231,,4.0
4,2001427116E,71109,,5.0


# Data aggregation (Merge the tables)

In [91]:
# keep all request, add info about custormers diet when exit -> df_diet right_join df_requests
merged_df_diet_request = df_diet.merge(df_requests, on='AuthorId', how='right')
#merged_df_diet_request.head(100)

# request without matched recipe, or recipe without request is useless  -> normal join 
merged_df_diet_request_recipes = merged_df_diet_request.merge(df_recipes, on='RecipeId')
#merged_df_diet_request_recipes.tail(100)

# review without request,recipes is useless -> left 
merged_df_diet_request_recipes_reviews = merged_df_diet_request_recipes.merge(df_reviews, on=['RecipeId', 'AuthorId'], how='left')
#merged_df_diet_request_recipes_reviews.info()

merged_df = merged_df_diet_request_recipes_reviews

# Post Data Cleaning (after merged) 

In [92]:
# https://ishanjainoffical.medium.com/choosing-the-right-correlation-pearson-vs-spearman-vs-kendalls-tau-02dc7d7dd01d
def plot_corr(df, title, is_like=True):
    if 'Like' in df:
        df = df[df['Like'] == 1]
    fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(25, 7))
    nutrients_corr = df.corr(method='kendall') 
    mask = np.triu(np.ones_like(nutrients_corr, dtype=bool))
    cmap = sns.color_palette("coolwarm", as_cmap=True)
    sns.heatmap(nutrients_corr, mask=mask, cmap=cmap, annot=True, fmt=".2f", ax=ax1, center=0)
    ax1.set_title(title + ' - kendall', fontsize=16)
    nutrients_corr = df.corr(method='pearson')
    mask = np.triu(np.ones_like(nutrients_corr, dtype=bool))
    cmap = sns.color_palette("coolwarm", as_cmap=True)
    sns.heatmap(nutrients_corr, mask=mask, cmap=cmap, annot=True, fmt=".2f", ax=ax2, center=0)
    ax2.set_title(title + ' - pearson', fontsize=16)
    plt.show()

In [93]:
# -> Drop
merged_df = merged_df.drop(columns=['AuthorId', 'RecipeId', 'TotalTime_Requested', 'TotalTime_Recipe', 'RecipeServings', 'RecipeCategory', 'SaturatedFatContent', 'CholesterolContent', 'FiberContent', 'SugarContent', 'LowSugar_Requested', 'HighFiber_Requested', 'Age'])
# One-Hot_encoding
# merged_df = pd.get_dummies(merged_df, columns=['user_diet_category', 'recipe_diet_category'])

# drop user_diet_category, recipe_diet_category
merged_df = merged_df.drop(columns=['user_diet_category', 'recipe_diet_category'])

merged_df


Unnamed: 0,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,Calories,FatContent,SodiumContent,CarbohydrateContent,ProteinContent,Like,TestSetId
0,False,False,False,241.3,10.1,13.1,31.8,6.7,False,
1,True,True,True,241.3,10.1,13.1,31.8,6.7,False,
2,True,False,False,241.3,10.1,13.1,31.8,6.7,False,
3,True,True,True,241.3,10.1,13.1,31.8,6.7,False,
4,False,True,True,241.3,10.1,13.1,31.8,6.7,False,
...,...,...,...,...,...,...,...,...,...,...
140190,False,False,True,121.5,0.5,1175.1,22.2,7.9,True,
140191,False,True,True,652.2,25.8,435.5,51.9,50.1,,7148.0
140192,False,True,False,223.9,9.2,725.9,7.3,26.7,True,
140193,True,False,False,2229.8,80.3,294.7,369.0,26.7,True,


In [94]:
# add a new column merged_df['same_category]: 
    # = 1 if recipe_category == Ominvore but user_diet_category == Vegetarian or Vegan
    # = 1 if recipe_category == Vegetarian but user_diet_category == Vegan 
    # else 0 

#merged_df['same_category'] = merged_df.apply(lambda row: 1 if (row['recipe_diet_category'] == 'Omnivore' and (row['user_diet_category'] == 'Vegetarian' or row['user_diet_category'] == 'Vegan')) 
#                    or ((row['recipe_diet_category'] == 'Vegetarian' or row['recipe_diet_category'] == 'Omnivore') and row['user_diet_category'] == 'Vegan') else 0, axis=1)

# corr between Like and same_category
#merged_df[['Like', 'same_category']].corr(method='kendall')

In [95]:
# plot corr for whole merged_df
#plot_corr(merged_df.drop(columns=['user_diet_category', 'recipe_diet_category']), title="Full data without categorical columns", is_like=True)


In [96]:
merged_df.info()
merged_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140195 entries, 0 to 140194
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   HighCalories_Requested  140195 non-null  boolean
 1   HighProtein_Requested   140195 non-null  boolean
 2   LowFat_Requested        140195 non-null  boolean
 3   Calories                140195 non-null  float64
 4   FatContent              140195 non-null  float64
 5   SodiumContent           140195 non-null  float64
 6   CarbohydrateContent     140195 non-null  float64
 7   ProteinContent          140195 non-null  float64
 8   Like                    97381 non-null   boolean
 9   TestSetId               42814 non-null   float64
dtypes: boolean(4), float64(6)
memory usage: 7.5 MB


Unnamed: 0,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,Calories,FatContent,SodiumContent,CarbohydrateContent,ProteinContent,Like,TestSetId
0,False,False,False,241.3,10.1,13.1,31.8,6.7,False,
1,True,True,True,241.3,10.1,13.1,31.8,6.7,False,
2,True,False,False,241.3,10.1,13.1,31.8,6.7,False,
3,True,True,True,241.3,10.1,13.1,31.8,6.7,False,
4,False,True,True,241.3,10.1,13.1,31.8,6.7,False,


# Data Spliting : Test - Train - Val 

<span style="color:red">

- randomly split with shuffle=True  (Note: remember the random_state number to be able to reproduce the split) 
- k-cross validation? 

</span>

In [97]:
from sklearn.model_selection import train_test_split

# TrainVal vs. Test split
test_dataframe = merged_df[merged_df['TestSetId'].notna()]
#test_dataframe.head(100)

# Train vs. Val split
train_val_dataframe = merged_df[merged_df['TestSetId'].isna()]

# Prepare train val for training 
train_val_dataframe = merged_df[merged_df['Like'].notna()]
train_val_dataframe = train_val_dataframe.drop('TestSetId', axis=1)
# put Target (Like column) at the end 
like_column = train_val_dataframe.pop('Like')
train_val_dataframe['Like'] = like_column
train_val_dataframe['Like'] = train_val_dataframe['Like'].astype(int)
#train_val_dataframe.head(100)

X_train, X_val, y_train, y_val = \
  train_test_split(train_val_dataframe.iloc[:, :-1], train_val_dataframe.iloc[:, -1:],
                   test_size=0.1, 
                   shuffle=True,
                   random_state=3)

X_train.info()
#X_val.head()
#y_train.info()
#y_val.head()


<class 'pandas.core.frame.DataFrame'>
Index: 87642 entries, 34459 to 102791
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   HighCalories_Requested  87642 non-null  boolean
 1   HighProtein_Requested   87642 non-null  boolean
 2   LowFat_Requested        87642 non-null  boolean
 3   Calories                87642 non-null  float64
 4   FatContent              87642 non-null  float64
 5   SodiumContent           87642 non-null  float64
 6   CarbohydrateContent     87642 non-null  float64
 7   ProteinContent          87642 non-null  float64
dtypes: boolean(3), float64(5)
memory usage: 4.5 MB


# Training

Änderung: 
Bei meta_parameter_grid wurde hinzugefügt:
- parameter_grid_gaussianNB
- parameter_grid_linearSVC

In [98]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

# Model initialization
model_logistic_regression = LogisticRegression(max_iter=100)
model_random_forest = RandomForestClassifier()
model_gradient_boosting = GradientBoostingClassifier()
model_gaussianNB = GaussianNB()
model_linearSVC = LinearSVC(max_iter=10000)

# Data scaling
transform_scaler = StandardScaler()

# Dimensionality reduction (optional, based on PCA analysis)
transform_pca = PCA()

# Pipeline setup
pipeline = Pipeline([
    ("scaler", transform_scaler),
    # Uncomment the following line if PCA is necessary
    # ("pca", transform_pca),
    ("model", None)
])

# Hyperparameters for grid search
parameter_grid_preprocessing = {
    # "pca__n_components": [7, 8],  # Uncomment if using PCA
}

parameter_grid_gaussianNB = {
    "model": [model_gaussianNB],
    "model__var_smoothing": [1e-9, 1e-8, 1e-7]
}

parameter_grid_linearSVC = {
    "model": [model_linearSVC],
    "model__C": [0.1, 1, 10]  # Regularization parameter
}

parameter_grid_logistic_regression = {
    "model": [model_logistic_regression],
    "model__C": [0.1, 1, 10]  # Inverse regularization strength
}

parameter_grid_gradient_boosting = {
    "model": [model_gradient_boosting],
    "model__n_estimators": [10, 20, 30]
}

# Updated parameter grid for RandomForestClassifier
parameter_grid_random_forest = {
    "model": [RandomForestClassifier()],
    "model__n_estimators": [100, 200, 500],  # Number of trees in the forest
    "model__max_depth": [10, 20, 30, None],  # Maximum depth of each tree
    "model__min_samples_split": [2, 5, 10],  # Minimum number of samples required to split an internal node
    "model__min_samples_leaf": [1, 2, 4],    # Minimum number of samples required at a leaf node
    "model__max_features": ['auto', 'sqrt', 'log2'],  # Number of features to consider for the best split
    "model__bootstrap": [True, False]  # Whether bootstrap samples are used when building trees
}

# Combining all parameter grids
meta_parameter_grid = [
    parameter_grid_logistic_regression,
    parameter_grid_random_forest,
    parameter_grid_gradient_boosting,
    parameter_grid_gaussianNB,
    parameter_grid_linearSVC
]

# Adding preprocessing parameters to each model's grid
meta_parameter_grid = [{**parameter_grid_preprocessing, **model_grid}
                       for model_grid in meta_parameter_grid]

# GridSearchCV setup
search = GridSearchCV(
    pipeline,
    meta_parameter_grid, 
    scoring="balanced_accuracy",
    n_jobs=2, 
    cv=5,  # Number of folds for cross-validation
    error_score="raise"
)

# Training and grid search
# Replace X_train and y_train with your actual data
search.fit(X_train, y_train.values.ravel())

# Uncomment below to print the best parameters
print("Best parameters:", search.best_params_, "(CV score=%0.3f)" % search.best_score_)




# Evaluation

In [None]:
# evaluate performance of model on test set
print("Score on test set:", search.score(X_val, y_val.values.ravel()))

# prediction and show contingency table
ct = pd.crosstab(search.best_estimator_.predict(X_val), y_val.values.ravel(),
                 rownames=["pred"], colnames=["true"])
print(ct)

Score on test set: 0.5644447812507335
true     0     1
pred            
0     8287  1100
1      161   191


In [None]:
# (optional, if you're curious) for a detailed look on the performance of the different models
def get_search_score_overview():
  for c,s in zip(search.cv_results_["params"],search.cv_results_["mean_test_score"]):
      print(c, s)

print(get_search_score_overview())

{'model': LogisticRegression(max_iter=30), 'model__C': 0.1, 'pca__n_components': 7} 0.5020343640293786
{'model': LogisticRegression(max_iter=30), 'model__C': 0.1, 'pca__n_components': 8} 0.5019993593248523
{'model': LogisticRegression(max_iter=30), 'model__C': 1, 'pca__n_components': 7} 0.5020278195267609
{'model': LogisticRegression(max_iter=30), 'model__C': 1, 'pca__n_components': 8} 0.5020644830235109
{'model': LogisticRegression(max_iter=30), 'model__C': 10, 'pca__n_components': 7} 0.5020278195267609
{'model': LogisticRegression(max_iter=30), 'model__C': 10, 'pca__n_components': 8} 0.5020210425369774
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'model__max_depth': 12, 'model__n_estimators': 10, 'pca__n_components': 7} 0.5517508573803369
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'model__max_depth': 12, 'model__n_estimators': 10, 'pca__n_components': 8} 0.5517199251977019
{'model': RandomForestClassifier(max_depth=13, n_estimators=10), 'mode

In [None]:
# prepare test data for prediction
test_set_id = test_dataframe.pop('TestSetId')
test_dataframe = test_dataframe.drop('Like', axis=1)
test_dataframe.head()

Unnamed: 0,HighCalories_Requested,HighProtein_Requested,LowFat_Requested,Calories,FatContent,SodiumContent,CarbohydrateContent,ProteinContent
5,False,True,False,241.3,10.1,13.1,31.8,6.7
7,False,False,False,241.3,10.1,13.1,31.8,6.7
8,False,False,False,241.3,10.1,13.1,31.8,6.7
14,False,True,False,241.3,10.1,13.1,31.8,6.7
15,False,True,False,241.3,10.1,13.1,31.8,6.7


In [None]:
# prediction
model = search.best_estimator_ 
test_dataframe["Like"] = model.predict(test_dataframe)

#TODO: 

# prediction := List if Like 
# test_set_id := List of test ID

# write to CSV file in the same order  (den Code unten anpassenm)
# 1.ID  1.Like 
# 2.ID  2.Like

output = pd.DataFrame(test_dataframe["Like"])
output["id"] = test_set_id.astype("int")

output = output.rename(columns={'Like': 'prediction'})
output = output.reindex(columns=["id", "prediction"])

output.to_csv('recipe_prediction.csv', index=False)

For deployment

In [None]:
# Let's assume that our id column is the index of the dataframe

# print(test_dataframe)
#output = pd.DataFrame(test_dataframe["Like"])
# output = output.reset_index(drop=True)
#output["id"] = output.index + 1
#output = output.rename(columns={'Like': 'prediction'})
#output = output.reindex(columns=["id", "prediction"])
# output length
#print(len(output))
#output.to_csv('recipe_prediction.csv', index=False)