### All the choices in this code are based on our data mining efforts.

In [1]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans
from itertools import chain

In [None]:
# Load the user interactions and the recipes from the Food.com Kaggle dataset:
# https://www.kaggle.com/datasets/shuyangli94/food-com-recipes-and-user-interactions
# read_csv already returns a brand-new DataFrame, so no defensive copy is needed.
usr_rates = pd.read_csv('RAW_interactions.csv')
rcp = pd.read_csv('RAW_recipes.csv')

We aim to separate main dishes from desserts, more specifically savory from sweet. Since our recommendation algorithm relies on the distance between users to suggest a dish or a dessert, it seems relevant to distinguish between the two categories.

To achieve this separation, we select the most frequent tags in the dataset:

For main dishes: main-dish, meat, vegetables, and seafood
For desserts: desserts, cookies-and-brownies, and chocolate

In [None]:
# Split the recipes into main dishes and desserts according to the chosen tags.
# NOTE(review): every pattern is tested with `in` against the 'tags' value; if that
# value is the raw string read from the CSV this is a substring test (so 'meat'
# would also match longer tags that contain it) -- confirm this is intended.
def _matches_any(patterns):
    """Return a predicate telling whether a 'tags' value contains one of `patterns`."""
    return lambda tags: any(pattern in tags for pattern in patterns)


rcp_main_dish = rcp[rcp['tags'].apply(
    _matches_any(('main-dish', 'meat', 'vegetables', 'seafood')))]
rcp_dessert = rcp[rcp['tags'].apply(
    _matches_any(('desserts', 'cookies-and-brownies', 'chocolate')))]

In [None]:
# Recipe IDs of each category, kept as sets so membership tests are cheap
main_dish_ids, dessert_ids = (
    set(frame['id']) for frame in (rcp_main_dish, rcp_dessert))

def get_type_of_dish(recipe_id):
    """Return the category of a recipe: 'dessert', 'main' or 'none'.

    The ID is looked up in the module-level sets `dessert_ids` and
    `main_dish_ids`. Desserts are checked first, so a recipe present in both
    sets is reported as a dessert.
    """
    categories = (('dessert', dessert_ids), ('main', main_dish_ids))
    for label, members in categories:
        if recipe_id in members:
            return label
    # The ID belongs to neither category
    return 'none'


# Label every recipe with its category in a new 'type_of_dish' column
rcp['type_of_dish'] = rcp['id'].map(get_type_of_dish)

# Discard the recipes that are neither a dessert nor a main dish
rcp = rcp.loc[rcp['type_of_dish'].ne('none')]

The reviews and dates are not important for our purposes, so we drop them.

We then gather each user's ratings into lists, so that they can be binarized per user into -1 (dislike) and 1 (like).

In [None]:
# Reviews and dates are not used; collapse what remains into one row per user,
# each remaining column holding the list of that user's values
usr_rates = (
    usr_rates
    .drop(columns=['review', 'date'])
    .groupby('user_id')
    .agg(list)
)

In [None]:
# Rebuild both ID sets from the 'type_of_dish' labels, so each remaining recipe
# ends up in exactly one of the two sets
dessert_ids = set(rcp.loc[rcp['type_of_dish'].eq('dessert'), 'id'])
main_dish_ids = set(rcp.loc[rcp['type_of_dish'].eq('main'), 'id'])

def get_list_of_desserts(recipe_id):
    """Return, in their original order, the IDs of `recipe_id` found in `dessert_ids`.

    `recipe_id` is an iterable of recipe IDs; `dessert_ids` is the module-level
    set of dessert recipe IDs.
    """
    return [rid for rid in recipe_id if rid in dessert_ids]

def get_list_of_mains(recipe_id):
    """Return, in their original order, the IDs of `recipe_id` found in `main_dish_ids`.

    `recipe_id` is an iterable of recipe IDs; `main_dish_ids` is the module-level
    set of main-dish recipe IDs.
    """
    return [rid for rid in recipe_id if rid in main_dish_ids]


# For every user, split the rated recipe IDs into a dessert list and a
# main-dish list, stored in 'recipes_dessert' and 'recipes_main'
rated_recipes = usr_rates['recipe_id']
usr_rates['recipes_dessert'] = rated_recipes.map(get_list_of_desserts)
usr_rates['recipes_main'] = rated_recipes.map(get_list_of_mains)

Binarizing the ratings using K-Means

In [None]:
def cluster_ratings(ratings):
    """Binarize one user's ratings into 1 (like) and -1 (dislike).

    Parameters
    ----------
    ratings : list
        The ratings given by a single user.

    Returns
    -------
    list of int
        One value per input rating, in the same order. When the user gave at
        most one distinct rating, every entry is 1. Otherwise the ratings are
        split into two K-Means clusters; the cluster with the higher mean
        rating is mapped to 1 and the other one to -1.
    """
    # Nothing to cluster: identical (or no) ratings are all considered "liked".
    # Returning early avoids building an estimator that would never be fitted.
    if len(set(ratings)) <= 1:
        return [1 for _ in ratings]

    # K-Means expects a 2-D array: one row per rating, a single feature
    ratings_array = np.array(ratings).reshape(-1, 1)
    km = KMeans(n_clusters=2, verbose=0, random_state=42)
    km.fit(ratings_array)

    # Cluster numbering is arbitrary, so find out which label is the "high" one
    cluster_0_mean = np.mean(ratings_array[km.labels_ == 0])
    cluster_1_mean = np.mean(ratings_array[km.labels_ == 1])
    high_label = 0 if cluster_0_mean > cluster_1_mean else 1

    # Assign 1 to the higher-rated cluster and -1 to the other
    return [1 if label == high_label else -1 for label in km.labels_]


# Add a 'ratings_binary' column holding each user's binarized ratings
# (1 = like, -1 = dislike), aligned with the 'rating' lists
usr_rates['ratings_binary'] = usr_rates['rating'].map(cluster_ratings)

In [None]:
# Create the per-category binary rating columns (desserts and main dishes),
# each starting with a separate empty list for every user
for binary_col in ('ratings_dessert_binary', 'ratings_main_binary'):
    usr_rates[binary_col] = usr_rates['rating'].map(lambda _ratings: [])

In [None]:
# Fill in the binary rating columns for desserts and main dishes.
def _ratings_for_subset(recipe_ids, binary_ratings, subset):
    """Return the binary ratings of the recipes of `recipe_ids` that are in `subset`.

    `recipe_ids` and `binary_ratings` are aligned position by position, so each
    kept rating is the one given at that very position. The result therefore
    lines up with `subset` when `subset` was obtained by filtering `recipe_ids`
    in order.
    """
    wanted = set(subset)
    return [rate for rid, rate in zip(recipe_ids, binary_ratings) if rid in wanted]


# The columns are assigned from scratch (nothing is appended to existing lists),
# so running this cell again gives the same result instead of duplicating entries.
for recipes_col, binary_col in (('recipes_dessert', 'ratings_dessert_binary'),
                                ('recipes_main', 'ratings_main_binary')):
    usr_rates[binary_col] = pd.Series(
        [_ratings_for_subset(recipe_ids, binary_ratings, subset)
         for recipe_ids, binary_ratings, subset in zip(usr_rates['recipe_id'],
                                                       usr_rates['ratings_binary'],
                                                       usr_rates[recipes_col])],
        index=usr_rates.index,
        dtype=object,
    )

Separating main dishes and desserts into two different DataFrames

In [None]:
# Main dishes: keep the recipe and binary-rating list columns, give them the
# generic names used by the later steps, and turn the user index back into a
# regular column
usr_rates_main_dish = (
    usr_rates[['recipes_main', 'ratings_main_binary']]
    .rename(columns={'recipes_main': 'recipes_id',
                     'ratings_main_binary': 'rates'})
    .reset_index()
)

# Desserts: same treatment, so both frames share one format
usr_rates_dessert = (
    usr_rates[['recipes_dessert', 'ratings_dessert_binary']]
    .rename(columns={'recipes_dessert': 'recipes_id',
                     'ratings_dessert_binary': 'rates'})
    .reset_index()
)

Creation of main dish and dessert matrices, with each user as a row and each recipe as a column, resulting in user/recipe vectors

In [None]:
def matrix(usr_rates):
    """Build the dense user x recipe rating matrix.

    `usr_rates` must provide the columns 'user_id', 'recipes_id' (a list of
    recipe IDs per user) and 'rates' (the matching list of ratings). The result
    is a DataFrame indexed by user, with one column per distinct recipe in
    ascending order; cells without a rating hold 0.
    """
    user_ids = usr_rates['user_id']

    # Distinct recipe IDs over all users, skipping empty recipe lists
    recipe_ids = sorted(
        {recipe for recipes in usr_rates['recipes_id'] if recipes for recipe in recipes})

    # Column position of every recipe
    column_of = dict(zip(recipe_ids, range(len(recipe_ids))))

    # Start from "no rating" everywhere, then write each user's ratings
    ratings = np.zeros((len(user_ids), len(recipe_ids)))
    row = 0
    for user_recipes, user_rates in zip(usr_rates['recipes_id'], usr_rates['rates']):
        for recipe, rate in zip(user_recipes, user_rates):
            ratings[row, column_of[recipe]] = rate
        row += 1

    return pd.DataFrame(ratings, index=user_ids, columns=recipe_ids)

In [None]:
# Build one user x recipe matrix per category (main dishes first, then desserts)
df_matrix_main_dish, df_matrix_dessert = map(
    matrix, (usr_rates_main_dish, usr_rates_dessert))

After analyzing the data through network graphs, we observed the presence of isolated users. These isolated users are individuals who have only rated recipes that no one else has rated. Therefore, we decided to remove them from our datasets.

Additionally, we made the decision to remove:

- All recipes without any rating (fewer than one)
- All users with fewer than two ratings
- All users without at least one positive rating

In the PP() function, we loop until the dimensions of our DataFrames remain unchanged, ensuring that all the above rules are satisfied.

In [None]:
def drop_isolated_users(df_matrix):
    """Remove the users who share no rated recipe with anybody else.

    Parameters
    ----------
    df_matrix : pandas.DataFrame
        User x recipe matrix in which 0 means "not rated".

    Returns
    -------
    pandas.DataFrame
        `df_matrix` without the rows of isolated users, i.e. users who rated
        no recipe that has at least two ratings. Columns are left untouched.
    """
    # Boolean "has a rating" mask, computed once for the whole matrix
    rated = df_matrix.to_numpy() != 0

    # Active recipes are those rated by at least two users
    active_cols = rated.sum(axis=0) >= 2

    # A user is connected as soon as one of their ratings is on an active recipe;
    # a single vectorised reduction replaces a Python loop over every column
    not_isolated_users = rated[:, active_cols].any(axis=1)

    # Drop the remaining (isolated) users by label
    isolated_users = df_matrix.index[~not_isolated_users]
    return df_matrix.drop(index=isolated_users)

In [None]:
def PP(df_matrix):
    """Filter a user x recipe matrix until all pruning rules hold at once.

    After an initial pass keeping users with at least two ratings, each round
    keeps recipes with at least one rating, users with at least two ratings,
    users with at least one positive (== 1) rating, and finally removes isolated
    users via `drop_isolated_users`. Rounds are repeated until a round leaves
    the shape of the matrix unchanged; the filtered matrix is returned.
    """
    previous_shape = df_matrix.shape

    # Initial pass: users need at least two ratings
    filtered = df_matrix[df_matrix.ne(0).sum(axis=1) >= 2]

    stable = False
    while not stable:
        # Recipes that still have at least one rating
        filtered = filtered.loc[:, filtered.ne(0).any(axis=0)]

        # Users that still have at least two ratings
        filtered = filtered[filtered.ne(0).sum(axis=1) >= 2]

        # Users with at least one positive rating
        filtered = filtered[filtered.eq(1).any(axis=1)]

        # Users connected to somebody else through a shared recipe
        filtered = drop_isolated_users(filtered)

        # A round that removes nothing means every rule is satisfied
        stable = filtered.shape == previous_shape
        previous_shape = filtered.shape

    return filtered

In [None]:
# Run the iterative filtering (PP) on both matrices: main dishes, then desserts
df_filt_matrix_main_dish, df_filt_matrix_dessert = map(
    PP, (df_matrix_main_dish, df_matrix_dessert))

From our matrices, we go back to the initial long format, which is the format used by our distance-computation algorithm.

In [None]:
def matrix_toFormat(df_matrix):
    """Convert a user x recipe matrix back to long (user, recipe, rate) rows.

    Parameters
    ----------
    df_matrix : pandas.DataFrame
        Matrix indexed by user, with one column per recipe; 0 means "no rating".

    Returns
    -------
    pandas.DataFrame
        Columns 'user_id', 'recipe_id' and 'rate', one row per non-zero cell,
        ordered user by user and, within a user, in column order.
    """
    values = df_matrix.to_numpy()

    # Positions of the rated cells; np.nonzero reports them in row-major order,
    # i.e. the same order as walking the matrix one user at a time
    row_pos, col_pos = np.nonzero(values != 0)

    # Positional lookups replace the per-user label lookups, so this also works
    # when index labels are not unique
    return pd.DataFrame({
        'user_id': df_matrix.index.to_numpy()[row_pos],
        'recipe_id': df_matrix.columns.to_numpy()[col_pos],
        'rate': values[row_pos, col_pos],
    })

In [None]:
# Turn both filtered matrices back into long (user_id, recipe_id, rate) frames
main_dishes, desserts = map(
    matrix_toFormat, (df_filt_matrix_main_dish, df_filt_matrix_dessert))

In [None]:
# Write the filtered main-dish and dessert ratings to CSV files
# (no `index=False` here, so the row index is written as well)
main_dishes.to_csv(path_or_buf='PP_user_main_dishes.csv')
desserts.to_csv(path_or_buf='PP_user_desserts.csv')

In our application, we want to display the names, steps, and ingredients of each recipe. Therefore, we retrieve all this information using our final recipe IDs

In [None]:
# Collect every recipe ID that survived the filtering, in either category
ids = set(main_dishes['recipe_id']).union(set(desserts['recipe_id']))

# Keep only those recipes, and only the columns the application displays.
# Selecting the four columns directly replaces the former drop-then-reorder
# steps; the `set_index('id')` call was removed because its result was
# discarded (it had no effect) and 'id' must stay a regular column.
recipes_data = rcp.loc[rcp['id'].isin(ids),
                       ['id', 'name', 'steps', 'ingredients']].copy()

In [None]:
# Write the displayed recipe information to a CSV file, without the row index
recipes_data.to_csv(path_or_buf='PP_recipes_data.csv', index=False)