In [1]:
import opendatasets as od
import pandas as pd
import numpy as np

from concurrent.futures import ProcessPoolExecutor
from itertools import product
from tqdm import tqdm
from collections import defaultdict
from surprise import Dataset, Reader, SVD
from surprise import NormalPredictor
from surprise.model_selection import train_test_split as surprise_train_test_split
from surprise.accuracy import rmse
from surprise import accuracy

# Preprocessing

In [2]:
# Download the Food.com recipes-and-reviews dataset from Kaggle into the local "data" directory.
od.download_kaggle_dataset("https://www.kaggle.com/datasets/irkaal/foodcom-recipes-and-reviews", "data")

Skipping, found downloaded files in "data\foodcom-recipes-and-reviews" (use force=True to force download)


In [3]:
# Load the recipes table and the reviews table from the downloaded parquet files.
recipes = pd.read_parquet("data/foodcom-recipes-and-reviews/recipes.parquet")
reviews = pd.read_parquet("data/foodcom-recipes-and-reviews/reviews.parquet")

In [4]:
# Preview the first rows of the recipes table.
recipes.head()

Unnamed: 0,RecipeId,Name,AuthorId,AuthorName,CookTime,PrepTime,TotalTime,DatePublished,Description,Images,...,SaturatedFatContent,CholesterolContent,SodiumContent,CarbohydrateContent,FiberContent,SugarContent,ProteinContent,RecipeServings,RecipeYield,RecipeInstructions
0,38.0,Low-Fat Berry Blue Frozen Dessert,1533,Dancer,PT24H,PT45M,PT24H45M,1999-08-09 21:46:00+00:00,Make and share this Low-Fat Berry Blue Frozen ...,[https://img.sndimg.com/food/image/upload/w_55...,...,1.3,8.0,29.8,37.1,3.6,30.2,3.2,4.0,,"[Toss 2 cups berries with sugar., Let stand fo..."
1,39.0,Biryani,1567,elly9812,PT25M,PT4H,PT4H25M,1999-08-29 13:12:00+00:00,Make and share this Biryani recipe from Food.com.,[https://img.sndimg.com/food/image/upload/w_55...,...,16.6,372.8,368.4,84.4,9.0,20.4,63.4,6.0,,[Soak saffron in warm milk for 5 minutes and p...
2,40.0,Best Lemonade,1566,Stephen Little,PT5M,PT30M,PT35M,1999-09-05 19:52:00+00:00,This is from one of my first Good House Keepi...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.0,0.0,1.8,81.5,0.4,77.2,0.3,4.0,,"[Into a 1 quart Jar with tight fitting lid, pu..."
3,41.0,Carina's Tofu-Vegetable Kebabs,1586,Cyclopz,PT20M,PT24H,PT24H20M,1999-09-03 14:54:00+00:00,This dish is best prepared a day in advance to...,[https://img.sndimg.com/food/image/upload/w_55...,...,3.8,0.0,1558.6,64.2,17.3,32.1,29.3,2.0,4 kebabs,"[Drain the tofu, carefully squeezing out exces..."
4,42.0,Cabbage Soup,1538,Duckie067,PT30M,PT20M,PT50M,1999-09-19 06:19:00+00:00,Make and share this Cabbage Soup recipe from F...,[https://img.sndimg.com/food/image/upload/w_55...,...,0.1,0.0,959.3,25.1,4.8,17.7,4.3,4.0,,"[Mix everything together and bring to a boil.,..."


In [5]:
# Preview the first rows of the reviews table.
reviews.head()

Unnamed: 0,ReviewId,RecipeId,AuthorId,AuthorName,Rating,Review,DateSubmitted,DateModified
0,2,992,2008,gayg msft,5,better than any you can get at a restaurant!,2000-01-25 21:44:00+00:00,2000-01-25 21:44:00+00:00
1,7,4384,1634,Bill Hilbrich,4,"I cut back on the mayo, and made up the differ...",2001-10-17 16:49:59+00:00,2001-10-17 16:49:59+00:00
2,9,4523,2046,Gay Gilmore ckpt,2,i think i did something wrong because i could ...,2000-02-25 09:00:00+00:00,2000-02-25 09:00:00+00:00
3,13,7435,1773,Malarkey Test,5,easily the best i have ever had. juicy flavor...,2000-03-13 21:15:00+00:00,2000-03-13 21:15:00+00:00
4,14,44,2085,Tony Small,5,An excellent dish.,2000-03-28 12:51:00+00:00,2000-03-28 12:51:00+00:00


In [6]:
def check_users(train_users, test_users):
    """Verify that every test user also appears among the training users.

    Args:
        train_users: Iterable of user ids present in the training split.
        test_users: Iterable of user ids present in the test split.

    Raises:
        Exception: If at least one test user does not occur in ``train_users``;
            the message lists the offending user ids.
    """
    missing_users = set(test_users) - set(train_users)
    if missing_users:
        raise Exception(f"Users {missing_users} are not in train")
    
def get_train_and_test(reviews, test_size):
    """Split reviews into train/test so that every test user keeps a train review.

    Reviews are shuffled with a fixed seed and scanned once. A review is moved
    to the test split only while its author still has more than one review
    left for train. Scanning stops as soon as the test split holds at least
    ``len(reviews) * test_size`` rows.

    Args:
        reviews (pd.DataFrame): Interactions; must contain an ``AuthorId``
            column. Its index labels identify the rows moved to test.
        test_size (float): Target fraction of ``reviews`` to place in test.
            The target may not be reached when too few authors have more than
            one review; a target of zero (or less) selects nothing.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: ``(train_df, test_df)``. ``test_df``
        is indexed by the original row labels under the index name ``"Index"``.

    Note:
        Calls ``np.random.seed`` and therefore re-seeds NumPy's *global* RNG;
        any code that runs afterwards observes that seeded state.
    """
    np.random.seed(314159265)
    reviews_randomized = list(reviews.itertuples(index=True))
    np.random.shuffle(reviews_randomized)

    target_test_rows = len(reviews) * test_size
    # Number of reviews each author still has in train; one must always remain.
    reviews_left = reviews.groupby('AuthorId').size().to_dict()

    test_tuples = []
    for review in tqdm(reviews_randomized):
        # Checked before appending so that a zero/negative target selects nothing.
        if len(test_tuples) >= target_test_rows:
            break
        # Authors missing from the counts (e.g. NaN ids) are never moved to test.
        if reviews_left.get(review.AuthorId, 0) <= 1:
            continue
        reviews_left[review.AuthorId] -= 1
        test_tuples.append(review)

    if not test_tuples:
        # pd.DataFrame([]) has no "Index" column, so set_index("Index") would
        # raise KeyError; return an empty test split with the same columns.
        return reviews.copy(), reviews.iloc[0:0].rename_axis("Index")

    test_df = pd.DataFrame(test_tuples).set_index("Index")
    train_df = reviews.drop(test_df.index)
    return train_df, test_df

# Most Popular

In [7]:
# Inspeccionar las columnas relevantes
reviews_data = reviews[['RecipeId', 'AuthorId', 'Rating']]
recipes_data = recipes[['RecipeId']]

# Dividir el conjunto en entrenamiento y prueba por usuario
train, test = get_train_and_test(reviews_data, test_size=0.2)


 23%|██▎       | 328722/1401982 [00:00<00:01, 1036734.93it/s]


In [8]:
# "Most Popular" recommender based on interaction frequency.
def most_popular_by_interaction(train_data, top_n=10):
    """Rank recipes by how many training interactions they received.

    Args:
        train_data (pd.DataFrame): Training interactions with a ``RecipeId`` column.
        top_n (int): How many recipe ids to return.

    Returns:
        list: Up to ``top_n`` recipe ids, most frequently reviewed first.
    """
    interaction_counts = train_data['RecipeId'].value_counts()
    top_recipe_ids = interaction_counts.index[:top_n]
    return top_recipe_ids.tolist()

def most_popular_by_positive_interaction(train_data, top_n=10, threshold = 4):
    """Rank recipes by their number of positive training interactions.

    Args:
        train_data (pd.DataFrame): Training interactions with ``RecipeId`` and
            ``Rating`` columns.
        top_n (int): How many recipe ids to return.
        threshold: Minimum rating (inclusive) for an interaction to count as
            positive.

    Returns:
        list: Up to ``top_n`` recipe ids, ordered by descending count of
        ratings that are ``>= threshold``.
    """
    is_positive = train_data['Rating'] >= threshold
    positive_counts = train_data.loc[is_positive, 'RecipeId'].value_counts()
    return positive_counts.index[:top_n].tolist()

def most_popular_by_rating(train_data, top_n=10, min_ratings=1):
    """Rank recipes by their mean training rating.

    Args:
        train_data (pd.DataFrame): Training interactions with ``RecipeId`` and
            ``Rating`` columns.
        top_n (int): How many recipe ids to return.
        min_ratings (int): Minimum number of ratings a recipe needs to be
            eligible. The default of 1 keeps every rated recipe; raise it to
            stop recipes with a handful of perfect scores from dominating
            the ranking.

    Returns:
        list: Up to ``top_n`` recipe ids ordered by descending mean rating.
    """
    ratings_by_recipe = train_data.groupby('RecipeId')['Rating']
    avg_ratings = ratings_by_recipe.mean()
    if min_ratings > 1:
        # Drop recipes whose mean is backed by too few ratings.
        avg_ratings = avg_ratings[ratings_by_recipe.count() >= min_ratings]
    popular_items = avg_ratings.sort_values(ascending=False).head(top_n).index.tolist()
    return popular_items

# Evaluation metrics: NDCG@10, Recall@10, Precision@10
def calculate_metrics(test_data, recommendations, top_n=10, threshold = 4):
    """Score one fixed recommendation list against every user of the test split.

    The same ``recommendations`` list is evaluated for all users. A test item
    is relevant for a user when that user rated it ``>= threshold``.

    Args:
        test_data (pd.DataFrame): Test interactions with ``AuthorId``,
            ``RecipeId`` and ``Rating`` columns.
        recommendations (list): Recipe ids ordered from best to worst.
        top_n (int): Number of leading recommendations to evaluate.
        threshold: Minimum rating (inclusive) that makes an item relevant.

    Returns:
        dict: Means over users under the keys ``'Precision@10'``,
        ``'Recall@10'`` and ``'NDCG@10'`` (the key names are fixed and do not
        follow ``top_n``). Users without relevant items contribute 0 to
        recall and NDCG.

    Note:
        A later cell of this notebook defines another ``calculate_metrics``
        with a different signature, which replaces this one once it runs.
    """
    scores = {'Precision@10': [], 'Recall@10': [], 'NDCG@10': []}

    for _user, interactions in test_data.groupby('AuthorId'):
        # Items this user rated at or above the relevance threshold.
        liked = interactions.loc[interactions['Rating'] >= threshold, 'RecipeId']
        relevant = set(liked.tolist())

        shown = recommendations[:top_n]
        n_hits = len(relevant.intersection(shown))

        scores['Precision@10'].append(n_hits / top_n)

        if relevant:
            scores['Recall@10'].append(n_hits / len(relevant))
        else:
            scores['Recall@10'].append(0)

        # Binary-gain DCG over the shown list, normalised by the best achievable DCG.
        gains = [1 / np.log2(rank + 2) for rank, item in enumerate(shown) if item in relevant]
        ideal_gains = [1 / np.log2(rank + 2) for rank in range(min(len(relevant), top_n))]
        dcg = sum(gains)
        idcg = sum(ideal_gains)
        scores['NDCG@10'].append(dcg / idcg if idcg > 0 else 0)

    return {metric: np.mean(values) for metric, values in scores.items()}

In [9]:
# Build the three popularity-based top-10 lists from the training split.
popular_by_interaction = most_popular_by_interaction(train, top_n=10)
popular_by_positive_interaction = most_popular_by_positive_interaction(train, top_n=10)
popular_by_rating = most_popular_by_rating(train, top_n=10)

# Printed in this order: by interaction count, by mean rating, by positive-interaction count.
metrics = calculate_metrics(test_data=test, recommendations=popular_by_interaction, top_n=10, threshold=4)
print(metrics)
metrics = calculate_metrics(test_data=test, recommendations=popular_by_rating, top_n=10, threshold=4)
print(metrics)
metrics = calculate_metrics(test_data=test, recommendations=popular_by_positive_interaction, top_n=10, threshold=4)
print(metrics)

{'Precision@10': 0.004580316208920691, 'Recall@10': 0.015601049603278613, 'NDCG@10': 0.01108472982218702}
{'Precision@10': 1.928103174943228e-05, 'Recall@10': 4.6997855772514054e-05, 'NDCG@10': 4.204886709135507e-05}
{'Precision@10': 0.004980933201936672, 'Recall@10': 0.016615683072420638, 'NDCG@10': 0.011586892039924903}


# Random

The `get_top_n` helper below follows the top-N recommendation example from the Surprise FAQ: https://surprise.readthedocs.io/en/stable/FAQ.html

In [10]:
def get_top_n(predictions, n=10):
    """Group predictions by user and keep each user's ``n`` best-scored items.

    Args:
        predictions: Iterable of 5-field records that unpack as
            ``(uid, iid, true_rating, estimate, details)``, such as the list
            returned by an algorithm's ``test`` method.
        n (int): Maximum number of items kept per user. Default is 10.

    Returns:
        defaultdict(list): Maps each user (raw) id to a list of
        ``(raw item id, estimated rating)`` tuples sorted by descending
        estimate and truncated to ``n`` entries. Looking up an unseen user
        yields an empty list.
    """
    by_user = defaultdict(list)
    for uid, iid, _true_r, est, _details in predictions:
        by_user[uid].append((iid, est))

    # Only existing keys are reassigned, so the mapping keeps its size while iterating.
    for uid in by_user:
        ranked = sorted(by_user[uid], key=lambda pair: pair[1], reverse=True)
        by_user[uid] = ranked[:n]

    return by_user

def calculate_metrics(model, test_df, n=10, threshold=4):
    """Evaluate a model with Precision@N, Recall@N and NDCG@N per test user.

    For every user in ``test_df`` the model scores all recipes that occur in
    ``test_df`` (via ``model.test``), and the ``n`` highest estimates form
    that user's recommendation list. A test item is relevant for a user when
    that user rated it ``>= threshold``.

    Parameters:
        model: Object exposing ``test(list_of_(user, item, rating)_tuples)``
            that returns predictions accepted by ``get_top_n``.
        test_df (DataFrame): Test interactions; must include 'AuthorId',
            'RecipeId' and 'Rating'.
        n (int): Number of top recommendations to consider.
        threshold (int): Minimum rating to consider an item relevant.

    Returns:
        dict: Mean 'Precision@N', 'Recall@N' and 'NDCG@N' over users. Users
        without relevant items contribute 0 to recall and NDCG.

    Note:
        This definition shares its name with the popularity-based
        ``calculate_metrics`` defined earlier in the notebook and replaces it
        once this cell runs. Every user is scored against every test recipe,
        so the amount of work grows with users x recipes.
    """
    candidate_recipes = test_df.RecipeId.unique()
    precisions, recalls, ndcgs = [], [], []

    for user, interactions in tqdm(test_df.groupby('AuthorId')):
        # Pair the user with every candidate recipe; the 0 is a placeholder true rating.
        user_item_pairs = [(user, recipe_id, 0) for recipe_id in candidate_recipes]
        ranked = get_top_n(model.test(user_item_pairs), n=n)[user]
        recommended = [recipe_id for recipe_id, _estimate in ranked]

        # Items this user rated at or above the relevance threshold.
        liked = interactions.loc[interactions['Rating'] >= threshold, 'RecipeId']
        relevant = set(liked.tolist())

        n_hits = len(relevant.intersection(recommended))

        precisions.append(n_hits / n)

        if relevant:
            recalls.append(n_hits / len(relevant))
        else:
            recalls.append(0)

        # Binary-gain DCG over the recommended list, normalised by the ideal DCG.
        gains = [1 / np.log2(rank + 2) for rank, item in enumerate(recommended) if item in relevant]
        ideal_gains = [1 / np.log2(rank + 2) for rank in range(min(len(relevant), n))]
        dcg = sum(gains)
        idcg = sum(ideal_gains)
        ndcgs.append(dcg / idcg if idcg > 0 else 0)

    return {
        'Precision@N': np.mean(precisions),
        'Recall@N': np.mean(recalls),
        'NDCG@N': np.mean(ndcgs),
    }


In [11]:
# Inspeccionar las columnas relevantes
reviews_data = reviews[['AuthorId', 'RecipeId', 'Rating']]

# Dividir el conjunto en entrenamiento y prueba por usuario
train_df, test_df = get_train_and_test(reviews_data, test_size=0.2)

reader = Reader(rating_scale=(0, 5))

train_data = Dataset.load_from_df(train_df, reader)
trainset = train_data.build_full_trainset()

testset = list(test_df.itertuples(index=False, name=None))

 23%|██▎       | 328722/1401982 [00:00<00:00, 1097776.44it/s]


In [12]:
# Random baseline: fit Surprise's NormalPredictor on the training set.
randomModel = NormalPredictor()
randomModel.fit(trainset)

<surprise.prediction_algorithms.random_pred.NormalPredictor at 0x16f77a77e50>

In [13]:
# Evaluate the random baseline on the test split.
# NOTE: the recorded run of this cell took about 8 hours (see the progress bar below).
metrics = calculate_metrics(randomModel, test_df, n=10, threshold=4)

100%|██████████| 46678/46678 [8:01:34<00:00,  1.62it/s]  


In [14]:
# Display the metrics of the random baseline.
metrics

{'Precision@N': 0.00021637602296585117,
 'Recall@N': 0.0005861152964504721,
 'NDCG@N': 0.00032347558698440763}

# FunkSVD

In [17]:
# Inspeccionar las columnas relevantes
reviews_data = reviews[['AuthorId', 'RecipeId', 'Rating']]

# Dividir el conjunto en entrenamiento y prueba por usuario
train_df, test_df = get_train_and_test(reviews_data, test_size=0.2)

reader = Reader(rating_scale=(0, 5))

train_data = Dataset.load_from_df(train_df, reader)
trainset = train_data.build_full_trainset()

testset = list(test_df.itertuples(index=False, name=None))

svdModel = SVD()
svdModel.fit(trainset)

 23%|██▎       | 328722/1401982 [00:00<00:00, 1127808.97it/s]


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x16fe35ae9b0>

In [18]:
# Evaluate the SVD model on the test split.
# NOTE: the recorded run of this cell took about 10 hours (see the progress bar below).
metrics = calculate_metrics(svdModel, test_df, n=10, threshold=4)

100%|██████████| 46678/46678 [10:02:58<00:00,  1.29it/s]  


In [19]:
# Display the metrics of the SVD model.
metrics

{'Precision@N': 0.0007005441535627061,
 'Recall@N': 0.0020817465433371194,
 'NDCG@N': 0.0015608756797149132}