# Recommender Pipeline

* Synthetische User-Profile
* Ausfüllen der BioBank-Datenstruktur (mit LLM)
* Datenstruktur (mit LLM) in User-Item-Interaktionen umwandeln
* Mithilfe der User-Item-Interaktionen Lebensmittel empfehlen





### About this Dataset
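This data was collected from https://www.allrecipes.com/.
(Note: the code cells below download the Kaggle dataset `shuyangli94/food-com-recipes-and-user-interactions` (Food.com), so this description may refer to a different dataset — TODO confirm.)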
Features include:

* `group`: grouping by origin of the recipe, consisting of 3 (or 2) groups separated by dots
* `name`: the name of the recipe
* `rating`: the rating of the recipe
* `n_rater`: number of participants who rated the recipe
* `n_reiviewer`: number of participants who reviewed the recipe
* `summary`: blurb about the recipe
* `process`: summary of the recipe's preparation process
* `ingredient`: ingredients of the recipe

In [1]:
# !pip install pandas
# !pip install --upgrade kagglehub
# !pip install -U LibRecommender
# !pip install keras==2.12.0 tensorflow==2.12.0
#
# !pip show LibRecommender

In [2]:
import kagglehub
import pandas as pd
from zipfile import ZipFile
import tensorflow as tf
import os


# Fetch the Kaggle dataset through kagglehub. `path` is the directory that
# holds the dataset files; later cells join file names onto it to read the
# interaction CSVs.
path = kagglehub.dataset_download("shuyangli94/food-com-recipes-and-user-interactions")

print("Path to dataset files:", path)


  from .autonotebook import tqdm as notebook_tqdm
2025-03-13 12:29:58.829261: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-03-13 12:29:58.903058: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-13 12:29:59.041690: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-13 12:29:59.042202: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Path to dataset files: /home/mw/.cache/kagglehub/datasets/shuyangli94/food-com-recipes-and-user-interactions/versions/2


In [3]:
def updateLabels(interactions_data):
    """Cast the ``label`` column to 64-bit integers, in place.

    Args:
        interactions_data: DataFrame that contains a ``label`` column.

    Returns:
        The same DataFrame object that was passed in (it is mutated, not
        copied), so callers may use either the return value or the argument.

    Raises:
        KeyError: if there is no ``label`` column.
        ValueError: if a label cannot be converted to an integer (e.g. NaN).
    """
    # Vectorised cast instead of calling int() once per row through apply().
    interactions_data["label"] = interactions_data["label"].astype("int64")
    return interactions_data


In [4]:
def rename_and_drop_columns(interactions_data):
    """Reduce an interactions frame to the columns ``user``, ``item``, ``label``.

    The frame is modified in place: ``user_id``/``recipe_id``/``rating`` are
    renamed to ``user``/``item``/``label``, every other column is removed, and
    the labels are converted to integers via ``updateLabels``.

    Args:
        interactions_data: DataFrame with (at least) the columns ``user_id``,
            ``recipe_id`` and ``rating``.

    Returns:
        The same (mutated) DataFrame object.
    """
    interactions_data.rename(
        columns={"user_id": "user", "recipe_id": "item", "rating": "label"},
        inplace=True,
    )

    # Drop all surplus columns in one call. Dropping them one by one while
    # looping over the column index fails with a KeyError as soon as a column
    # name occurs twice (the first drop already removes both occurrences).
    kept_columns = ("user", "item", "label")
    surplus_columns = [
        column for column in interactions_data.columns if column not in kept_columns
    ]
    interactions_data.drop(columns=surplus_columns, inplace=True)

    updateLabels(interactions_data)

    return interactions_data

In [5]:
# 2) Combine the existing interaction files, because otherwise an
#    out-of-bounds error occurs.
train_data_path = os.path.join(path, "interactions_train.csv")
eval_data_path = os.path.join(path, "interactions_validation.csv")
test_data_path = os.path.join(path, "interactions_test.csv")

train_data, eval_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, eval_data_path, test_data_path)
)

# The splits have to be merged so that the data can be filtered and then be
# split again in the same ratio.
data = rename_and_drop_columns(
    pd.concat([train_data, eval_data, test_data], ignore_index=True)
)

In [6]:
# Distinct rating values present in the combined interactions.
all_unique_labels = data["label"].unique()
all_unique_labels

array([5, 4, 3, 1, 0, 2])

In [7]:
# All distinct values in the "label" column and how often each occurs
label_counts = data["label"].value_counts()
print("Unterschiedliche Inhalte in 'label' und deren Häufigkeit:")
print(label_counts)


Unterschiedliche Inhalte in 'label' und deren Häufigkeit:
label
5    530417
4    131846
3     27058
0     18000
2      7336
1      3722
Name: count, dtype: int64


In [8]:
# Sanity check: column labels of the combined frame after renaming/dropping.
data.columns

Index(['user', 'item', 'label'], dtype='object')

In [9]:
# Preview the first rows of the combined interactions.
data.head()

Unnamed: 0,user,item,label
0,2046,4684,5
1,2046,517,5
2,1773,7435,5
3,1773,278,4
4,2046,3431,5


In [10]:
# Spot check: user value stored under index label 0.
data['user'][0]

2046

In [11]:
# Spot check: item value stored under index label 0.
data['item'][0]

4684

In [12]:
# Spot check: label value stored under index label 0.
data['label'][0]

5

In [13]:
import pandas as pd

# Minimum number of interactions an item / a user must have to be kept.
threshold = 30

# 1) Keep only items with at least `min_item_interactions` interactions.
min_item_interactions = threshold
item_counts = data["item"].value_counts()
items_to_keep = item_counts.loc[lambda counts: counts >= min_item_interactions].index

item_mask = data["item"].isin(items_to_keep)
data_filtered = data.loc[item_mask]

# 2) Keep only users with at least `min_user_interactions` interactions,
#    counted on the item-filtered data from step 1.
min_user_interactions = threshold
user_counts = data_filtered["user"].value_counts()
users_to_keep = user_counts.loc[lambda counts: counts >= min_user_interactions].index

user_mask = data_filtered["user"].isin(users_to_keep)
data_filtered = data_filtered.loc[user_mask]

# Check the result
print("Datensatz vor Filterung:", data.shape)
print("Datensatz nach Filterung:", data_filtered.shape)
print(data_filtered.head())


Datensatz vor Filterung: (718379, 3)
Datensatz nach Filterung: (71370, 3)
      user   item  label
164  11297   5478      4
245   4470    834      5
300   6357  11365      5
349   6357  11642      5
365   9869   2886      5


In [14]:
from libreco.data import random_split, DatasetPure

# Split the filtered interactions into three parts with ratios 0.8 / 0.1 / 0.1.
# NOTE(review): this rebinds train_data / eval_data / test_data, which earlier
# held the raw CSV frames.
train_data, eval_data, test_data = random_split(data_filtered, multi_ratios=[0.8, 0.1, 0.1])

# Convert each part with the matching DatasetPure builder; build_trainset also
# returns `data_info`, which is printed below.
train_data, data_info = DatasetPure.build_trainset(train_data)
eval_data = DatasetPure.build_evalset(eval_data)
test_data = DatasetPure.build_testset(test_data)
print(data_info)

n_users: 1136, n_items: 2455, data density: 2.0472 %


In [15]:
# Sanity check: column labels of the filtered frame.
data_filtered.columns

Index(['user', 'item', 'label'], dtype='object')

In [16]:
# Preview the first rows of the filtered interactions.
data_filtered.head()

Unnamed: 0,user,item,label
164,11297,5478,4
245,4470,834,5
300,6357,11365,5
349,6357,11642,5
365,9869,2886,5


In [17]:
# Number of distinct users left in the filtered interactions
data_filtered['user'].nunique()

1136

In [76]:
def create_user_item_matrix(df):
    """Build a dense user x item matrix of labels centred on 3.

    Rows follow the sorted ``user`` values and columns the sorted ``item``
    values of ``df``. Each cell holds ``label - 3``; user/item pairs without
    an interaction become 0, which makes them indistinguishable from an actual
    label of 3.

    Args:
        df: DataFrame with the columns ``user``, ``item`` and ``label`` and at
            most one row per (user, item) pair (``DataFrame.pivot`` raises a
            ValueError on duplicates).

    Returns:
        A 2-D NumPy array of shape (number of users, number of items).
    """
    ratings_wide = df.pivot(index='user', columns='item', values='label')
    # Shift so that a label of 3 sits at 0, then fill the missing pairs with 0.
    centred = ratings_wide.sub(3).fillna(0)
    return centred.to_numpy()

# Build the dense matrix from the filtered interactions and show its shape
# (rows = users, columns = items).
user_item_matrix = create_user_item_matrix(data_filtered)
user_item_matrix.shape

(1136, 2455)

### Uncertainty-sampling approach

In [77]:
import numpy as np
from scipy.sparse.linalg import svds
from sklearn.neighbors import NearestNeighbors
import random

def get_rating(item_id, default_value=None):
    """Stand-in oracle that supplies the rating for a queried item.

    Args:
        item_id: Identifier of the queried item (not used by this stub).
        default_value: If not ``None``, this value is returned unchanged.

    Returns:
        ``default_value`` when one is given, otherwise a random integer drawn
        uniformly from -2..2 (both ends included).
    """
    return random.randint(-2, 2) if default_value is None else default_value

def active_learning_recommendation(user_item_matrix, n_iterations=10, n_factors=20, k_neighbors=5, latent_neighbor=True, default_rating=None):
    """Simulate uncertainty-sampling rating elicitation for one new user.

    The matrix is factorised with a truncated SVD. Starting from an empty
    profile, each iteration looks up the nearest existing users, reconstructs
    their ratings from the latent factors, and queries the not-yet-rated item
    whose reconstructed ratings vary most across those neighbours. The answer
    comes from ``get_rating`` and is folded back into the new user's profile.

    Args:
        user_item_matrix: 2-D array (users x items) of centred ratings.
        n_iterations: Maximum number of items to query.
        n_factors: Number of singular values/vectors requested from ``svds``.
        k_neighbors: Number of neighbours used for the variance estimate
            (capped at the number of users).
        latent_neighbor: If True, neighbours are searched in the latent user
            space (``U``); otherwise in the raw rating space.
        default_rating: Passed through to ``get_rating``; when not ``None``
            every query is answered with this value.

    Returns:
        A list of ``(item_index, rating)`` tuples in query order. Each
        ``item_index`` is a column position of ``user_item_matrix`` (not an
        item id) and appears at most once. The list is shorter than
        ``n_iterations`` if every item has been rated before the loop ends.
    """
    n_users, n_items = user_item_matrix.shape

    # SVD
    U, sigma, Vt = svds(user_item_matrix, k=n_factors)

    # Loop-invariant products, computed once instead of on every iteration:
    # latent vector -> reconstructed ratings, and rating vector -> latent
    # vector (fold-in of a new user).
    latent_to_ratings = np.dot(np.diag(sigma), Vt)
    ratings_to_latent = np.dot(Vt.T, np.diag(1.0 / sigma))

    # KNN: same model in both modes, only the space it is fitted on differs.
    knn_model = NearestNeighbors(n_neighbors=k_neighbors, metric='cosine')
    knn_model.fit(U if latent_neighbor else user_item_matrix)
    n_query_neighbors = min(k_neighbors, n_users)

    # Iterative active learning process
    summary = []
    user_vector, user_latent = np.zeros(n_items), np.mean(U, axis=0)
    # Explicit record of queried items. Testing `user_vector == 0` instead
    # would treat an item that was rated 0 as still unrated and select it again.
    already_rated = np.zeros(n_items, dtype=bool)
    for _ in range(n_iterations):
        if already_rated.all():
            # Nothing left to ask.
            break

        # Find similar users in the configured space
        query = user_latent if latent_neighbor else user_vector
        _, similar_users_indices = knn_model.kneighbors([query], n_neighbors=n_query_neighbors)

        # Get the latent vectors for these similar users for prediction
        similar_users_latents = U[similar_users_indices[0]]

        # Active learning by uncertainty sampling
        # 1. Calculate projected ratings
        rating_projections = np.dot(similar_users_latents, latent_to_ratings)
        # 2. Calculate variance of ratings across selected users; rated items
        #    get -inf so they can never win, even if every variance is 0.
        rating_variances = np.var(rating_projections, axis=0)
        rating_variances[already_rated] = -np.inf
        # 3. Uncertainty sampling: select the item with the highest variance
        selected_item = int(np.argmax(rating_variances))

        # Ask oracle to rate the new item
        rating = get_rating(selected_item, default_rating)
        # Update user vector & latent representation
        user_vector[selected_item] = rating
        already_rated[selected_item] = True
        user_latent = np.dot(user_vector, ratings_to_latent)
        # Add to summary
        summary.append((selected_item, rating))

    return summary

In [78]:
# Neighbours searched in the latent space: five queries for a simulated user
# who answers every query with +2, then for one who answers with -2.
results = active_learning_recommendation(user_item_matrix, n_iterations=5, latent_neighbor=True, default_rating=2)
print(results)
results = active_learning_recommendation(user_item_matrix, n_iterations=5, latent_neighbor=True, default_rating=-2)
print(results)

[(1433, 2), (642, 2), (514, 2), (778, 2), (396, 2)]
[(1433, -2), (1044, -2), (1506, -2), (396, -2), (778, -2)]


In [79]:
# Same two simulated users, but neighbours are searched in the raw rating
# space (latent_neighbor=False).
results = active_learning_recommendation(user_item_matrix, n_iterations=5, latent_neighbor=False, default_rating=2)
print(results)
results = active_learning_recommendation(user_item_matrix, n_iterations=5, latent_neighbor=False, default_rating=-2)
print(results)

[(1260, 2), (778, 2), (514, 2), (396, 2), (1506, 2)]
[(1260, -2), (1257, -2), (514, -2), (778, -2), (1433, -2)]
