In [16]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import csv
from collections import defaultdict
import json
from lightfm import LightFM
from lightfm.data import Dataset
import optuna
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import precision_at_k
import os
import numpy as np

# Default path to data files
PATH = "../data/"

In [17]:
# Load user-item interaction data
interaction_data = pd.read_csv(
    PATH + 'ml-1m/ratings.dat',
    sep='::',
    names=['user_id', 'item_id', 'rating', 'timestamp']
    )[['user_id', 'item_id', 'rating']]
display(interaction_data.shape)
interaction_data.head(5)

(1000209, 3)

Unnamed: 0,user_id,item_id,rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [18]:
# Instantiate a defaultdict to hold user features
user_data = defaultdict(dict)

# Read data and build user features dictionary
def load_feature(file_path, feature_name):
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            userId = row['userId']
            value = row[feature_name]
            user_data[userId][feature_name] = value

# Load each feature file
load_feature(PATH + 'ageRel.csv', 'age')
load_feature(PATH + 'genderRel.csv', 'gender')
load_feature(PATH + 'occupationRel.csv', 'occupation')
load_feature(PATH + 'residesRel.csv', 'zipcode')

# Build user features list
user_features_raw = [
    (userId, [f'age:{data["age"]}', f'gender:{data["gender"]}',
              f'occupation:{data["occupation"]}', f'zipcode:{data["zipcode"]}'])
    for userId, data in user_data.items()
]

# Display first 5 user features
for item in user_features_raw[:5]:
    print(item)

('1', ['age:1', 'gender:F', 'occupation:10', 'zipcode:48'])
('2', ['age:56', 'gender:M', 'occupation:16', 'zipcode:70'])
('3', ['age:25', 'gender:M', 'occupation:15', 'zipcode:55'])
('4', ['age:45', 'gender:M', 'occupation:7', 'zipcode:02'])
('5', ['age:25', 'gender:M', 'occupation:20', 'zipcode:55'])


In [19]:
# Instantiate a defaultdict to hold item features
item_data = defaultdict(lambda: defaultdict(list))

# Read data and build item features dictionary
# Modified version to handle multiple genres
def load_feature(file_path, feature_name):
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            itemId = row['movieId']
            value = row[feature_name]
            if feature_name == 'genreDesc':
                item_data[itemId]['genre'].append(value)
            else:
                item_data[itemId][feature_name] = value

# Load each feature file
load_feature(PATH + 'releaseRel.csv', 'releaseDate')
load_feature(PATH + 'genreRel.csv', 'genreDesc')

# Build item features list
item_features_raw = [
    (
        itemId,
        [f'releaseDate:{data["releaseDate"]}'] +
        [f'genre:{genre}' for genre in data['genre']]
    )
    for itemId, data in item_data.items()
]

# Manter apenas filmes que foram assitidos por pelo menos um usuário
valid_item_ids = set(interaction_data['item_id'].astype(str).unique())
item_features_raw = [
    (item_id, features) for item_id, features in item_features_raw if item_id in valid_item_ids
]

# Display first 5 item features
for item in item_features_raw[:5]:
    print(item)

('1', ['releaseDate:1995', 'genre:Animation', "genre:Children's", 'genre:Comedy'])
('2', ['releaseDate:1995', 'genre:Adventure', "genre:Children's", 'genre:Fantasy'])
('3', ['releaseDate:1995', 'genre:Comedy', 'genre:Romance'])
('4', ['releaseDate:1995', 'genre:Comedy', 'genre:Drama'])
('5', ['releaseDate:1995', 'genre:Comedy'])


In [20]:
# Load test item IDs from the json file saved
# previously from Knowledge Graph Method
with open('../experiments/test_ids.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
# Extract test item IDs as integers
test_item_ids = [item['movieId'] for item in data]
print(f"items on test set: {len(test_item_ids)}")
display(test_item_ids[:5])

items on test set: 297


['3408', '2687', '3186', '2762', '3114']

In [21]:
# Split data into train and test sets for later evaluation
# with LightFM the same way as done in Knowledge Graph Method

# Interaction data for training (excluding test items)
# Remover todas as interações dos itens de teste permite
# simular o cenário de recomendação de novos itens
train_interactions_df = interaction_data[
    ~interaction_data['item_id'].astype(str).isin(test_item_ids)]

# Interaction data for testing (only test items)
# Usado na etapa de avaliação como ground truth
test_interactions_df = interaction_data[
    interaction_data['item_id'].astype(str).isin(test_item_ids)]

# Item side features for testing (only testing items)
test_item_features = [item for item in item_features_raw if item[0] in test_item_ids]

In [22]:
# Build lightfm Dataset
dataset = Dataset()

# All unique user and item ids because LightFM needs all ids
# even if some items are only in the test set
user_ids = interaction_data['user_id'].astype(str).unique()
# Os itens em cold start precisam estar no dataset
item_ids = interaction_data['item_id'].astype(str).unique()

# Unique features from user and item features
user_feature_set = set(f for _, feats in user_features_raw for f in feats)
item_feature_set = set(f for _, feats in item_features_raw for f in feats)

# Partial fit
dataset.fit(
    users=user_ids,
    items=item_ids,
    user_features=user_feature_set,
    item_features=item_feature_set
)

# Build training matrices considering only training interactions
# so its possible to simulate cold-start for items in the test set
(interactions, weights) = dataset.build_interactions(
    [(str(row['user_id']), str(row['item_id']), row['rating']) for _, row in train_interactions_df.iterrows()]
)

user_features = dataset.build_user_features(user_features_raw)
item_features = dataset.build_item_features(item_features_raw)

In [23]:
# Implement hiperparameter tuning using Optuna
# to find the best parameters for LightFM model

# Define the objective function for optimization
def objective(trial):
    # Hiperparâmetros a otimizar
    no_components = trial.suggest_int('no_components', 20, 100)
    learning_rate = trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True)
    loss = trial.suggest_categorical('loss', ['logistic'])

    # Validação cruzada: média de N splits
    n_splits = 3
    scores = []

    for _ in range(n_splits):
        # Split data into train and validation sets
        train, valid = random_train_test_split(
            interactions, test_percentage=0.2)
        
        # Instantiate and train the model
        model = LightFM(
            no_components=no_components,
            learning_rate=learning_rate,
            loss=loss
        )

        model.fit(
            train,
            user_features=user_features,
            item_features=item_features,
            epochs=20,
            num_threads=4
        )

        # Evaluate the model using precision@k
        score = precision_at_k(
            model,
            valid,
            k=5,
            user_features=user_features,
            item_features=item_features
        ).mean()
        scores.append(score)
        
    return sum(scores) / len(scores)

In [24]:
# Run the optimization with Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

# Print the best parameters found
print("Best parameters found:")
print(study.best_params)

# Save the best parameters to a JSON file
os.makedirs("../experiments", exist_ok=True)
with open("../experiments/lightfm_best_params.json", "w", encoding="utf-8") as f:
    json.dump(
        {
            "best_params": study.best_params,
            "best_precision_at_5": study.best_value
        },
        f,
        indent=4
    )

[I 2025-10-26 14:35:22,009] A new study created in memory with name: no-name-218d7300-a34e-4152-b744-cfd6c688551f
[I 2025-10-26 14:49:02,978] Trial 0 finished with value: 0.018090691417455673 and parameters: {'no_components': 43, 'learning_rate': 0.0419953200687935, 'loss': 'logistic'}. Best is trial 0 with value: 0.018090691417455673.
[I 2025-10-26 15:12:27,852] Trial 1 finished with value: 0.017053134739398956 and parameters: {'no_components': 79, 'learning_rate': 0.00013054210973122946, 'loss': 'logistic'}. Best is trial 0 with value: 0.018090691417455673.
[I 2025-10-26 15:19:41,410] Trial 2 finished with value: 0.021899228915572166 and parameters: {'no_components': 23, 'learning_rate': 0.03906730044605027, 'loss': 'logistic'}. Best is trial 2 with value: 0.021899228915572166.
[I 2025-10-26 15:46:52,879] Trial 3 finished with value: 0.022384539246559143 and parameters: {'no_components': 92, 'learning_rate': 0.027653258787099663, 'loss': 'logistic'}. Best is trial 3 with value: 0.022

Best parameters found:
{'no_components': 90, 'learning_rate': 0.03352736068651356, 'loss': 'logistic'}


In [25]:
# Load the best parameters from the JSON file
with open("../experiments/lightfm_best_params.json", "r", encoding="utf-8") as f:
    best = json.load(f)
best_params = best["best_params"]

# Instantiate and train the final model with the best parameters
# and interactions from the entire training set
final_model = LightFM(
    no_components=best_params["no_components"],
    learning_rate=best_params["learning_rate"],
    loss=best_params["loss"]
)
final_model.fit(
    interactions,
    user_features=user_features,
    item_features=item_features,
    epochs=20,
    num_threads=4
)

<lightfm.lightfm.LightFM at 0x1ec1c524e20>

In [26]:
# Predict the 50 most relevant users for each item in the test set

# Load all the user IDs
user_ids = list(user_ids)

# Build the item features matrix for the test set
test_item_features_matrix = dataset.build_item_features(test_item_features)

# Generate recommendations for each test item
# getting top 50 users for each item so its possible to compare
# with Knowledge Graph Method results on k = 10, 20 and 50
top_k = 50
recommendations = {}

# Mapeamento reverso do índice interno para o ID real do usuário
user_id_map = {v: k for k, v in dataset.mapping()[0].items()}

for item_id in test_item_ids:
    # Índice interno do item de teste
    item_internal_idx = dataset.mapping()[2][item_id]
    # Score para todos os usuários para este item
    scores = final_model.predict(
        user_ids=np.arange(len(user_ids)),
        item_ids=np.repeat(item_internal_idx, len(user_ids)),
        user_features=user_features,
        item_features=test_item_features_matrix
    )
    # Top 50 usuários (índices ordenados por score decrescente)
    top_users_idx = np.argsort(-scores)[:top_k]
    # IDs reais dos usuários
    top_users = [user_ids[i] for i in top_users_idx]
    recommendations[item_id] = top_users

In [27]:
# Calculate precision@k and ndcg@k for k = 10, 20, 50
def precision_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    hits = sum([1 for user in recommended_k if user in relevant_set])
    return hits / k

def dcg_at_k(recommended, relevant, k):
    recommended_k = recommended[:k]
    relevant_set = set(relevant)
    return sum([1 / np.log2(idx + 2) if user in relevant_set else 0
                for idx, user in enumerate(recommended_k)])

def ndcg_at_k(recommended, relevant, k):
    dcg = dcg_at_k(recommended, relevant, k)
    ideal_dcg = sum([1 / np.log2(idx + 2) for idx in range(min(len(relevant), k))])
    return dcg / ideal_dcg if ideal_dcg > 0 else 0.0

ks = [10, 20, 50]
precision_scores = {k: [] for k in ks}
ndcg_scores = {k: [] for k in ks}

# Crie um dicionário: item_id -> lista de usuários relevantes (do test_interactions_df)
test_relevant = (
    test_interactions_df.groupby('item_id')['user_id']
    .apply(list)
    .to_dict()
)

for item_id, recommended_users in recommendations.items():
    # Converta ambos para string para garantir a comparação correta
    relevant_users = [str(u) for u in test_relevant.get(int(item_id), [])]
    recommended_users = [str(u) for u in recommended_users]
    for k in ks:
        prec = precision_at_k(recommended_users, relevant_users, k)
        ndcg = ndcg_at_k(recommended_users, relevant_users, k)
        precision_scores[k].append(prec)
        ndcg_scores[k].append(ndcg)

for k in ks:
    print(f"\nPrecision@{k} por item:")
    print(precision_scores[k])  # Lista com 30 valores

    print(f"NDCG@{k} por item:")
    print(ndcg_scores[k])


Precision@10 por item:
[0.3, 0.0, 0.0, 0.1, 0.2, 0.4, 0.4, 0.1, 0.0, 0.1, 0.5, 0.6, 0.1, 0.2, 0.0, 0.3, 0.1, 0.1, 0.2, 0.0, 0.2, 0.0, 0.0, 0.2, 0.0, 0.2, 0.0, 0.0, 0.1, 0.0, 0.1, 0.0, 0.0, 0.1, 0.2, 0.0, 0.4, 0.4, 0.4, 0.0, 0.1, 0.3, 0.4, 0.1, 0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.3, 0.0, 0.0, 0.2, 0.1, 0.0, 0.0, 0.2, 0.1, 0.1, 0.3, 0.3, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.2, 0.0, 0.2, 0.2, 0.1, 0.2, 0.0, 0.0, 0.0, 0.1, 0.1, 0.1, 0.2, 0.1, 0.2, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.3, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, 0.1, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.2, 0.1, 0.0, 0.1, 0.0, 0.0, 0.1, 0.1, 0.0, 0.1, 0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0, 0.0, 0.2, 0.0, 0.1, 0.0, 0.0, 0.1, 0.0, 0.0, 

In [28]:
# Média para cada k
for k in ks:
    print(f"Precision@{k}: {np.mean(precision_scores[k]):.4f}")
    print(f"NDCG@{k}: {np.mean(ndcg_scores[k]):.4f}")

Precision@10: 0.0562
NDCG@10: 0.0565
Precision@20: 0.0522
NDCG@20: 0.0533
Precision@50: 0.0950
NDCG@50: 0.0849


Média no dataset Movielens 100k
Precision@10: 0.1667
NDCG@10: 0.1859
Precision@20: 0.1733
NDCG@20: 0.1819
Precision@50: 0.2173
NDCG@50: 0.2108

In [29]:
# Salvar listas completas de métricas por item
results = {
    **{f"precision@{k}": precision_scores[k] for k in ks},
    **{f"ndcg@{k}": ndcg_scores[k] for k in ks}
}

with open("../experiments/lightfm_final_metrics.json", "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)