In [1]:
pip install recommenders

Collecting recommenders
  Downloading recommenders-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting cornac<3,>=1.15.2 (from recommenders)
  Downloading cornac-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (37 kB)
Collecting locust<3,>=2.12.2 (from recommenders)
  Downloading locust-2.33.2-py3-none-any.whl.metadata (9.6 kB)
Collecting memory-profiler<1,>=0.61.0 (from recommenders)
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting nltk<4,>=3.8.1 (from recommenders)
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting notebook<8,>=6.5.5 (from recommenders)
  Downloading notebook-7.3.3-py3-none-any.whl.metadata (10 kB)
Collecting retrying<2,>=1.3.4 (from recommenders)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting seaborn<1,>=0.13.0 (from recommenders)
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting pandera>=0.15.0 (from pandera[strategies]>=0.15.0; python_version >= "3.9"->recomme

In [2]:
import pandas as pd
import tensorflow as tf
import tempfile
from recommenders.models.ncf.ncf_singlenode import NCF
from recommenders.models.ncf.dataset import Dataset as NCFDataset
from recommenders.datasets.python_splitters import python_chrono_split
from recommenders.evaluation.python_evaluation import (
    rmse, mae, rsquared, exp_var,
    map_at_k, ndcg_at_k, precision_at_k, recall_at_k
)
from sklearn.metrics import mean_squared_error, r2_score
import random
import warnings
# Hide every FutureWarning for the rest of the session (applies process-wide,
# so it also hides FutureWarnings raised by imported libraries).
warnings.filterwarnings("ignore", category=FutureWarning)


def prepare_movie_data(data_path, min_interactions=5):
    """Load a ratings CSV and turn it into implicit-feedback interactions.

    Args:
        data_path: Anything ``pd.read_csv`` accepts (path or file-like object).
            The data must contain ``userId``, ``movieId`` and ``timestamp`` columns.
        min_interactions: Users with fewer rows than this are dropped.

    Returns:
        A new DataFrame with the columns ``userID``, ``itemID``, ``rating`` and
        ``timestamp``. ``userID`` / ``itemID`` are integer category codes computed
        after the user filter; ``rating`` is the constant 1 for every row.

    Raises:
        ValueError: If any of the required columns is missing.
    """
    ratings_raw = pd.read_csv(data_path)

    required_columns = {'userId', 'movieId', 'timestamp'}
    if not required_columns.issubset(ratings_raw.columns):
        raise ValueError("Dataset missing required columns")

    # Create implicit feedback: any explicit rating in the file is replaced by 1.
    interactions = ratings_raw.rename(
        columns={'userId': 'user_id', 'movieId': 'item_id'}
    ).assign(rating=1)

    interaction_counts = interactions['user_id'].value_counts()
    active_users = interaction_counts[interaction_counts >= min_interactions].index

    # Take an explicit copy of the filtered rows: assigning new columns onto a
    # boolean-filtered slice is what triggers pandas' SettingWithCopyWarning.
    interactions = interactions[interactions['user_id'].isin(active_users)].copy()

    # Re-index users and items with integer codes (computed on the filtered rows).
    interactions['userID'] = interactions['user_id'].astype('category').cat.codes
    interactions['itemID'] = interactions['item_id'].astype('category').cat.codes

    return interactions[['userID', 'itemID', 'rating', 'timestamp']]

def create_ncf_datasets(df, test_size=0.25):
    """Split interactions chronologically and keep only evaluable test rows.

    Args:
        df: Interaction frame with ``userID``, ``itemID`` and ``timestamp`` columns.
        test_size: Fraction handed to the test side; ``1 - test_size`` is passed
            to ``python_chrono_split`` as the train ratio.

    Returns:
        ``(train, test)`` where ``test`` contains only rows whose user AND item
        both occur somewhere in ``train``.
    """
    train_ratio = 1 - test_size
    chronological = df.sort_values('timestamp')
    train, test = python_chrono_split(chronological, ratio=train_ratio)

    # Drop test rows that reference users or items never seen during training.
    known_users = train['userID'].unique()
    known_items = train['itemID'].unique()
    user_is_known = test['userID'].isin(known_users)
    item_is_known = test['itemID'].isin(known_items)

    return train, test[user_is_known & item_is_known]

def train_ncf_model(train_data, test_data, params):
    """Write the splits to CSV, build an NCF dataset from them and fit a model.

    Args:
        train_data: Training interactions (written to CSV without the index).
        test_data: Test interactions (written to CSV without the index).
        params: Dict of settings. Recognised keys and their fallbacks:
            ``seed`` (None), ``model_type`` ('NeuMF'), ``n_factors`` (32),
            ``layer_sizes`` ([64, 32, 16]), ``epochs`` (15), ``batch_size`` (512),
            ``learning_rate`` (0.001), ``verbose`` (10), ``gpu_id`` (0).

    Returns:
        ``(model, dataset)``: the fitted ``NCF`` model and the ``NCFDataset`` it
        was trained on.

    Notes:
        The two CSV files are created in the system temp directory and are NOT
        removed by this function, so the returned dataset keeps pointing at
        files that exist.
        NOTE(review): the NCF dataset class appears to read its files on demand
        and to write an extra "full" test file next to ``test_file`` -- confirm
        against the installed recommenders version.
    """
    # Reserve two file names and close the handles before writing. Writing by
    # name while a NamedTemporaryFile is still open fails on Windows, and the
    # default delete-on-close would remove the files the dataset points at.
    csv_paths = []
    for _ in range(2):
        with tempfile.NamedTemporaryFile(suffix='.csv', delete=False) as handle:
            csv_paths.append(handle.name)
    train_path, test_path = csv_paths

    train_data.to_csv(train_path, index=False)
    test_data.to_csv(test_path, index=False)

    # Read the seed like every other setting so a missing key is not a KeyError.
    seed = params.get('seed')

    # Initialize dataset
    dataset = NCFDataset(
        train_file=train_path,
        test_file=test_path,
        seed=seed,
        col_rating='rating',  # Explicitly map rating column
        binary=True,  # True if using implicit feedback
        overwrite_test_file_full=True
    )

    # Model configuration
    model = NCF(
        n_users=dataset.n_users,
        n_items=dataset.n_items,
        model_type=params.get('model_type', 'NeuMF'),
        n_factors=params.get('n_factors', 32),
        layer_sizes=params.get('layer_sizes', [64, 32, 16]),
        n_epochs=params.get('epochs', 15),
        batch_size=params.get('batch_size', 512),
        learning_rate=params.get('learning_rate', 0.001),
        verbose=params.get('verbose', 10),
        seed=seed
    )

    # Model training: use the requested GPU when TensorFlow reports one,
    # otherwise fall back to the CPU.
    if tf.config.list_physical_devices('GPU'):
        device_name = f'/GPU:{params.get("gpu_id", 0)}'
    else:
        device_name = '/CPU:0'
    with tf.device(device_name):
        model.fit(dataset)

    return model, dataset

def evaluate_model(model, train_data, test_data, top_k=10, sample_size=3000, seed=None):
    """Evaluate a trained model on a random sample of training users.

    Args:
        model: Object exposing ``predict(users, items, is_list=True)`` that
            returns one score per (user, item) pair.
        train_data: Training interactions with ``userID`` and ``itemID`` columns.
        test_data: Held-out interactions with ``userID``, ``itemID`` and a label
            column named ``rating`` (``interaction`` is accepted as well).
        top_k: Number of recommendations kept per user for the ranking metrics.
        sample_size: Maximum number of users to evaluate; capped at the number
            of distinct users in ``train_data``.
        seed: Optional seed for the user sample. When None the module-level
            ``random`` generator is used.

    Returns:
        Dict with ``RMSE`` and ``R²`` over the sampled test pairs plus
        ``MAP@k``, ``NDCG@k``, ``Precision@k`` and ``Recall@k`` for ``k = top_k``.

    Raises:
        ValueError: If none of the sampled users has a row in ``test_data``.

    Notes:
        NOTE(review): when every true label is the constant 1 (as produced by
        ``prepare_movie_data``), R² has no variance to explain -- interpret the
        RMSE / R² entries with care.
    """
    # The frames built in this notebook store the label as 'rating'; keep
    # accepting 'interaction' for callers that use that column name.
    label_col = 'interaction' if 'interaction' in test_data.columns else 'rating'

    # Sample users (never more than exist, so random.sample cannot raise).
    all_users = train_data['userID'].unique().tolist()
    rng = random if seed is None else random.Random(seed)
    sampled_users = rng.sample(all_users, min(sample_size, len(all_users)))

    train_sample = train_data[train_data['userID'].isin(sampled_users)]
    test_sample = test_data[test_data['userID'].isin(sampled_users)]
    if test_sample.empty:
        raise ValueError("No test interactions found for the sampled users")

    rating_true = test_sample[['userID', 'itemID', label_col]].rename(
        columns={label_col: 'rating'}
    )

    print("\nGenerating top-K predictions for sampled users...")
    all_items = train_data['itemID'].unique().tolist()

    # One pass over the training sample builds each user's set of seen items,
    # replacing a per-user frame scan and list-membership checks.
    seen_by_user = {}
    for uid, iid in zip(train_sample['userID'].tolist(), train_sample['itemID'].tolist()):
        seen_by_user.setdefault(uid, set()).add(iid)

    pred_rows = []
    for uid in sampled_users:
        seen_items = seen_by_user.get(uid, set())
        unseen_items = [iid for iid in all_items if iid not in seen_items]
        if not unseen_items:
            continue  # nothing left to recommend to this user

        scores = model.predict([uid] * len(unseen_items), unseen_items, is_list=True)
        best = sorted(zip(unseen_items, scores), key=lambda pair: pair[1], reverse=True)[:top_k]
        pred_rows.extend(
            {'userID': uid, 'itemID': iid, 'prediction': score} for iid, score in best
        )

    ranking_pred = pd.DataFrame(pred_rows, columns=['userID', 'itemID', 'prediction'])

    # Score all held-out pairs in one batched call instead of one call per row.
    test_users = test_sample['userID'].tolist()
    test_items = test_sample['itemID'].tolist()
    rating_pred = pd.DataFrame({
        'userID': test_users,
        'itemID': test_items,
        'prediction': model.predict(test_users, test_items, is_list=True),
    })

    merged = pd.merge(rating_true, rating_pred, on=['userID', 'itemID'])
    y_true = merged['rating']
    y_pred = merged['prediction']

    print("\nEvaluating performance...")
    ranking_kwargs = dict(
        k=top_k, col_user='userID', col_item='itemID', col_prediction='prediction'
    )
    metrics = {
        # Take the root of the MSE explicitly; the `squared=False` keyword is
        # not accepted by newer scikit-learn releases.
        'RMSE': mean_squared_error(y_true, y_pred) ** 0.5,
        'R²': r2_score(y_true, y_pred),
        f'MAP@{top_k}': map_at_k(rating_true, ranking_pred, **ranking_kwargs),
        f'NDCG@{top_k}': ndcg_at_k(rating_true, ranking_pred, col_rating='rating',
                                   **ranking_kwargs),
        f'Precision@{top_k}': precision_at_k(rating_true, ranking_pred, **ranking_kwargs),
        f'Recall@{top_k}': recall_at_k(rating_true, ranking_pred, **ranking_kwargs),
    }

    return metrics



In [None]:
# Configuration parameters
params = {
    'model_type': 'NeuMF',
    'n_factors': 32,
    'layer_sizes': [128, 64, 32],
    'epochs': 20,
    'batch_size': 1024,
    'learning_rate': 0.001,
    'seed': 42,
    'gpu_id': 0
}

# Input location and filtering threshold, kept together so they are easy to change.
RATINGS_PATH = '/kaggle/input/the-movies-dataset/ratings.csv'
MIN_INTERACTIONS = 5

# Data processing
print('Preparing Data')
movie_df = prepare_movie_data(RATINGS_PATH, min_interactions=MIN_INTERACTIONS)
train_data, test_data = create_ncf_datasets(movie_df)

# Model training
print('Training')
trained_model, ncf_dataset = train_ncf_model(train_data, test_data, params)

# Evaluation
print('Evaluating')
# evaluate_model draws its user sample from the `random` module; seed it right
# before the call so the reported metrics are repeatable between runs.
random.seed(params['seed'])
evaluation_results = evaluate_model(trained_model, train_data, test_data)

# Display results
print("\nRecommendation System Performance:")
for metric, value in evaluation_results.items():
    print(f"{metric}: {value:.4f}")


Preparing Data
Training


100%|██████████| 256099/256099 [2:27:42<00:00, 28.90it/s]  


In [None]:
import os

def save_ncf_model(ncf_model, save_path='saved_ncf_model'):
    """Persist a trained model into ``save_path``.

    Args:
        ncf_model: Either a wrapper holding a Keras model in ``.model`` or an
            object that provides its own ``save(path)`` method.
        save_path: Target directory; created if it does not exist.

    Raises:
        ValueError: If the object offers neither ``.model`` nor a callable ``save``.

    Notes:
        NOTE(review): the recommenders ``NCF`` class appears to expose
        ``save(dir_name)`` rather than a Keras ``.model`` attribute, which is why
        the fallback exists -- confirm against the installed version.
    """
    if hasattr(ncf_model, 'model'):
        # Original behaviour: delegate to the wrapped Keras model.
        save_fn = ncf_model.model.save
    elif callable(getattr(ncf_model, 'save', None)):
        # Fall back to the model object's own save method.
        save_fn = ncf_model.save
    else:
        raise ValueError("The provided object has neither a Keras model nor a save() method.")

    os.makedirs(save_path, exist_ok=True)
    save_fn(save_path)
    print(f"Model saved to: {save_path}")

# Persist the model trained above into the relative directory 'saved_ncf_model'
# (resolved against the kernel's current working directory).
save_ncf_model(trained_model, save_path='saved_ncf_model')
