In [1]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split

# Load the movielens-100k dataset
data = Dataset.load_builtin('ml-100k')

# Split the data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] 

 Y


Trying to download dataset from https://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k


In [10]:
def run_surprise_algorithm(algo_class, algo_params, trainset, testset, verbose=True):
    """
    Train and evaluate a Surprise algorithm on given datasets.
    
    Args:
    algo_class: A Surprise algorithm class
    algo_params: Dictionary of parameters for the algorithm
    trainset: Surprise Trainset object
    testset: Surprise Testset object
    verbose: Boolean to control print statements
    
    Returns:
    train_results, test_results: Dictionaries containing evaluation metrics
    """
    start_time = datetime.now()
    train_results = {}
    test_results = {}
    
    # Initialize the algorithm with provided parameters
    algo = algo_class(**algo_params)
    
    # Train the algorithm
    if verbose:
        print('Training the model...')
    train_start = datetime.now()
    algo.fit(trainset)
    if verbose:
        print(f'Training completed. Time taken: {datetime.now() - train_start}\n')
    
    # Evaluate on training data
    if verbose:
        print('Evaluating on training data...')
    train_predictions = algo.test(trainset.build_testset())
    train_rmse = accuracy.rmse(train_predictions, verbose=False)
    train_mae = accuracy.mae(train_predictions, verbose=False)
    
    train_results['rmse'] = train_rmse
    train_results['mae'] = train_mae
    train_results['predictions'] = train_predictions
    
    if verbose:
        print(f'Training RMSE: {train_rmse}')
        print(f'Training MAE: {train_mae}\n')
    
    # Evaluate on test data
    if verbose:
        print('Evaluating on test data...')
    test_predictions = algo.test(testset)
    test_rmse = accuracy.rmse(test_predictions, verbose=False)
    test_mae = accuracy.mae(test_predictions, verbose=False)
    
    test_results['rmse'] = test_rmse
    test_results['mae'] = test_mae
    test_results['predictions'] = test_predictions
    
    if verbose:
        print(f'Test RMSE: {test_rmse}')
        print(f'Test MAE: {test_mae}\n')
    
    if verbose:
        print(f'Total time taken: {datetime.now() - start_time}')
    
    return train_results, test_results

In [11]:
# Define hyperparameters for KNNBaseline
knn_params = {
    'k': 40,
    'min_k': 1,
    'sim_options': {
        'name': 'pearson_baseline',
        'user_based': False
    }
}

# Initialize and run the KNNBaseline algorithm
train_results_knn, test_results_knn = run_surprise_algorithm(KNNBaseline, knn_params, trainset, testset)

Training the model...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Training completed. Time taken: 0:00:00.658482

Evaluating on training data...
Training RMSE: 0.4062635072543094
Training MAE: 0.3073522748402174

Evaluating on test data...
Test RMSE: 0.9237660818560733
Test MAE: 0.7229722085621993

Total time taken: 0:00:27.298315


In [12]:
# Define hyperparameters for SVD
svd_params = {
    'n_factors': 100,
    'n_epochs': 50,
    'lr_all': 0.005,
    'reg_all': 0.02
}

# Initialize and run the SVD algorithm
train_results_svd, test_results_svd = run_surprise_algorithm(SVD, svd_params, trainset, testset)

Training the model...
Training completed. Time taken: 0:00:02.254334

Evaluating on training data...
Training RMSE: 0.8408960165031524
Training MAE: 0.6663148858583093

Evaluating on test data...
Test RMSE: 0.9529774251658307
Test MAE: 0.7535350804707042

Total time taken: 0:00:03.379350


In [13]:
# Define hyperparameters for SVD
svd_params = {
    'n_factors': 1000,
    'n_epochs': 50,
    'lr_all': 0.001,
    'reg_all': 0.02
}

# Initialize and run the SVD algorithm
train_results_svd, test_results_svd = run_surprise_algorithm(SVD, svd_params, trainset, testset)

Training the model...
Training completed. Time taken: 0:00:14.103099

Evaluating on training data...
Training RMSE: 0.41082959078969183
Training MAE: 0.3264556323585697

Evaluating on test data...
Test RMSE: 0.988778241557796
Test MAE: 0.7868321855609476

Total time taken: 0:00:15.281207


In [16]:
import xgboost as xgb
import numpy as np
from surprise import Dataset, Reader, KNNBaseline, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy
from datetime import datetime
# Function to extract features for XGBoost
def extract_features(trainset):
    """
    Extract features from the Surprise trainset for XGBoost.
    
    Args:
    trainset: Surprise Trainset object
    
    Returns:
    X, y: Feature matrix and target vector
    """
    X = []
    y = []
    for uid, iid, rating in trainset.all_ratings():
        X.append([int(uid), int(iid)])
        y.append(rating)
    return np.array(X), np.array(y)

# Extract features from the trainset
X_train, y_train = extract_features(trainset)

# Train XGBoost model
xgb_params = {
    'objective': 'reg:squarederror',
    'max_depth': 6,
    'learning_rate': 0.1,
    'n_estimators': 100
}
xgb_model = xgb.XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)

# Function to evaluate XGBoost model
def evaluate_xgboost(model, testset):
    """
    Evaluate XGBoost model on the testset.
    
    Args:
    model: Trained XGBoost model
    testset: Surprise Testset object
    
    Returns:
    rmse, mae: Evaluation metrics
    """
    X_test = np.array([[int(uid), int(iid)] for (uid, iid, _) in testset])
    y_test = np.array([rating for (_, _, rating) in testset])
    y_pred = model.predict(X_test)
    rmse = np.sqrt(np.mean((y_test - y_pred) ** 2))
    mae = np.mean(np.abs(y_test - y_pred))
    return rmse, mae

# Evaluate XGBoost model
test_rmse_xgb, test_mae_xgb = evaluate_xgboost(xgb_model, testset)

print(f'XGBoost Test RMSE: {test_rmse_xgb}')
print(f'XGBoost Test MAE: {test_mae_xgb}')

XGBoost Test RMSE: 1.133272591913074
XGBoost Test MAE: 0.9292148569726943
