In [1]:
import numpy as np
from pandas import read_csv
import pickle

## Import datasets and models

In [2]:
# Read both CSVs in one pass: `data` is the merged dataset, `test` holds the
# (user_id, movie_id, rating) rows that the evaluation cell iterates over.
data, test = (
    read_csv(csv_path)
    for csv_path in ('./datasets/merged.csv', './datasets/test.csv')
)

# Preview the first rows of the evaluation set.
test.head()

Unnamed: 0,user_id,movie_id,rating
0,5412,2431,5
1,5440,111,5
2,368,2976,3
3,425,2139,4
4,4942,2532,3


In [3]:
knn_clf_url = "./models/knn_clf.pkl"
svd_url = "./models/svd_model.pkl"


def _load_pickled_model(path):
    """Return the object stored in the pickle file at ``path``.

    SECURITY: ``pickle.load`` can execute arbitrary code while deserialising.
    Only point this at model files you produced yourself or otherwise trust.
    """
    with open(path, 'rb') as model_file:
        return pickle.load(model_file)


# Both pre-trained models are loaded through the same helper so the two
# loading paths cannot drift apart.
knn_clf = _load_pickled_model(knn_clf_url)
svd = _load_pickled_model(svd_url)

# Hybrid model

In [4]:
def hybrid_prediction(user_id, movie_id, svd_weight=0.7, knn_weight=0.3,
                      svd_model=None, knn_model=None):
    """Blend the SVD and KNN rating estimates for one (user, movie) pair.

    Parameters
    ----------
    user_id, movie_id
        Identifiers forwarded unchanged to each model's ``predict`` method.
    svd_weight, knn_weight : float
        Multipliers applied to the SVD and KNN estimates when both are
        available. They are used as given; no normalisation is performed.
    svd_model, knn_model : optional
        Objects exposing ``predict(user_id, movie_id)`` whose result has an
        ``est`` attribute. When omitted, the notebook-level ``svd`` and
        ``knn_clf`` objects are used.

    Returns
    -------
    ``svd_weight * svd_est + knn_weight * knn_est`` when the SVD call
    succeeds, otherwise the KNN estimate on its own.

    Raises
    ------
    Whatever the KNN model's ``predict`` raises; only SVD failures are
    treated as recoverable.
    """
    # Resolve the defaults lazily so the globals are only touched when the
    # caller did not inject a model.
    svd_model = svd if svd_model is None else svd_model
    knn_model = knn_clf if knn_model is None else knn_model

    try:
        svd_pred = svd_model.predict(user_id, movie_id).est
    except Exception:
        # `Exception` rather than a bare `except:` so KeyboardInterrupt and
        # SystemExit still propagate (e.g. interrupting a long evaluation).
        svd_pred = None

    knn_pred = knn_model.predict(user_id, movie_id).est

    # Fall back to the KNN estimate alone when SVD could not produce one.
    if svd_pred is None:
        return knn_pred
    return svd_weight * svd_pred + knn_weight * knn_pred

In [5]:
# Demo: ask the hybrid model for a single rating, using a user id and movie id
# chosen by hand.
new_user_id, movie_id = 9999, 4
predicted_rating = hybrid_prediction(user_id=new_user_id, movie_id=movie_id)

print(f"Predicted rating for new user {new_user_id} on movie {movie_id}: {predicted_rating}")

Predicted rating for new user 9999 on movie 4: 3.091237952155751


In [6]:
def evaluate_hybrid_model(svd_weight):
    """Return the RMSE of the hybrid model over the notebook-level ``test`` frame.

    The KNN weight is derived as ``1.0 - svd_weight`` so the two weights
    always sum to one. Every row of ``test`` (read via its ``user_id``,
    ``movie_id`` and ``rating`` attributes) is scored with
    ``hybrid_prediction`` and compared against the recorded rating.
    """
    knn_weight = 1.0 - svd_weight

    observed = []
    estimated = []
    for record in test.itertuples():
        uid, mid, truth = record.user_id, record.movie_id, record.rating
        estimated.append(hybrid_prediction(uid, mid, svd_weight, knn_weight))
        observed.append(truth)

    # Root-mean-square error between recorded and predicted ratings.
    residuals = np.array(observed) - np.array(estimated)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

# Hyperparameter tuning: testing weights from 0.0 to 1.0 with a step size of 0.1
# Weight sweep: score the blend at SVD weights 0.0, 0.1, ..., 1.0 against the
# `test` frame (via evaluate_hybrid_model).
rmses, svd_weights = [], []

# np.linspace pins the number of grid points; np.arange with a fractional step
# is documented as unreliable for that. Rounding strips float noise such as
# 0.30000000000000004 from the grid values.
for svd_weight in np.round(np.linspace(0.0, 1.0, 11), 1):
    rmse = evaluate_hybrid_model(svd_weight)

    rmses.append(rmse)
    svd_weights.append(svd_weight)

    print(f"SVD Weight: {svd_weight:.1f}, KNN Weight: {1 - svd_weight:.1f}, RMSE: {rmse}")

# Surface the winning weight explicitly so the conclusion does not rely on
# reading it off the log lines above.
best_index = int(np.argmin(rmses))
print(f"Best SVD Weight: {svd_weights[best_index]:.1f}, RMSE: {rmses[best_index]}")



SVD Weight: 0.0, KNN Weight: 1.0, RMSE: 0.9978326617962954


## Result and Conclusion

1. SVD is Clearly Superior for Your Data
    - Insight: The lowest RMSE is achieved when SVD completely dominates the blend (SVD weight 1.0, KNN weight 0.0), which suggests that SVD is much better suited to the structure of your data. KNN likely adds no valuable signal, or its predictions introduce more error than they remove.

    - Action: This could be an indication that the dataset has strong latent factors (such as hidden relationships between users and items) that SVD is able to capture effectively. You might want to focus on fine-tuning and optimizing SVD rather than pursuing the hybrid approach further unless KNN is necessary for other reasons.

2. Data Characteristics Favor Latent Factor Models
    - Insight: SVD excels in datasets where latent factors (like user preferences or item attributes) play a crucial role. If your data lacks strong user-user or item-item similarities, KNN will struggle to make useful predictions.
    
    - Action: Since KNN performs better when there are strong clusters or similarities between users or items, you might want to inspect whether your data contains such patterns. If not, you may prefer to stick with SVD or explore other latent factor models like NMF (Non-negative Matrix Factorization).
        
        - Note: more user interaction data may be needed before KNN can find reliable neighbours.