In [14]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load MovieLens data
ratings = pd.read_csv('data/ratings.csv')
movies = pd.read_csv('data/movies.csv')

# Prepare data for Surprise
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader)

# Split the data
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Train SVD model
svd = SVD()
svd.fit(trainset)

def collaborative_filtering(user_id, n_recommendations=10):
    # Get all movies the user hasn't rated
    user_movies = ratings[ratings['user_id'] == user_id]['movie_id'].unique()
    all_movies = ratings['movie_id'].unique()
    movies_to_predict = np.setdiff1d(all_movies, user_movies)
    
    # Predict ratings for all movies
    predictions = [svd.predict(user_id, movie_id) for movie_id in movies_to_predict]
    
    # Sort predictions and get top N
    top_n = sorted(predictions, key=lambda x: x.est, reverse=True)[:n_recommendations]
    return [pred.iid for pred in top_n]

# Prepare genre data for content-based filtering
genre_cols = [col for col in movies.columns if 'genre_' in col]
movies['genres'] = movies[genre_cols].apply(lambda x: ' '.join(genre_cols[i] for i in range(len(genre_cols)) if x.iloc[i] == 1), axis=1)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(movies['genres'])

def content_based_filtering(movie_id, n_recommendations=10):
    idx = movies.index[movies['movie_id'] == movie_id].tolist()[0]
    movie_vector = tfidf_matrix[idx]
    similarity_scores = cosine_similarity(movie_vector, tfidf_matrix).flatten()
    similar_indices = similarity_scores.argsort()[::-1][1:n_recommendations+1]
    return movies.iloc[similar_indices]['movie_id'].tolist()

def hybrid_recommender(user_id, movie_id, n_recommendations=10):
    cf_recs = collaborative_filtering(user_id, n_recommendations)
    cb_recs = content_based_filtering(movie_id, n_recommendations)
    
    # Combine and deduplicate recommendations
    hybrid_recs = list(dict.fromkeys(cf_recs + cb_recs))
    return hybrid_recs[:n_recommendations]

# Example usage
user_id = 1
movie_id = 50  # Assuming this is a movie the user has interacted with

recommendations = hybrid_recommender(user_id, movie_id)
print(f"Hybrid recommendations for user {user_id} based on movie {movie_id}:")
for i, rec in enumerate(recommendations, 1):
    print(f"{i}. Movie ID: {rec}, Title: {movies[movies['movie_id'] == rec]['title'].values[0]}")


KeyError: "['user_id', 'movie_id'] not in index"

In [13]:
ratings.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
