In [None]:
!pip install lightfm
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162669 sha256=a6a18946d933d8f1286d7541086e30f39de3410f4fa473d267210efd99fdbbd0
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
# Import necessary libraries
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import time

# Load MovieLens dataset (both CSV files are read from the working directory)
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Collaborative Filtering
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
# Fixed seeds make the split and the SVD initialisation reproducible, so the
# numbers printed below are stable across "Restart & Run All".
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)
# perf_counter() is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
cf_model = SVD(random_state=42)
cf_model.fit(trainset)

# Content-Based Filtering
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# fillna('') keeps the vectorizer from failing on movies without a genre string.
tfidf_matrix = tfidf_vectorizer.fit_transform(movies['genres'].fillna(''))
# Row i / column j of cosine_sim compares the i-th and j-th rows of `movies`.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# The measured interval covers both the SVD fit and the TF-IDF / similarity step.
training_time = time.perf_counter() - start_time

print(f"Training Time: {training_time} seconds")

# Hybrid Model
def hybrid_recommendation(userId, movieId, cf_weight=0.7, cb_weight=0.3):
    """Blend the collaborative-filtering estimate with a content-similarity score.

    Args:
        userId: Raw user id passed to ``cf_model.predict``.
        movieId: Raw movie id; looked up in ``movies['movieId']``.
        cf_weight: Weight applied to the collaborative-filtering estimate.
        cb_weight: Weight applied to the content-based similarity score.

    Returns:
        ``cf_weight * cf_estimate + cb_weight * cb_score`` where ``cb_score`` is
        the highest similarity between the movie and any *other* movie in
        ``cosine_sim``. When the movie is not present in ``movies`` (or there is
        no other movie to compare with) ``cb_score`` is taken as 0.0 instead of
        raising ``IndexError``.
    """
    # Collaborative Filtering prediction
    cf_prediction = cf_model.predict(userId, movieId).est

    # Content-Based Filtering prediction.
    # cosine_sim is indexed by row position in `movies`, so resolve the position
    # explicitly rather than relying on the DataFrame's index labels.
    matches = (movies['movieId'] == movieId).to_numpy().nonzero()[0]
    cb_prediction = 0.0
    if len(matches) > 0:
        movie_pos = int(matches[0])
        # Best similarity to any other movie; a single pass replaces sorting the
        # whole row just to read its runner-up entry.
        other_scores = [
            score
            for idx, score in enumerate(cosine_sim[movie_pos])
            if idx != movie_pos
        ]
        if other_scores:
            cb_prediction = max(other_scores)

    # Weighted Hybrid Model
    # NOTE(review): cf_prediction comes from a model built with rating_scale
    # (0.5, 5.0) while cb_prediction is a similarity score - the two terms are
    # not on the same scale; confirm this blend is intended.
    return cf_weight * cf_prediction + cb_weight * cb_prediction

# Example usage: score a single (user, movie) pair with the hybrid model
userId, movieId = 1, 47
prediction = hybrid_recommendation(userId, movieId)
print(f'Hybrid Model Prediction: {prediction}')


Training Time: 3.921278953552246 seconds
Hybrid Model Prediction: 3.5151494753270422


Data preprocessing

In [None]:
import pandas as pd

# Read the ratings table and the movie metadata table, in that order,
# from CSV files located in the working directory.
ratings, movies = (
    pd.read_csv(csv_name) for csv_name in ('ratings.csv', 'movies.csv')
)

Collaborative filtering (CF) with SVD

In [None]:
from surprise import Reader, Dataset
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import SVD  # You can try other algorithms as well

# Load data into Surprise format
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# Split data into train and test sets (seeded so the reported score is reproducible)
trainset, testset = train_test_split(data, test_size=0.25, random_state=42)

# Train a collaborative filtering model
algo = SVD(random_state=42)  # You can try other algorithms as well
algo.fit(trainset)

# Make predictions on the test set
predictions = algo.test(testset)

# Evaluate the model using RMSE (root mean squared error).
# verbose=False stops accuracy.rmse from printing the value itself, so the
# metric is reported once, by the print below.
rmse = accuracy.rmse(predictions, verbose=False)
print(f'Root Mean Squared Error (RMSE): {rmse}')

RMSE: 0.8744
Mean Absolute Error (RMSE): 0.8744154675222571


Content-based filtering (CB) with TF-IDF and cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Vectorise each movie's genre string with TF-IDF; missing genres become ''.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(
    movies['genres'].fillna('')
)

# Pairwise movie-to-movie similarity (linear kernel of the TF-IDF matrix with itself)
cosine_sim = linear_kernel(tfidf_matrix)


Hybrid Model

In [None]:
from surprise import accuracy

# Collaborative-filtering score for one (user, movie) pair
def get_collab_recommendations(userId, movieId):
    """Return the `.est` value of `algo.predict(userId, movieId)`."""
    prediction = algo.predict(userId, movieId)
    return prediction.est

# Create a function to get content-based recommendations
def get_content_recommendations(movieId, top_n=10):
    """Return the movies most similar to ``movieId`` according to ``cosine_sim``.

    Args:
        movieId: Raw movie id, matched against ``movies['movieId']``.
        top_n: Maximum number of neighbours to return.

    Returns:
        A list of ``(row_position, similarity)`` tuples sorted by similarity in
        descending order, never containing the query movie itself. The list is
        empty when ``movieId`` is not present in ``movies``.
    """
    # cosine_sim is indexed by row position in `movies`, not by movieId, so the
    # id has to be translated into a row position first.
    matches = (movies['movieId'] == movieId).to_numpy().nonzero()[0]
    if len(matches) == 0:
        return []
    movie_pos = int(matches[0])

    # Drop the query movie explicitly: movies with identical similarity tie
    # with it, so it is not guaranteed to be the first entry after sorting.
    sim_scores = [
        (idx, score)
        for idx, score in enumerate(cosine_sim[movie_pos])
        if idx != movie_pos
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)
    return sim_scores[:top_n]

# Create a function to make hybrid recommendations
def hybrid_recommendations(userId, movieId, content_weight=0.2, top_n=10):
    """Recommend titles by re-ranking content-based neighbours with CF estimates.

    Args:
        userId: Raw user id used for the collaborative-filtering estimates.
        movieId: Raw id of the seed movie whose content neighbours are ranked.
        content_weight: Weight applied to each neighbour's similarity score.
        top_n: Maximum number of titles to return.

    Returns:
        A list of movie titles ordered by descending hybrid score, where the
        score of a candidate is its collaborative-filtering estimate for
        ``userId`` plus ``content_weight`` times its similarity to the seed movie.
    """
    content_scores = get_content_recommendations(movieId)

    # Score every candidate with the CF estimate for that candidate. Using the
    # seed movie's estimate would add the same constant to each entry and leave
    # the ranking purely content-based.
    hybrid_scores = []
    for idx, content_score in content_scores:
        candidate_id = movies.iloc[idx]['movieId']
        collab_score = get_collab_recommendations(userId, candidate_id)
        hybrid_scores.append((idx, collab_score + content_weight * content_score))

    # Sort the recommendations by score
    hybrid_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Get the top_n movie recommendations
    top_movies = [movies.iloc[idx]['title'] for idx, _ in hybrid_scores[:top_n]]
    return top_movies

# Example usage: list recommended titles for one (user, seed movie) pair
userId, movieId = 2, 2
recommendations = hybrid_recommendations(userId, movieId)
print(recommendations)


['Sabrina (1995)', 'Clueless (1995)', 'Two if by Sea (1996)', 'French Twist (Gazon maudit) (1995)', 'If Lucy Fell (1996)', 'Boomerang (1992)', 'Pie in the Sky (1996)', 'Mallrats (1995)', 'Nine Months (1995)', 'Forget Paris (1995)']
