# Data Ingestion and Preprocessing:

In [12]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score, recall_score, f1_score

from surprise import SVD, Dataset, Reader,accuracy

from surprise.model_selection import train_test_split

from surprise.accuracy import rmse, mae
from collections import defaultdict
import streamlit as st

import warnings
warnings.filterwarnings("ignore")


In [13]:
movies=pd.read_csv('../data/movies.csv')
ratings=pd.read_csv('../data/ratings.csv')

In [14]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [15]:
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [16]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [17]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [18]:
# Replace missing genre values with an empty string
movies['genres'] =movies['genres'].fillna('')

In [19]:
#check if there outliears 'ratings' > 5
out_of_bounds = ratings[(ratings['rating'] > 5)]

print(out_of_bounds)


Empty DataFrame
Columns: [userId, movieId, rating, timestamp]
Index: []


# Content-Based Filtering Module:

In [20]:

# Initialize TF-IDF Vectorizer to ignore common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Transform the 'genres' column from the movies DataFrame into TF-IDF feature vectors
tfidf_matrix = tfidf.fit_transform(movies['genres'])

# Compute the cosine similarity matrix between all movie genre vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create a reverse lookup Series to get movie indices based on movie titles
indices = pd.Series(movies.index, index=movies['title']).drop_duplicates()

# Recommendation function based on content similarity
def get_recommendations(title, movies, cosine_sim, indices, top_n=10):
    # Check if the movie title exists in the dataset
    if title not in indices:
        return pd.Series([], name='title')
    
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get a list of similarity scores for this movie with all others
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on similarity scores in descending order
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Skip the first movie (itself) and take the next top_n movies
    sim_scores = sim_scores[1:top_n+1]
    
    # Extract the indices of the recommended movies
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the titles of the recommended movies
    return movies['title'].iloc[movie_indices]

# Example usage
print(get_recommendations("Toy Story (1995)", movies, cosine_sim, indices, top_n=10))


1706                                          Antz (1998)
2355                                   Toy Story 2 (1999)
2809       Adventures of Rocky and Bullwinkle, The (2000)
3000                     Emperor's New Groove, The (2000)
3568                                Monsters, Inc. (2001)
6194                                     Wild, The (2006)
6486                               Shrek the Third (2007)
6948                       Tale of Despereaux, The (2008)
7760    Asterix and the Vikings (Astérix et les Viking...
8219                                         Turbo (2013)
Name: title, dtype: object


# Collaborative Filtering Module:

In [21]:

# 1. Prepare the data for Surprise
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# 2. Split data into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# 3. Initialize and train the SVD model
model = SVD()
model.fit(trainset)

# 4. Predict on the test set
predictions = model.test(testset)

# 5. Evaluate the model
rmse = accuracy.rmse(predictions)
mae = accuracy.mae(predictions)

# 6. Function to get top-N recommendations for a specific user
def get_top_n_recommendations(predictions, user_id, n=10, movies=None):
    user_id_str = str(user_id)
    # Filter predictions for this user
    user_predictions = [pred for pred in predictions if str(pred.uid) == user_id_str]
    # Sort predictions by estimated rating in descending order
    user_predictions.sort(key=lambda x: x.est, reverse=True)
    top_n = user_predictions[:n]

    if movies is not None:
        # Create a mapping from movieId to title
        movie_id_to_title = pd.Series(movies.title.values, index=movies.movieId.astype(str)).to_dict()
        # Return list of (title, estimated rating)
        return [(movie_id_to_title.get(str(pred.iid), "Unknown Title"), pred.est) for pred in top_n]
    else:
        # Return list of (movieId, estimated rating)
        return [(pred.iid, pred.est) for pred in top_n]

# 7. Generate predictions for all unrated movies by user 1
user_id = 1
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == user_id]['movieId'].values
unrated_movie_ids = [movie_id for movie_id in all_movie_ids if movie_id not in rated_movie_ids]

# Generate predictions for all unrated movies
predictions_for_user = [model.predict(str(user_id), str(movie_id)) for movie_id in unrated_movie_ids]

# 8. Get top 10 recommendations
top_movies = get_top_n_recommendations(predictions_for_user, user_id=user_id, n=10, movies=movies)

# 9. Display the recommendations
print(f"Top movie recommendations for user {user_id}:")
for title, score in top_movies:
    print(f"{title}: {score:.2f}")


RMSE: 0.8804
MAE:  0.6747
Top movie recommendations for user 1:
Shawshank Redemption, The (1994): 3.50
Good Will Hunting (1997): 3.50
Kill Bill: Vol. 1 (2003): 3.50
Collateral (2004): 3.50
Talladega Nights: The Ballad of Ricky Bobby (2006): 3.50
Departed, The (2006): 3.50
Dark Knight, The (2008): 3.50
Step Brothers (2008): 3.50
Inglourious Basterds (2009): 3.50
Zombieland (2009): 3.50


# Hybrid Recommendation Engine:

In [None]:
def hybrid_recommendations(userId, title, movies, model, cosine_sim, indices, top_n=10, content_weight=0.5, cf_weight=0.5):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:top_n*5]
    
    hybrid_scores = []
    for movie_idx, sim_score in sim_scores:
        movie_id = movies.iloc[movie_idx]['movieId']
        try:
            cf_pred = model.predict(userId, movie_id).est
            final_score = (content_weight * sim_score) + (cf_weight * cf_pred)
            hybrid_scores.append((movie_idx, final_score))
        except:
            continue
    
    if not hybrid_scores:
        return pd.Series([], name='title')
    
    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)[:top_n]
    top_indices = [idx for idx, _ in hybrid_scores]
    return movies['title'].iloc[top_indices]


# User Interface

In [23]:
st.title("🎬 Hybrid Movie Recommendation System")

user_id = st.number_input("Enter User ID", min_value=1, value=1)
movie_title = st.selectbox("Choose a movie you like", movies['title'].sort_values())

recommend_type = st.radio("Recommendation Type", ['Content-Based', 'Collaborative', 'Hybrid'])

top_n = st.slider("Number of recommendations", 5, 20, 10)

if st.button("Recommend"):
    if recommend_type == 'Content-Based':
        recs = get_recommendations(movie_title, movies, cosine_sim, indices, top_n)
    elif recommend_type == 'Collaborative':
        recs = get_top_n_recommendations(predictions, user_id, top_n, movies)
    else:
        recs = hybrid_recommendations(user_id, movie_title, movies, model, cosine_sim, indices, top_n)
    
    st.write("### Recommended Movies:")
    for r in recs:
        st.write(f"🎥 {r}")

2025-05-21 05:03:31.347 
  command:

    streamlit run C:\Users\HP\AppData\Roaming\Python\Python312\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-05-21 05:03:31.350 Session state does not function when running a script without `streamlit run`


In [24]:


# Define threshold above which a rating is considered relevant
RELEVANCE_THRESHOLD = 4.0

def precision_recall_at_k(predictions, k=10, threshold=RELEVANCE_THRESHOLD):
    """Return precision and recall at k for each user."""
    # Map the predictions to each user.
    user_est_true = defaultdict(list)
    for pred in predictions:
        user_est_true[pred.uid].append((pred.est, pred.r_ui))  # (predicted, actual)

    precisions = {}
    recalls = {}

    for uid, user_ratings in user_est_true.items():
        # Sort user predictions by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Compute True Positives (relevant and recommended)
        tp = sum((true_r >= threshold) for (_, true_r) in top_k)
        # Number of recommended items
        recommended = len(top_k)
        # Number of relevant items
        relevant = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        precision = tp / recommended if recommended > 0 else 0
        recall = tp / relevant if relevant > 0 else 0

        precisions[uid] = precision
        recalls[uid] = recall

    # Return average precision and recall across all users
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    f1 = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall) if (avg_precision + avg_recall) > 0 else 0

    return avg_precision, avg_recall, f1


In [25]:
# Use testset predictions (includes true ratings)
precision, recall, f1 = precision_recall_at_k(predictions, k=10, threshold=4.0)

print(f"Precision@10: {precision:.4f}")
print(f"Recall@10: {recall:.4f}")
print(f"F1-Score@10: {f1:.4f}")


Precision@10: 0.6462
Recall@10: 0.6607
F1-Score@10: 0.6533
