In [2]:
pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl.metadata (327 bytes)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp311-cp311-linux_x86_64.whl size=2505206 sha256=57a7fc918f7dab631fb25f213e66914c3e8e27f2cacf9a550ca00cd564e42e3e
  Stored in directory: /root/.cache/pip/wheels/2a/8f/6e/7e2899163e2d85d8266daab4aa1cdabec7a6c56f83c015b5af
Successfully built scikit-surprise
Installi

In [4]:
!pip install "numpy<2.0"

Collecting numpy<2.0
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m80.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 2.0.2
    Uninstalling numpy-2.0.2:
      Successfully uninstalled numpy-2.0.2
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
thinc 8.3.6 requires numpy<3.0.0,>=2.0.0, but you ha

In [3]:
import pandas as pd
import numpy as np
from surprise import SVD, Dataset, Reader
from sklearn.metrics.pairwise import cosine_similarity

def load_data():
    movies = pd.read_csv("/content/movies.csv")
    ratings = pd.read_csv("/content/ratings.csv")
    return movies, ratings

def preprocess_data(movies, ratings):
    ratings['userId'] = pd.to_numeric(ratings['userId'], errors='coerce')
    ratings['movieId'] = pd.to_numeric(ratings['movieId'], errors='coerce')
    ratings['rating'] = pd.to_numeric(ratings['rating'], errors='coerce')
    movies['genres'] = movies['genres'].str.replace('|', ', ')
    ratings = ratings.dropna()
    movies = movies.dropna()
    return movies, ratings

def train_model(ratings):
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    trainset = data.build_full_trainset()
    algo = SVD(n_factors=50, n_epochs=20, lr_all=0.005, reg_all=0.02)
    algo.fit(trainset)
    return algo

def get_top_n_recommendations(algo, movies, ratings, user_id, n=20):
    all_movie_ids = movies['movieId'].unique()
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].unique()
    movies_to_predict = np.setdiff1d(all_movie_ids, rated_movies)
    testset = [[user_id, movie_id, 4.] for movie_id in movies_to_predict]
    predictions = algo.test(testset)
    recs = []
    for pred in predictions:
        movie_info = movies[movies['movieId'] == pred.iid].iloc[0]
        recs.append({
            'movieId': pred.iid,
            'title': movie_info['title'],
            'genres': movie_info['genres'],
            'predicted_rating': round(pred.est, 2)
        })
    return pd.DataFrame(recs).sort_values('predicted_rating', ascending=False)

def diversify_recommendations(recommendations, movies, n=10, diversity_weight=0.7):
    genres = movies.set_index('movieId')['genres'].str.get_dummies(', ')
    genre_matrix = genres.loc[recommendations['movieId']].values
    similarity_matrix = cosine_similarity(genre_matrix)
    selected_indices = []
    remaining_indices = list(range(len(recommendations)))
    selected_indices.append(remaining_indices.pop(0))
    while len(selected_indices) < n and remaining_indices:
        similarities = similarity_matrix[remaining_indices][:, selected_indices]
        avg_similarity = similarities.max(axis=1)
        diversity_score = 1 - avg_similarity
        combined_score = ((1 - diversity_weight) * recommendations.iloc[remaining_indices]['predicted_rating'].values + diversity_weight * diversity_score)
        next_index = remaining_indices[np.argmax(combined_score)]
        selected_indices.append(next_index)
        remaining_indices.remove(next_index)
    return recommendations.iloc[selected_indices[:n]]

def explain_recommendations(user_id, recommendations, ratings, movies):
    user_movies = ratings[ratings['userId'] == user_id]
    user_genres = movies[movies['movieId'].isin(user_movies['movieId'])]['genres']
    top_genres = user_genres.str.split(', ').explode().value_counts().head(3).index.tolist()
    explanations = []
    for _, row in recommendations.iterrows():
        movie_genres = set(row['genres'].split(', '))
        common_genres = list(movie_genres.intersection(top_genres))
        if common_genres:
            explanation = f"You like {', '.join(common_genres)} movies"
        else:
            top_similar = recommendations[recommendations['movieId'] != row['movieId']].iloc[0]['genres'].split(', ')[0]
            explanation = f"Popular with fans of {top_similar} movies"
        explanations.append(explanation)
    recommendations['explanation'] = explanations
    return recommendations

def main():
    try:
        movies, ratings = load_data()
        movies, ratings = preprocess_data(movies, ratings)
        if ratings.empty:
            raise ValueError("No valid ratings found")
        if movies.empty:
            raise ValueError("No valid movie data found")
        algo = train_model(ratings)
        user_id = 1
        basic_recs = get_top_n_recommendations(algo, movies, ratings, user_id, n=50)
        diverse_recs = diversify_recommendations(basic_recs, movies, n=15)
        final_recs = explain_recommendations(user_id, diverse_recs, ratings, movies)
        final_recs = final_recs.head(10)
        print(final_recs[['title', 'genres', 'predicted_rating', 'explanation']].to_string(index=False))
    except Exception as e:
        print(f"Error: {str(e)}")

if __name__ == "__main__":
    main()

                                               title                         genres  predicted_rating                        explanation
                         Boondock Saints, The (2000) Action, Crime, Drama, Thriller              5.00             You like Action movies
Spirited Away (Sen to Chihiro no kamikakushi) (2001)  Adventure, Animation, Fantasy              5.00          You like Adventure movies
                              His Girl Friday (1940)                Comedy, Romance              5.00             You like Comedy movies
                               Little Big Man (1970)                        Western              4.94 Popular with fans of Action movies
                                  Hoop Dreams (1994)                    Documentary              4.94 Popular with fans of Action movies
                          Maltese Falcon, The (1941)             Film-Noir, Mystery              4.85 Popular with fans of Action movies
                                 Interste

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  recommendations['explanation'] = explanations


**1. Data Pipeline**

**Input:**

* movies.csv: Movie metadata (ID, title, genres)

* ratings.csv: User ratings (userID, movieID, rating 1-5)

**Preprocessing:**

* Converted IDs/ratings to numeric types and cleaned genres (e.g., "Adventure|Children" → "Adventure, Children").

* Dropped rows with missing values to ensure data quality.

-----

**2. Model Architecture**

* Algorithm: Singular Value Decomposition (SVD)

* Chosen for its effectiveness in collaborative filtering and handling sparse data.

**Parameters:**

* n_factors=50: Captures 50 latent features (balances complexity and performance).

* n_epochs=20: Trains over 20 iterations for convergence.

* Regularization (reg_all=0.02) to prevent overfitting.

-----


**3. Recommendation Workflow**

Step 1: Initial Predictions
* For a target user (default: user #1):

* Identify movies they haven’t rated.

* Predict ratings using the trained SVD model.

* Return top 50 movies by predicted rating.

Step 2: Diversity Enhancement

* Problem: Top predictions often cluster in similar genres (e.g., all action movies).

* Solution:
Represent movies as genre vectors (e.g., Action=1, Comedy=0).

* Use cosine similarity to measure genre overlap.

**Select movies that balance:**

* High predicted ratings (70% weight).

* Genre diversity (30% weight).

Step 3: Explanations

For each recommendation:

* If the movie shares genres the user previously liked:
"You like Comedy and Romance movies."

* Else: Fallback to "Popular with fans of [Genre]" (based on most common genre in top recommendations).

**This is a collaborative filtering system enhanced with diversity ranking and explainability (which don’t affect the core CF logic).**