In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

In [5]:
# --- Ratings -----------------------------------------------------------------
# header=0 discards the file's own header row so the names below replace it.
# nrows=500 reads only the first 500 ratings (a small sample, not the full file).
column_names = ["user_id", "item_id", "rating", "timestamp"]
dtype = {"user_id": int, "item_id": int, "rating": float, "timestamp": int}
ratings = pd.read_csv("ml-32m/ratings.csv", names=column_names, nrows=500, dtype=dtype, header=0)

# read_csv already enforces float on "rating" (a non-numeric value raises while
# parsing), so no separate numeric coercion is needed. Blank fields still parse
# as NaN for a float column, so those rows are removed here.
ratings = ratings.dropna(subset=["rating"])

# --- Movie metadata ----------------------------------------------------------
movie_columns = [
    "item_id",
    "title",
    "genres"
]
# The MovieLens CSV files are documented as UTF-8. Decoding them as latin-1
# never fails, but it silently turns accented characters in titles into
# mojibake, so the file is read as UTF-8.
df_movies = pd.read_csv("ml-32m/movies.csv", names=movie_columns, encoding="utf-8", header=0)
df_movies["item_id"] = df_movies["item_id"].astype(int)
# item_id -> title lookup used when turning recommended IDs into readable names.
movie_titles = dict(zip(df_movies["item_id"], df_movies["title"]))


# --- Item-item similarity ----------------------------------------------------
# Rows are movies, columns are users; a missing rating is encoded as 0.
# pivot() raises if the same (item_id, user_id) pair occurs more than once.
movie_user_matrix = ratings.pivot(index="item_id", columns="user_id", values="rating").fillna(0)

# Compute cosine similarity between movies (one row/column per rated movie)
movie_similarity = cosine_similarity(movie_user_matrix)

# Convert to DataFrame for easier lookup: both axes are labelled with item_id
movie_similarity_df = pd.DataFrame(movie_similarity, index=movie_user_matrix.index, columns=movie_user_matrix.index)
# NOTE(review): the column labels are integer item IDs; some parquet engines /
# older pandas versions only accept string column names -- confirm the file
# round-trips in the target environment.
movie_similarity_df.to_parquet("movie_similarity.parquet", compression="snappy")

In [9]:
def recommend_movies(new_user_ratings, top_n=10, *, similarity_df=None, titles=None):
    """
    Recommend movies for a new user based on their ratings.

    Every rated movie that is present in the similarity matrix contributes its
    similarity column, scaled by the user's rating. The contributions are summed
    per candidate movie, the movies the user already rated are removed, and the
    highest-scoring candidates are returned.

    :param new_user_ratings: Dict {item_id: rating} for the new user
    :param top_n: Number of recommendations to return; values <= 0 yield []
    :param similarity_df: Optional square item-item similarity DataFrame whose
        index and columns are item IDs. Defaults to the notebook-level
        ``movie_similarity_df``.
    :param titles: Optional dict {item_id: title}. Defaults to the
        notebook-level ``movie_titles``.
    :return: List of recommended movie titles, best first. IDs without a known
        title are rendered as "Movie <id>". Empty when none of the rated
        movies is in the similarity matrix.
    """
    # Fall back to the notebook globals only when no explicit data is passed,
    # so the function can also be used (and tested) without hidden state.
    if similarity_df is None:
        similarity_df = movie_similarity_df
    if titles is None:
        titles = movie_titles

    # head() with a negative count means "all but the last n", which is never
    # what a caller asking for recommendations wants.
    if top_n <= 0:
        return []

    scores = None
    for item_id, rating in new_user_ratings.items():
        # Test membership on the axis that is actually read below.
        if item_id not in similarity_df.columns:
            continue

        # Multiply similarity score by the user's rating
        weighted = similarity_df[item_id] * rating

        # Accumulate scores; add() aligns on item_id labels, so the
        # contributions do not need to be sorted first.
        scores = weighted if scores is None else scores.add(weighted, fill_value=0)

    # None of the rated movies is known to the similarity matrix.
    if scores is None:
        return []

    # Remove already seen movies
    scores = scores.drop(index=list(new_user_ratings), errors="ignore")

    # Get top N recommended movie IDs; a stable sort keeps the order of tied
    # scores reproducible from run to run.
    ranked = scores.sort_values(ascending=False, kind="mergesort")
    recommended_movie_ids = ranked.head(top_n).index.tolist()

    # Convert item IDs to movie titles
    return [titles.get(movie_id, f"Movie {movie_id}") for movie_id in recommended_movie_ids]


In [11]:
# Example: a brand-new user gives the top rating to three movies.
# Keys are item IDs, values are the ratings passed to recommend_movies.
new_user_ratings = {
    1: 5.0,   # movie ID 1, rated 5
    2: 5.0,   # movie ID 2, rated 5
    34: 5,    # movie ID 34, rated 5 (int here; it scales scores like 5.0)
}

# Request the default number of recommendations and show the titles.
recommended_movies = recommend_movies(new_user_ratings)
print("Recommended Movies:", recommended_movies)


Recommended Movies: ['Lion King, The (1994)', 'Jurassic Park (1993)', 'Aladdin (1992)', 'Mask, The (1994)', 'Mrs. Doubtfire (1993)', 'Forrest Gump (1994)', 'Beauty and the Beast (1991)', 'Fugitive, The (1993)', 'Speed (1994)', 'Batman (1989)']
