In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
import numpy as np
import pandas as pd
import joblib


In [2]:
# Load the raw ratings and the movie metadata tables from the working directory.
ratings = pd.read_csv("ratings.csv")
movies = pd.read_csv("movies.csv")

# Only the final expression of a cell is rendered automatically, so the
# ratings preview has to be displayed explicitly or it is silently discarded.
display(ratings.head())
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Headline sizes of the raw ratings table.
for label, value in (
    ("Total ratings:", len(ratings)),
    ("Unique users:", ratings['userId'].nunique()),
    ("Unique movies rated:", ratings['movieId'].nunique()),
):
    print(label, value)


Total ratings: 25000095
Unique users: 162541
Unique movies rated: 59047


In [4]:
# Attach title/genres to every rating row; rows are matched on movieId
# using an inner join (the pandas default for merge).
data = pd.merge(ratings, movies, on="movieId", how="inner")
data.head()


Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,296,5.0,1147880044,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller
1,1,306,3.5,1147868817,Three Colors: Red (Trois couleurs: Rouge) (1994),Drama
2,1,307,5.0,1147868828,Three Colors: Blue (Trois couleurs: Bleu) (1993),Drama
3,1,665,5.0,1147878820,Underground (1995),Comedy|Drama|War
4,1,899,3.5,1147868510,Singin' in the Rain (1952),Comedy|Musical|Romance


In [5]:
# Plain-text preview of the three columns the recommenders rely on.
print(data.head()[['userId', 'title', 'rating']])


   userId                                             title  rating
0       1                               Pulp Fiction (1994)     5.0
1       1  Three Colors: Red (Trois couleurs: Rouge) (1994)     3.5
2       1  Three Colors: Blue (Trois couleurs: Bleu) (1993)     5.0
3       1                                Underground (1995)     5.0
4       1                        Singin' in the Rain (1952)     3.5


In [6]:
# Minimum number of ratings a movie must have to stay in the working set.
# Named so the threshold can be tuned in one place instead of being buried
# in the filter expression.
MIN_RATINGS_PER_MOVIE = 1000

# Number of rating rows per movieId, most-rated first.
movie_rating_counts = data['movieId'].value_counts()

# Ids of the movies that meet the popularity threshold.
popular_movie_ids = movie_rating_counts[movie_rating_counts >= MIN_RATINGS_PER_MOVIE].index

# Keep only the rating rows that belong to those movies.
data_small = data[data['movieId'].isin(popular_movie_ids)]


In [7]:
# Same headline numbers as before, now for the popularity-filtered rows.
for label, value in (
    ("Ratings after filtering:", len(data_small)),
    ("Movies after filtering:", data_small['movieId'].nunique()),
    ("Users after filtering:", data_small['userId'].nunique()),
):
    print(label, value)


Ratings after filtering: 22141815
Movies after filtering: 3794
Users after filtering: 162539


In [8]:
# Wide table: one row per movie title, one column per user, cells hold the
# rating. Pairs that occur more than once are averaged (pivot_table's default
# aggregation); pairs with no rating are NaN.
movie_user_matrix = data_small.pivot_table(
    values="rating",
    index="title",
    columns="userId",
    aggfunc="mean",
)


In [9]:
# (number of movie titles, number of users) in the pivoted table.
movie_user_matrix.shape


(3794, 162539)

In [10]:
# Missing (movie, user) pairs become 0 so the table can be used as a numeric
# matrix; from here on 0 stands for "not rated".
movie_user_matrix_filled = movie_user_matrix.fillna(value=0)

In [11]:
# Pairwise cosine similarity between the rows (movies) of the filled matrix.
movie_similarity = cosine_similarity(movie_user_matrix_filled)


In [12]:
# Label both axes of the similarity array with the movie titles so that
# similarities can be looked up by title.
movie_similarity_df = pd.DataFrame(
    data=movie_similarity,
    columns=movie_user_matrix.index,
    index=movie_user_matrix.index,
)


In [13]:
def recommend_similar_movies(movie_title, n=10):
    """Return the ``n`` movies most similar to ``movie_title``.

    Similarities are read from the module-level ``movie_similarity_df``.

    Parameters
    ----------
    movie_title : label of a column in ``movie_similarity_df``.
    n : int, default 10
        Number of similar movies to return.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by title, highest first, never containing
        ``movie_title`` itself. If the title is unknown, the string
        ``"Movie not found."`` is returned instead.
    """
    if movie_title not in movie_similarity_df:
        return "Movie not found."

    similarity_scores = movie_similarity_df[movie_title]
    # Exclude the query movie by label rather than by position: after sorting,
    # the movie itself is not guaranteed to come first (ties or floating-point
    # rounding can put another title ahead of it), in which case skipping the
    # first row would drop a real match and return the query movie.
    other_movies = similarity_scores.drop(labels=movie_title, errors="ignore")
    return other_movies.sort_values(ascending=False).head(n)


In [14]:
# Example query: the ten titles most similar to Toy Story (default n=10).
recommend_similar_movies("Toy Story (1995)")


title
Star Wars: Episode IV - A New Hope (1977)                0.567295
Toy Story 2 (1999)                                       0.563552
Back to the Future (1985)                                0.548098
Forrest Gump (1994)                                      0.543831
Jurassic Park (1993)                                     0.539747
Star Wars: Episode VI - Return of the Jedi (1983)        0.538788
Independence Day (a.k.a. ID4) (1996)                     0.537744
Lion King, The (1994)                                    0.530681
Aladdin (1992)                                           0.528681
Star Wars: Episode V - The Empire Strikes Back (1980)    0.513260
Name: Toy Story (1995), dtype: float64

In [15]:
# Flip the filled matrix so that each row is one user and each column a movie.
user_movie_matrix = movie_user_matrix_filled.transpose()
user_movie_matrix.shape


(162539, 3794)

In [16]:
# Number of (filtered) ratings per user, most active users first.
user_rating_counts = (
    data_small
    .groupby('userId')
    .size()
    .sort_values(ascending=False)
)
user_rating_counts.head(10)


userId
57548     3459
72315     3419
20055     3175
107650    2927
80974     2915
33844     2891
49403     2826
92046     2819
162516    2778
30643     2750
dtype: int64

In [17]:
# Use the most active rater (first entry of the descending counts) as the
# user to test recommendations on.
test_user = user_rating_counts.index[0]
test_user


57548

In [18]:
# Brute-force cosine nearest-neighbour index over the user rating vectors.
# NOTE(review): n_neighbors=21 presumably means 20 neighbours plus the query
# user, who comes back as their own nearest match -- confirm before changing.
knn_users = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=21)
knn_users.fit(user_movie_matrix)


In [19]:
# Query the index with the test user's row (double brackets keep it 2-D).
user_vector = user_movie_matrix.loc[[test_user]]
distances, indices = knn_users.kneighbors(user_vector)

# kneighbors returns one row per query; translate row positions to user ids.
neighbor_distances = distances[0]
neighbor_user_ids = user_movie_matrix.index[indices[0]]

# Peek at the five closest (user id, cosine distance) pairs.
[(uid, dist) for uid, dist in zip(neighbor_user_ids[:5], neighbor_distances[:5])]


[(57548, 4.440892098500626e-16),
 (107650, 0.3175845574646714),
 (72315, 0.32169407746575396),
 (162516, 0.34643104199807184),
 (37438, 0.3527486872403709)]

In [20]:
def recommend_movies_for_user(user_id, user_movie_matrix, knn_model, n_recs=10, n_neighbors=20):
    """Recommend movies for ``user_id`` from the ratings of similar users.

    Parameters
    ----------
    user_id : label of the target row in ``user_movie_matrix``.
    user_movie_matrix : pandas.DataFrame
        One row per user, one column per movie; values ``<= 0`` are treated
        as "not rated" when filtering the user's own movies.
    knn_model : fitted neighbours model exposing ``kneighbors``.
        Assumed to have been fitted on the rows of ``user_movie_matrix`` in
        the same order -- verify at the call site.
    n_recs : int, default 10
        Maximum number of recommendations returned.
    n_neighbors : int, default 20
        Number of other users whose ratings are combined.

    Returns
    -------
    pandas.Series
        Scores indexed by movie, highest first, excluding movies the user has
        already rated.

    Raises
    ------
    KeyError
        If ``user_id`` is not a row label of ``user_movie_matrix``.
    """
    # 1) Find nearest neighbors. One extra neighbour is requested because the
    # query user is normally returned as their own closest match. The request
    # is capped at the number of fitted rows, since asking for more makes
    # kneighbors raise on small data sets.
    user_vector = user_movie_matrix.loc[[user_id]]
    available = getattr(knn_model, "n_samples_fit_", len(user_movie_matrix))
    k = min(n_neighbors + 1, available)
    distances, indices = knn_model.kneighbors(user_vector, n_neighbors=k)

    neighbor_ids = user_movie_matrix.index[indices[0]]
    neighbor_dist = distances[0]

    # Drop the query user by label (not by position), then keep at most
    # n_neighbors of the remaining users.
    is_other = np.asarray(neighbor_ids != user_id)
    neighbor_ids = neighbor_ids[is_other][:n_neighbors]
    neighbor_dist = neighbor_dist[is_other][:n_neighbors]

    # 2) Convert distances to similarities (negative values clipped to 0)
    sims = np.clip(1 - neighbor_dist, 0, None)

    # 3) Similarity-weighted mean of the neighbours' rows. Unrated cells are 0
    # and count as such; the epsilon avoids division by zero.
    neighbors_matrix = user_movie_matrix.loc[neighbor_ids]
    weighted_scores = np.dot(sims, neighbors_matrix.values) / (sims.sum() + 1e-9)

    scores = pd.Series(weighted_scores, index=user_movie_matrix.columns)

    # 4) Remove movies the user already rated
    already_rated = user_movie_matrix.loc[user_id]
    already_rated_titles = already_rated[already_rated > 0].index

    recs = scores.drop(index=already_rated_titles).sort_values(ascending=False).head(n_recs)
    return recs


In [21]:
# Top-10 recommendations for the test user, built from their 20 nearest
# neighbours in the fitted k-NN index.
recs = recommend_movies_for_user(
    user_id=test_user,
    user_movie_matrix=user_movie_matrix,
    knn_model=knn_users,
    n_recs=10,
    n_neighbors=20
)

recs


title
Memento (2000)                         4.504928
Terminator 2: Judgment Day (1991)      4.395312
Donnie Darko (2001)                    4.078177
Run Lola Run (Lola rennt) (1998)       4.022651
Minority Report (2002)                 3.995745
Diabolique (Les diaboliques) (1955)    3.014307
Payback (1999)                         2.985319
Primer (2004)                          2.971964
Don't Look Now (1973)                  2.956789
Freaks (1932)                          2.863815
dtype: float64

In [22]:
# Turn the score Series into a two-column table and attach each title's
# genres; the left join keeps every recommendation even without a match.
recs_df = (
    recs
    .rename_axis("title")
    .reset_index(name="score")
    .merge(movies[["title", "genres"]], on="title", how="left")
)
recs_df


Unnamed: 0,title,score,genres
0,Memento (2000),4.504928,Mystery|Thriller
1,Terminator 2: Judgment Day (1991),4.395312,Action|Sci-Fi
2,Donnie Darko (2001),4.078177,Drama|Mystery|Sci-Fi|Thriller
3,Run Lola Run (Lola rennt) (1998),4.022651,Action|Crime
4,Minority Report (2002),3.995745,Action|Crime|Mystery|Sci-Fi|Thriller
5,Diabolique (Les diaboliques) (1955),3.014307,Horror|Mystery|Thriller
6,Payback (1999),2.985319,Action|Thriller
7,Primer (2004),2.971964,Drama|Sci-Fi
8,Don't Look Now (1973),2.956789,Drama|Horror|Thriller
9,Freaks (1932),2.863815,Crime|Drama|Horror


In [23]:
# The per-user lookup that used to sit here was guarded by `if False`, so it
# never ran and the name was always bound to None. The unreachable branch is
# removed; the binding is kept so later references still resolve.
user_ratings = None


In [24]:
# The test user's ten highest ratings; zeros are skipped because they mark
# movies the user has not rated.
top_rated = (
    user_movie_matrix.loc[test_user]
    .loc[lambda ratings_row: ratings_row > 0]
    .sort_values(ascending=False)
    .head(10)
)
top_rated


title
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964)    5.0
Iron Giant, The (1999)                                                         5.0
Showgirls (1995)                                                               5.0
Andromeda Strain, The (1971)                                                   5.0
John Wick: Chapter Two (2017)                                                  5.0
Casablanca (1942)                                                              5.0
Bridge on the River Kwai, The (1957)                                           5.0
Adventures of Baron Munchausen, The (1988)                                     5.0
Kelly's Heroes (1970)                                                          5.0
Vampire Hunter D: Bloodlust (Banpaia hantâ D) (2000)                           5.0
Name: 57548, dtype: float64

In [25]:
# Persist the similarity table, the user x movie matrix and the fitted k-NN
# index to the working directory. joblib.dump returns the list of files it
# wrote, which is what the cell displays for the last call.
# NOTE(review): user_movie_matrix is a dense users x movies frame, so its
# pickle is likely to be large -- confirm this is acceptable.
joblib.dump(movie_similarity_df, "movie_similarity_df.pkl")
joblib.dump(user_movie_matrix, "user_movie_matrix.pkl")
joblib.dump(knn_users, "knn_users.pkl")


['knn_users.pkl']

In [26]:
# Round-trip check: reload the similarity table and confirm it is square.
# joblib files are pickle-based, so only load files produced by a trusted source.
joblib.load("movie_similarity_df.pkl").shape


(3794, 3794)

In [27]:
def recommend_similar_movies(title, n=10, similarity_df=None):
    """
    Item-Item collaborative filtering.
    Returns the top-n most similar movies to `title` using cosine similarity.

    Parameters
    ----------
    title : column label of the movie to look up.
    n : int, default 10
        Number of similar movies to return.
    similarity_df : pandas.DataFrame, optional
        Movie-by-movie similarity table. Falls back to the module-level
        ``movie_similarity_df`` when omitted.

    Returns
    -------
    pandas.Series
        Similarity scores indexed by title, highest first, never containing
        `title` itself. If `title` is not a column, a message string is
        returned instead; it lists up to 10 titles that contain `title` as a
        case-insensitive substring when any exist.
    """
    if similarity_df is None:
        similarity_df = movie_similarity_df  # uses global if not provided

    if title not in similarity_df.columns:
        # helpful fallback: show close matches
        needle = title.lower()
        close = [t for t in similarity_df.columns if needle in t.lower()][:10]
        msg = f"Movie not found: {title}"
        if close:
            msg += "\nDid you mean one of these?\n" + "\n".join(close)
        return msg

    # Exclude the query movie by label rather than by position: after sorting,
    # the movie itself is not guaranteed to come first (ties or floating-point
    # rounding can put another title ahead of it), in which case skipping the
    # first row would drop a real match and return the query movie.
    scores = similarity_df[title].drop(labels=title, errors="ignore")
    return scores.sort_values(ascending=False).head(n)


In [28]:
# Example query against the global similarity table: ten titles most similar
# to Toy Story.
recommend_similar_movies("Toy Story (1995)", n=10)


title
Star Wars: Episode IV - A New Hope (1977)                0.567295
Toy Story 2 (1999)                                       0.563552
Back to the Future (1985)                                0.548098
Forrest Gump (1994)                                      0.543831
Jurassic Park (1993)                                     0.539747
Star Wars: Episode VI - Return of the Jedi (1983)        0.538788
Independence Day (a.k.a. ID4) (1996)                     0.537744
Lion King, The (1994)                                    0.530681
Aladdin (1992)                                           0.528681
Star Wars: Episode V - The Empire Strikes Back (1980)    0.513260
Name: Toy Story (1995), dtype: float64

In [29]:
def recommend_movies_for_user(user_id, n_recs=10, n_neighbors=20,
                             user_item_matrix=None, knn_model=None):
    """
    User-based collaborative filtering with k-NN (cosine).
    Returns top-n recommended movies for `user_id`.

    Parameters
    ----------
    user_id : label of the target row in the user-item matrix.
    n_recs : int, default 10
        Maximum number of recommendations returned.
    n_neighbors : int, default 20
        Number of other users whose ratings are combined.
    user_item_matrix : pandas.DataFrame, optional
        One row per user, one column per movie; values ``<= 0`` are treated
        as "not rated". Falls back to the module-level ``user_movie_matrix``.
    knn_model : fitted neighbours model exposing ``kneighbors``, optional
        Falls back to the module-level ``knn_users``. Assumed to have been
        fitted on the rows of the matrix in the same order -- verify at the
        call site.

    Returns
    -------
    pandas.Series
        Scores indexed by movie, highest first, excluding movies the user has
        already rated. If `user_id` is not in the matrix index, a
        ``"User not found: ..."`` string is returned instead.
    """
    if user_item_matrix is None:
        user_item_matrix = user_movie_matrix  # global
    if knn_model is None:
        knn_model = knn_users  # global

    if user_id not in user_item_matrix.index:
        return f"User not found: {user_id}"

    # Find neighbors. One extra neighbour is requested because the query user
    # is normally returned as their own closest match. The request is capped
    # at the number of fitted rows, since asking for more makes kneighbors
    # raise on small data sets.
    user_vector = user_item_matrix.loc[[user_id]]
    available = getattr(knn_model, "n_samples_fit_", len(user_item_matrix))
    k = min(n_neighbors + 1, available)
    distances, indices = knn_model.kneighbors(user_vector, n_neighbors=k)

    neighbor_ids = user_item_matrix.index[indices[0]]
    neighbor_dist = distances[0]

    # Drop self by label (not by position), then keep at most n_neighbors of
    # the remaining users.
    is_other = np.asarray(neighbor_ids != user_id)
    neighbor_ids = neighbor_ids[is_other][:n_neighbors]
    neighbor_dist = neighbor_dist[is_other][:n_neighbors]

    # Cosine distance -> similarity, with negative values clipped to 0.
    sims = np.clip(1 - neighbor_dist, 0, None)

    neighbors_matrix = user_item_matrix.loc[neighbor_ids]

    # Similarity-weighted mean of the neighbours' rows. Unrated cells are 0
    # and count as such; the epsilon avoids division by zero.
    weighted_scores = np.dot(sims, neighbors_matrix.values) / (sims.sum() + 1e-9)
    scores = pd.Series(weighted_scores, index=user_item_matrix.columns)

    # Remove already-rated movies
    already_rated = user_item_matrix.loc[user_id]
    already_rated_titles = already_rated[already_rated > 0].index

    recs = scores.drop(index=already_rated_titles).sort_values(ascending=False).head(n_recs)
    return recs


In [30]:
test_user = 57548  # same id as the most active rater selected earlier in the notebook
recommend_movies_for_user(test_user, n_recs=10)


title
Memento (2000)                         4.504928
Terminator 2: Judgment Day (1991)      4.395312
Donnie Darko (2001)                    4.078177
Run Lola Run (Lola rennt) (1998)       4.022651
Minority Report (2002)                 3.995745
Diabolique (Les diaboliques) (1955)    3.014307
Payback (1999)                         2.985319
Primer (2004)                          2.971964
Don't Look Now (1973)                  2.956789
Freaks (1932)                          2.863815
dtype: float64

In [31]:
# Recompute the test user's top-10 and present it as a table with genres;
# the left join keeps every recommendation even without a metadata match.
recs = recommend_movies_for_user(test_user, n_recs=10)
recs_df = (
    recs
    .rename_axis("title")
    .reset_index(name="score")
    .merge(movies[["title","genres"]], on="title", how="left")
)
recs_df


Unnamed: 0,title,score,genres
0,Memento (2000),4.504928,Mystery|Thriller
1,Terminator 2: Judgment Day (1991),4.395312,Action|Sci-Fi
2,Donnie Darko (2001),4.078177,Drama|Mystery|Sci-Fi|Thriller
3,Run Lola Run (Lola rennt) (1998),4.022651,Action|Crime
4,Minority Report (2002),3.995745,Action|Crime|Mystery|Sci-Fi|Thriller
5,Diabolique (Les diaboliques) (1955),3.014307,Horror|Mystery|Thriller
6,Payback (1999),2.985319,Action|Thriller
7,Primer (2004),2.971964,Drama|Sci-Fi
8,Don't Look Now (1973),2.956789,Drama|Horror|Thriller
9,Freaks (1932),2.863815,Crime|Drama|Horror
