In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# -----------------------------
# 1) Load Movies Dataset
# -----------------------------
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"

movies = pd.read_csv(
    movies_url,
    sep="|",
    encoding="latin-1",
    header=None
)

movies = movies[[0, 1]]
movies.columns = ["movie_id", "title"]

# -----------------------------
# 2) Load Ratings Dataset
# -----------------------------
ratings_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"

ratings = pd.read_csv(
    ratings_url,
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"]
)

# -----------------------------
# 3) Merge
# -----------------------------
data = pd.merge(ratings, movies, on="movie_id")

print("Data shape:", data.shape)
print(data.head())
movie_rating_counts = data.groupby("title")["rating"].count()

popular_movies = movie_rating_counts[movie_rating_counts >= 50].index

data_filtered = data[data["title"].isin(popular_movies)]


# -----------------------------
# 4) Create Movie-User Matrix
# -----------------------------
movie_user_matrix = data.pivot_table(
    index="title",
    columns="user_id",
    values="rating"
).fillna(0)

print("Movie-user matrix shape:", movie_user_matrix.shape)

# Convert to sparse matrix for speed
matrix_sparse = csr_matrix(movie_user_matrix.values)

# -----------------------------
# 5) Train KNN Model
# -----------------------------
knn = NearestNeighbors(metric="cosine", algorithm="brute")
knn.fit(matrix_sparse)

# -----------------------------
# 6) Recommendation Function
# -----------------------------
def recommend_movies(movie_name, n_recommendations=10):
    if movie_name not in movie_user_matrix.index:
        return f"‚ùå Movie '{movie_name}' not found. Try another title."

    movie_index = movie_user_matrix.index.get_loc(movie_name)

    distances, indices = knn.kneighbors(
        movie_user_matrix.iloc[movie_index].values.reshape(1, -1),
        n_neighbors=n_recommendations + 1
    )

    recommendations = []
    for i in range(1, len(indices[0])):  # skip itself
        recommended_title = movie_user_matrix.index[indices[0][i]]
        similarity = 1 - distances[0][i]  # cosine similarity
        recommendations.append((recommended_title, round(similarity, 3)))

    return recommendations

# -----------------------------
# 7) Test Recommendation
# -----------------------------
print("\nüé¨ Recommendations for Toy Story (1995):\n")
print(recommend_movies("Toy Story (1995)", 10))


Data shape: (100000, 5)
   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)
Movie-user matrix shape: (1664, 943)

üé¨ Recommendations for Toy Story (1995):

[('Star Wars (1977)', np.float64(0.735)), ('Return of the Jedi (1983)', np.float64(0.7)), ('Independence Day (ID4) (1996)', np.float64(0.69)), ('Rock, The (1996)', np.float64(0.665)), ('Mission: Impossible (1996)', np.float64(0.641)), ('Willy Wonka and the Chocolate Factory (1971)', np.float64(0.638)), ('Star Trek: First Contact (1996)', np.float64(0.637)), ('Fargo (1996)', np.float64(0.631)), ('Jerry Maguire (1996)', np.float64(0.624)), ('Raiders of the Lost Ark (1981)', np.float6