In [1]:
import pandas as pd

# Movies dataset
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"

movies = pd.read_csv(
    movies_url,
    sep="|",
    encoding="latin-1",
    header=None
)

# Keep only movie_id and title
movies = movies[[0, 1]]
movies.columns = ["movie_id", "title"]

print(movies.head())


   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [2]:
ratings_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"

ratings = pd.read_csv(
    ratings_url,
    sep="\t",
    header=None,
    names=["user_id", "movie_id", "rating", "timestamp"]
)

print(ratings.head())


   user_id  movie_id  rating  timestamp
0      196       242       3  881250949
1      186       302       3  891717742
2       22       377       1  878887116
3      244        51       2  880606923
4      166       346       1  886397596


In [3]:
data = pd.merge(ratings, movies, on="movie_id")

print(data.head())


   user_id  movie_id  rating  timestamp                       title
0      196       242       3  881250949                Kolya (1996)
1      186       302       3  891717742    L.A. Confidential (1997)
2       22       377       1  878887116         Heavyweights (1994)
3      244        51       2  880606923  Legends of the Fall (1994)
4      166       346       1  886397596         Jackie Brown (1997)


In [4]:
top_movies = (
    data.groupby("title")["rating"]
    .mean()
    .sort_values(ascending=False)
    .head(10)
)

print(top_movies)


title
Aiqing wansui (1994)                                 5.0
Entertaining Angels: The Dorothy Day Story (1996)    5.0
Santa with Muscles (1996)                            5.0
Prefontaine (1997)                                   5.0
They Made Me a Criminal (1939)                       5.0
Saint of Fort Washington, The (1993)                 5.0
Great Day in Harlem, A (1994)                        5.0
Star Kid (1997)                                      5.0
Marlene Dietrich: Shadow and Light (1996)            5.0
Someone Else's America (1995)                        5.0
Name: rating, dtype: float64


In [5]:
most_watched = (
    data.groupby("title")["rating"]
    .count()
    .sort_values(ascending=False)
    .head(10)
)

print(most_watched)


title
Star Wars (1977)                 583
Contact (1997)                   509
Fargo (1996)                     508
Return of the Jedi (1983)        507
Liar Liar (1997)                 485
English Patient, The (1996)      481
Scream (1996)                    478
Toy Story (1995)                 452
Air Force One (1997)             431
Independence Day (ID4) (1996)    429
Name: rating, dtype: int64


In [6]:
movie_matrix = data.pivot_table(
    index="user_id",
    columns="title",
    values="rating"
)

print(movie_matrix.head())


title    'Til There Was You (1997)  1-900 (1994)  101 Dalmatians (1996)  \
user_id                                                                   
1                              NaN           NaN                    2.0   
2                              NaN           NaN                    NaN   
3                              NaN           NaN                    NaN   
4                              NaN           NaN                    NaN   
5                              NaN           NaN                    2.0   

title    12 Angry Men (1957)  187 (1997)  2 Days in the Valley (1996)  \
user_id                                                                 
1                        5.0         NaN                          NaN   
2                        NaN         NaN                          NaN   
3                        NaN         2.0                          NaN   
4                        NaN         NaN                          NaN   
5                        NaN        

In [7]:
toy_story_ratings = movie_matrix["Toy Story (1995)"]

similar_movies = movie_matrix.corrwith(toy_story_ratings)

similar_movies = similar_movies.dropna().sort_values(ascending=False)

print(similar_movies.head(10))


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


title
Infinity (1996)                                                                      1.0
Albino Alligator (1996)                                                              1.0
Across the Sea of Time (1995)                                                        1.0
Wooden Man's Bride, The (Wu Kui) (1994)                                              1.0
Newton Boys, The (1998)                                                              1.0
Toy Story (1995)                                                                     1.0
Late Bloomers (1996)                                                                 1.0
Ladybird Ladybird (1994)                                                             1.0
Guantanamera (1994)                                                                  1.0
Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991)    1.0
dtype: float64


In [8]:
movie_counts = data.groupby("title")["rating"].count()

similar_movies_df = pd.DataFrame(similar_movies, columns=["correlation"])
similar_movies_df["count"] = movie_counts

final_recommendations = similar_movies_df[similar_movies_df["count"] > 50] \
    .sort_values("correlation", ascending=False)

print(final_recommendations.head(10))


                                             correlation  count
title                                                          
Toy Story (1995)                                1.000000    452
Raise the Red Lantern (1991)                    0.641535     58
Flubber (1997)                                  0.558389     53
Jackal, The (1997)                              0.557876     87
Craft, The (1996)                               0.549100    104
Island of Dr. Moreau, The (1996)                0.490491     57
Notorious (1946)                                0.486031     52
Cyrano de Bergerac (1990)                       0.480052     66
Beautician and the Beast, The (1997)            0.466064     86
Winnie the Pooh and the Blustery Day (1968)     0.463056     75
