In [1]:
import pandas as pd
import os

# Locations of the preprocessed input tables, relative to this notebook.
PROCESSED_DIR = "../data/processed"
ratings_file, movies_file = (
    os.path.join(PROCESSED_DIR, csv_name)
    for csv_name in ("ratings_processed.csv", "movies_processed.csv")
)


In [3]:
# Read both preprocessed tables (ratings first, then movies) and preview
# the first rows of the movie table as the cell output.
ratings, movies = pd.read_csv(ratings_file), pd.read_csv(movies_file)
movies.head()


Unnamed: 0,movieId,movie_idx,title,release_date,imdb_url
0,1,24,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,147,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,233,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,47,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,75,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [4]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,user_idx,movie_idx,datetime
0,196,242,3.0,881250949,0,0,1997-12-04 15:55:49
1,186,302,3.0,891717742,1,1,1998-04-04 19:22:22
2,22,377,1.0,878887116,2,2,1997-11-07 07:18:36
3,244,51,2.0,880606923,3,3,1997-11-27 05:02:03
4,166,346,1.0,886397596,4,4,1998-02-02 05:33:16


Compute Popularity Scores

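Two options for ranking movies by popularity:

- By number of ratings (most-rated movies)

- By average rating (highest-rated movies). Note that a raw average favours movies with very few ratings: a title with a single 5-star rating ranks above a widely rated favourite.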

In [5]:
# Option 1: most rated movies.
# Count ratings per movie_idx, attach the movie metadata, and order the
# result from the most-rated title downwards.
movie_counts = (
    ratings.groupby("movie_idx")["rating"]
    .count()
    .reset_index()
    .merge(movies, on="movie_idx")
    .sort_values(by="rating", ascending=False)
    .rename(columns={"rating": "num_ratings"})
)

# Option 2: highest average rating (optional).
# Same pipeline, but aggregating with the mean instead of the count.
movie_avg = (
    ratings.groupby("movie_idx")["rating"]
    .mean()
    .reset_index()
    .merge(movies, on="movie_idx")
    .sort_values(by="rating", ascending=False)
    .rename(columns={"rating": "avg_rating"})
)


In [6]:
def recommend_popularity(top_n=10, method="most_rated", min_ratings=0):
    """
    Return the top-N popular movies as a two-column DataFrame.

    The function reads the notebook-level frames ``movie_counts`` and
    ``movie_avg`` and relies on them already being sorted from the most
    popular title downwards; it only takes the leading rows.

    Parameters
    ----------
    top_n : int
        Number of rows to return. Must be non-negative.
    method : str
        'most_rated' ranks by number of ratings; 'highest_rated' ranks by
        average rating.
    min_ratings : int, optional
        Only used with 'highest_rated'. Titles with fewer than this many
        ratings (looked up in ``movie_counts``) are dropped before the top
        N are taken. The default of 0 keeps every title, which is the
        original behaviour; with no threshold the ranking is dominated by
        titles whose average comes from very few ratings.

    Returns
    -------
    pandas.DataFrame
        Columns ['title', 'num_ratings'] for 'most_rated', or
        ['title', 'avg_rating'] for 'highest_rated'.

    Raises
    ------
    ValueError
        If ``method`` is not one of the two supported values, or if
        ``top_n`` is negative.
    """
    if method not in ("most_rated", "highest_rated"):
        raise ValueError("method must be 'most_rated' or 'highest_rated'")
    # DataFrame.head(-n) returns everything except the last n rows, which is
    # never what a "top N" request means, so reject it explicitly.
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    if method == "most_rated":
        return movie_counts.head(top_n)[["title", "num_ratings"]]

    ranked = movie_avg
    if min_ratings > 0:
        # Look up each title's rating count by movie_idx; de-duplicate first
        # because Series.map needs a uniquely indexed lookup table.
        counts_by_movie = (
            movie_counts.drop_duplicates(subset="movie_idx")
            .set_index("movie_idx")["num_ratings"]
        )
        # Titles missing from the lookup are treated as having no ratings.
        has_enough = ranked["movie_idx"].map(counts_by_movie).fillna(0) >= min_ratings
        ranked = ranked[has_enough]  # boolean filter keeps the existing sort order

    return ranked.head(top_n)[["title", "avg_rating"]]


In [7]:
# Print a heading and render the top-10 table for each ranking method,
# most-rated first.
for heading, ranking_method in (
    ("Top 10 Most Rated Movies:", "most_rated"),
    ("Top 10 Highest Rated Movies:", "highest_rated"),
):
    print(heading)
    display(recommend_popularity(top_n=10, method=ranking_method))


Top 10 Most Rated Movies:


Unnamed: 0,title,num_ratings
357,Star Wars (1977),583
157,Contact (1997),509
49,Fargo (1996),508
52,Return of the Jedi (1983),507
95,Liar Liar (1997),485
289,"English Patient, The (1996)",481
60,Scream (1996),478
24,Toy Story (1995),452
652,Air Force One (1997),431
403,Independence Day (ID4) (1996),429


Top 10 Highest Rated Movies:


Unnamed: 0,title,avg_rating
1562,Someone Else's America (1995),5.0
1619,"Saint of Fort Washington, The (1993)",5.0
1038,Prefontaine (1997),5.0
1297,Aiqing wansui (1994),5.0
1646,They Made Me a Criminal (1939),5.0
1647,Marlene Dietrich: Shadow and Light (1996),5.0
1436,Star Kid (1997),5.0
1150,"Great Day in Harlem, A (1994)",5.0
1579,Entertaining Angels: The Dorothy Day Story (1996),5.0
1130,Santa with Muscles (1996),5.0


In [8]:
# Save the most popular movies to disk

In [None]:
# Write the 100 most-rated titles next to the other processed files.
# The CSV holds the columns returned by recommend_popularity for
# 'most_rated' (title and num_ratings); the row index is not written.
top_popular_path = os.path.join(PROCESSED_DIR, "top_100_popular.csv")
top_popular = recommend_popularity(top_n=100, method="most_rated")
top_popular.to_csv(top_popular_path, index=False)
