In [1]:
import os
import pandas as pd
# One-time dataset bootstrap: fetch and unpack the MovieLens "latest small"
# archive only when the target directory is missing, so re-running this cell
# does not download the data again.
# The `!` lines are IPython shell escapes; they need `wget`, `unzip`, `rm`
# and `mv` to be available on the host shell.
if not os.path.exists('movielens_small'):
    # Download the zip archive into the current working directory.
    !wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
    # Extract it; the archive unpacks into ./ml-latest-small/.
    !unzip ml-latest-small.zip
    # Remove the archive once its contents are extracted.
    !rm ml-latest-small.zip
    # Rename to the directory name that the guard above checks for.
    !mv ml-latest-small movielens_small

--2024-03-20 15:40:41--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: 'ml-latest-small.zip'


2024-03-20 15:40:43 (1.42 MB/s) - 'ml-latest-small.zip' saved [978202/978202]

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [2]:
def load_data(data_dir='movielens_small'):
    """Read the ratings and movies tables from a dataset directory.

    Parameters
    ----------
    data_dir : str, optional
        Directory that contains ``ratings.csv`` and ``movies.csv``.
        Defaults to ``'movielens_small'``, the directory this notebook
        has always read from.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(ratings_df, movies_df)`` exactly as parsed by ``pandas.read_csv``
        with its default options.

    Raises
    ------
    FileNotFoundError
        If either CSV file is missing from ``data_dir``.
    """
    # os.path.join keeps the path valid regardless of the platform separator.
    ratings_df = pd.read_csv(os.path.join(data_dir, 'ratings.csv'))
    movies_df = pd.read_csv(os.path.join(data_dir, 'movies.csv'))
    return ratings_df, movies_df
# Load both tables once into notebook-level frames; the cells below read them.
ratings_df, movies_df = load_data()

In [3]:
# Sanity-check the ratings schema by showing the first five rows.
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Sanity-check the movies schema by showing the first five rows.
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
def calculate_popularity(movies_df, ratings_df, damping_factor=10):
    """Return ``movies_df`` with per-movie popularity statistics attached.

    Three columns are added (or overwritten if already present):

    * ``num_ratings`` - number of ratings the movie received.
    * ``mean_ratings`` - plain arithmetic mean of those ratings.
    * ``damped_mean_rating`` - mean pulled toward the global mean:
      ``(sum_of_ratings + k * global_mean) / (num_ratings + k)`` with
      ``k = damping_factor``. A larger ``k`` pulls movies with few ratings
      closer to the global mean.

    Parameters
    ----------
    movies_df : pandas.DataFrame
        Must contain a ``movieId`` column. It is NOT modified; a new frame
        is returned.
    ratings_df : pandas.DataFrame
        Must contain ``movieId`` and ``rating`` columns.
    damping_factor : int or float, optional
        Weight ``k`` given to the global mean in the damped mean.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows and columns of ``movies_df`` plus the three
        statistics. Movies with no rows in ``ratings_df`` get NaN in all
        three columns (which also makes ``num_ratings`` a float column).
    """
    # One grouped pass yields count, mean and sum for every rated movie.
    per_movie = ratings_df.groupby('movieId')['rating'].agg(['count', 'mean', 'sum'])

    # Mean over every individual rating, not a mean of per-movie means.
    global_mean = ratings_df['rating'].mean()
    damped_mean_rating = (
        (per_movie['sum'] + damping_factor * global_mean)
        / (per_movie['count'] + damping_factor)
    )

    # Mapping movieId through the movieId-indexed statistics aligns them with
    # movies_df's own row order; ids with no ratings come back as NaN.
    movie_ids = movies_df['movieId']
    # assign() builds a new frame, so the caller's DataFrame stays untouched.
    return movies_df.assign(
        num_ratings=movie_ids.map(per_movie['count']),
        mean_ratings=movie_ids.map(per_movie['mean']),
        damped_mean_rating=movie_ids.map(damped_mean_rating),
    )

In [6]:
# Attach num_ratings, mean_ratings and damped_mean_rating to the movies table.
# damping_factor=10 is the function's default, spelled out here for visibility.
movies_df = calculate_popularity(movies_df, ratings_df, damping_factor=10)

In [7]:
# Rank by raw popularity: the five movies with the most ratings.
movies_df.sort_values(by='num_ratings', ascending=False).head()

Unnamed: 0,movieId,title,genres,num_ratings,mean_ratings,damped_mean_rating
314,356,Forrest Gump (1994),Comedy|Drama|Romance|War,329.0,4.164134,4.144589
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022,4.400659
257,296,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307.0,4.197068,4.175128
510,593,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279.0,4.16129,4.138462
1939,2571,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278.0,4.192446,4.168457


In [8]:
# Rank by plain mean rating. In the output below every top row has
# num_ratings == 1 and a 5.0 mean, which is the weakness of an undamped mean
# that the damped mean is meant to correct.
movies_df.sort_values(by='mean_ratings', ascending=False).head()

Unnamed: 0,movieId,title,genres,num_ratings,mean_ratings,damped_mean_rating
7656,88448,Paper Birds (Pájaros de papel) (2010),Comedy|Drama,1.0,5.0,3.637779
8107,100556,"Act of Killing, The (2012)",Documentary,1.0,5.0,3.637779
9083,143031,Jump In! (2007),Comedy|Drama|Romance,1.0,5.0,3.637779
9094,143511,Human (2015),Documentary,1.0,5.0,3.637779
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy,1.0,5.0,3.637779


In [9]:
# Rank by damped mean rating and show the top ten; movies with very few
# ratings are pulled toward the global mean and no longer lead the list.
movies_df.sort_values(by='damped_mean_rating', ascending=False).head(10)

Unnamed: 0,movieId,title,genres,num_ratings,mean_ratings,damped_mean_rating
277,318,"Shawshank Redemption, The (1994)",Crime|Drama,317.0,4.429022,4.400659
659,858,"Godfather, The (1972)",Crime|Drama,192.0,4.289062,4.250077
2226,2959,Fight Club (1999),Action|Crime|Drama|Thriller,218.0,4.272936,4.239103
922,1221,"Godfather: Part II, The (1974)",Crime|Drama,129.0,4.25969,4.205148
46,50,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,204.0,4.237745,4.203344
224,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,251.0,4.231076,4.203125
602,750,Dr. Strangelove or: How I Learned to Stop Worr...,Comedy|War,97.0,4.268041,4.196407
914,1213,Goodfellas (1990),Crime|Drama,126.0,4.25,4.194967
461,527,Schindler's List (1993),Drama|War,220.0,4.225,4.193546
6710,58559,"Dark Knight, The (2008)",Action|Crime|Drama|IMAX,149.0,4.238255,4.191922
