## Install Necessary Dependencies

In [62]:
import os
import pandas as pd
import numpy as np


## Load Data

In [63]:
#project_root_dir = "Data"
movies_data_filename = "Data/movies.dat"
ratings_data_filename = "Data/ratings.dat"
users_data_filename = "Data/users.dat"


In [64]:
#Movies
movies = pd.read_csv(movies_data_filename, sep='::', engine = 'python',
                     encoding="ISO-8859-1", header = None)
movies.columns = ['MovieID', 'Title', 'Genres']
detailed_movies = movies.copy()
multiple_idx = pd.Series([("|" in movie) for movie in movies['Genres']])
movies.loc[multiple_idx, 'Genres'] = 'Multiple'

#Ratings
ratings = pd.read_csv(ratings_data_filename, sep='::', engine = 'python', header=None)
ratings.columns = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = ratings.drop('Timestamp', axis = 1)

#Users
users = pd.read_csv(users_data_filename, sep='::', engine = 'python', header=None)
users.columns = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']

In [65]:
movies

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Multiple
1,2,Jumanji (1995),Multiple
2,3,Grumpier Old Men (1995),Multiple
3,4,Waiting to Exhale (1995),Multiple
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


## Process Data

### Normalize Ratings

In [66]:
user_avg_ratings = ratings.groupby('UserID')['Rating'].mean().reset_index()
user_avg_ratings.columns = ['UserID', 'Avg_User_Rating']
user_avg_ratings

Unnamed: 0,UserID,Avg_User_Rating
0,1,4.188679
1,2,3.713178
2,3,3.901961
3,4,4.190476
4,5,3.146465
...,...,...
6035,6036,3.302928
6036,6037,3.717822
6037,6038,3.800000
6038,6039,3.878049


In [67]:
ratings = pd.merge(ratings, user_avg_ratings, on='UserID')
ratings['Normalized_Rating'] = ratings['Rating'] / ratings['Avg_User_Rating']
ratings

Unnamed: 0,UserID,MovieID,Rating,Avg_User_Rating,Normalized_Rating
0,1,1193,5,4.188679,1.193694
1,1,661,3,4.188679,0.716216
2,1,914,3,4.188679,0.716216
3,1,3408,4,4.188679,0.954955
4,1,2355,5,4.188679,1.193694
...,...,...,...,...,...
1000204,6040,1091,1,3.577713,0.279508
1000205,6040,1094,5,3.577713,1.397541
1000206,6040,562,5,3.577713,1.397541
1000207,6040,1096,4,3.577713,1.118033


### Full Genre Capture

In [68]:
genre_dummies = detailed_movies['Genres'].str.get_dummies(sep='|')
detailed_movies = pd.concat([detailed_movies, genre_dummies], axis=1)
detailed_movies

Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,Meet the Parents (2000),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3879,3949,Requiem for a Dream (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3880,3950,Tigerland (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3881,3951,Two Family House (2000),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## System 1

### Determine Rating Score

In [69]:
movie_avg_ratings = ratings.groupby('MovieID').agg({'Rating': ['mean', 'count'], 'Normalized_Rating': 'mean'}).reset_index()
movie_avg_ratings.columns = ['MovieID', 'Avg_Rating', 'Num_Ratings', 'Avg_Normalized_Rating']
movies_with_ratings = pd.merge(detailed_movies, movie_avg_ratings, on='MovieID', how='left')
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.sort_values(by='Avg_Normalized_Rating', ascending=False)


Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg_Rating,Num_Ratings,Avg_Normalized_Rating
3313,3382,Song of Freedom (1936),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,5.0,1.0,2.594340
3254,3323,Chain of Fools (2000),Comedy|Crime,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,3.0,1.0,1.556604
553,557,Mamma Roma (1962),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.5,2.0,1.517864
574,578,"Hour of the Pig, The (1993)",Drama|Mystery,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,4.5,2.0,1.410178
748,758,"Jar, The (Khomreh) (1992)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4.0,1.0,1.392796
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
677,684,Windows (1980),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,1.0,0.300000
3140,3209,"Loves of Carmen, The (1948)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1.0,1.0,0.292453
1291,1311,Santa with Muscles (1996),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1.0,7.0,0.281847
1406,1430,Underworld (1997),Thriller,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1.0,1.0,0.268789


Obviously this favors movies that have a low number of ratings, so we now look to make a scoring function that combines the value of more rating data and the average normalized rating

We place more weight on the normalized rating, but implement a weight on the number of ratings mainly to ensure that values of 1 rating don't commonly end up at the top

In [70]:
weight_normalized_rating = 0.9 
weight_num_ratings = 1-weight_normalized_rating
max_num_ratings = 1000

movies_with_ratings['Capped_Num_Ratings'] = movies_with_ratings['Num_Ratings'].clip(upper=max_num_ratings)

movies_with_ratings['Score'] = (
    weight_normalized_rating * movies_with_ratings['Avg_Normalized_Rating'] +
    weight_num_ratings * movies_with_ratings['Capped_Num_Ratings']
)

movies_with_ratings.drop('Capped_Num_Ratings', axis=1, inplace=True)

movies_with_ratings.sort_values(by='Score', ascending=False)

Unnamed: 0,MovieID,Title,Genres,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Mystery,Romance,Sci-Fi,Thriller,War,Western,Avg_Rating,Num_Ratings,Avg_Normalized_Rating,Score
315,318,"Shawshank Redemption, The (1994)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4.554558,2227.0,1.238783,101.114905
49,50,"Usual Suspects, The (1995)",Crime|Thriller,0,0,0,0,0,1,0,...,0,0,0,1,0,0,4.517106,1783.0,1.235867,101.112281
523,527,Schindler's List (1993),Drama|War,0,0,0,0,0,0,0,...,0,0,0,0,1,0,4.510417,2304.0,1.223836,101.101452
847,858,"Godfather, The (1972)",Action|Crime|Drama,1,0,0,0,0,1,0,...,0,0,0,0,0,0,4.524966,2223.0,1.223737,101.101363
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure,1,1,0,0,0,0,0,...,0,0,0,0,0,0,4.477725,2514.0,1.217865,101.096079
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3391,3460,Hillbillys in a Haunted House (1967),Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.300189,0.370170
677,684,Windows (1980),Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.300000,0.370000
3140,3209,"Loves of Carmen, The (1948)",Drama,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1.000000,1.0,0.292453,0.363208
1406,1430,Underworld (1997),Thriller,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1.000000,1.0,0.268789,0.341910


### Create Function to Retrieve the top 10 scores from a particular genre

In [71]:
genres = movies['Genres'].unique()

def top_movies_by_genre(df, genre, metric='Score', top_n=10):
    genre_filter = df[genre] == 1
    genre_movies = df[genre_filter]
    sorted_genre_movies = genre_movies.sort_values(by=metric, ascending=False)
    top_genre_movies = sorted_genre_movies.head(top_n)[['MovieID', 'Title', metric]]
    return top_genre_movies

In [73]:
genre = 'Drama'
top_action_movies = top_movies_by_genre(movies_with_ratings, genre)
top_action_movies


Unnamed: 0,MovieID,Title,Score
315,318,"Shawshank Redemption, The (1994)",101.114905
523,527,Schindler's List (1993),101.101452
847,858,"Godfather, The (1972)",101.101363
900,912,Casablanca (1942),101.07225
1176,1193,One Flew Over the Cuckoo's Nest (1975),101.070575
911,923,Citizen Kane (1941),101.068479
589,593,"Silence of the Lambs, The (1991)",101.0666
896,908,North by Northwest (1959),101.066511
1959,2028,Saving Private Ryan (1998),101.065266
2789,2858,American Beauty (1999),101.063404
