In [8]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data

In [9]:
links_df= pd.read_csv("..\datasets\ml-latest-small\links.csv")
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [10]:
movies_df= pd.read_csv("..\datasets\ml-latest-small\movies.csv")
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [11]:
ratings_df= pd.read_csv("..\datasets\ml-latest-small\\ratings.csv")
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [12]:
tags_df= pd.read_csv("..\datasets\ml-latest-small\\tags.csv")
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


## Basic Recommender(0) -- BestScore

In [6]:
def n_top_movies_bestscore(n_top=movies_df.shape[0], names_df=movies_df, ratings_df=ratings_df):
    
    rating_info=(
        ratings_df
        .groupby("movieId")
        .agg(best_score =("rating","mean"))
        .reset_index()
    )
    
    rating_info = rating_info.merge(names_df,how="left")
    if n_top < movies_df.shape[0] :
        rating_info =rating_info.nlargest(n_top,"best_score")
    return round(rating_info[["title","genres","best_score"]].reset_index().drop(columns="index"),3)
    
n_top_movies_bestscore()

Unnamed: 0,title,genres,best_score
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.921
1,Jumanji (1995),Adventure|Children|Fantasy,3.432
2,Grumpier Old Men (1995),Comedy|Romance,3.260
3,Waiting to Exhale (1995),Comedy|Drama|Romance,2.357
4,Father of the Bride Part II (1995),Comedy,3.071
...,...,...,...
9719,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,4.000
9720,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.500
9721,Flint (2017),Drama,3.500
9722,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.500


## Basic Recommender(0.5) -- Popular

In [28]:
def n_top_movies_popular(n_top=movies_df.shape[0], names_df=movies_df, ratings_df=ratings_df):
    
    rating_info=(
        ratings_df
        .groupby("movieId")
        .agg(popularity =("rating","count"))
        #.reset_index()
    )
    
    rating_info = rating_info.merge(names_df,how="left",on="movieId")
    if n_top < movies_df.shape[0] :
        rating_info =rating_info.nlargest(n_top,"popularity")
    return round(rating_info[["title","genres","popularity"]].nlargest(n_top,"popularity"),3)
    
n_top_movies_popular()

Unnamed: 0,title,genres,popularity
314,Forrest Gump (1994),Comedy|Drama|Romance|War,329
277,"Shawshank Redemption, The (1994)",Crime|Drama,317
257,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,307
510,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,279
1938,"Matrix, The (1999)",Action|Sci-Fi|Thriller,278
...,...,...,...
3053,Cop (1988),Thriller,1
3049,Born in East L.A. (1987),Comedy,1
6687,City of Men (Cidade dos Homens) (2007),Drama,1
3045,Best Seller (1987),Thriller,1


In [24]:
all_genres_l = movies_df.genres.str.split("|").sum()
unique_genres_l=[]
for x in all_genres_l:
        # check if exists in unique_list or not
        if x not in unique_genres_l:
            unique_genres_l.append(x)
unique_genres_l.remove('(no genres listed)')
unique_genres_l.insert(0,"Surprise Me!")

In [25]:
str(tuple(unique_genres_l))

"('Surprise Me!', 'Adventure', 'Animation', 'Children', 'Comedy', 'Fantasy', 'Romance', 'Drama', 'Action', 'Crime', 'Thriller', 'Horror', 'Mystery', 'Sci-Fi', 'War', 'Musical', 'Documentary', 'IMAX', 'Western', 'Film-Noir')"

## Basic Recommender(1) -- weighted score

In [8]:
def n_top_movies_weighted(n_top=movies_df.shape[0], names_df=movies_df, ratings_df=ratings_df, mode="any"):
    
    rating_info=(
        ratings_df
        .groupby("movieId")
        .agg(rate_mean =("rating","mean"),rate_count=("rating","count"))
        .reset_index()
    )
    rating_info["weighted_score"]= (rating_info.rate_count/rating_info.rate_count.sum()) *100* rating_info.rate_mean
    
    rating_info = rating_info.merge(names_df,how="left")
    if mode != "any":
        rating_info.query("genres == @mode")
    if n_top < movies_df.shape[0] :
        rating_info = rating_info.nlargest(n_top,"weighted_score")
    return round(rating_info[["title","genres","weighted_score","rate_mean","rate_count"]].reset_index().drop(columns="index"),3)
    
n_top_movies_weighted()

Unnamed: 0,title,genres,weighted_score,rate_mean,rate_count
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.836,3.921,215
1,Jumanji (1995),Adventure|Children|Fantasy,0.374,3.432,110
2,Grumpier Old Men (1995),Comedy|Romance,0.168,3.260,52
3,Waiting to Exhale (1995),Comedy|Drama|Romance,0.016,2.357,7
4,Father of the Bride Part II (1995),Comedy,0.149,3.071,49
...,...,...,...,...,...
9719,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,0.004,4.000,1
9720,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,0.003,3.500,1
9721,Flint (2017),Drama,0.003,3.500,1
9722,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,0.003,3.500,1


## Basic Recommender(2) --> Bayesian average

In [13]:
def n_top_movies_bayesian(n_top=movies_df.shape[0], names_df=movies_df, ratings_df=ratings_df):
    m = ratings_df.rating.sum()/ratings_df.rating.count()
    c = (
        ratings_df
        .groupby("movieId")
        .agg(rate_count=("rating","count"))
        .reset_index()
    ).rate_count.quantile(0.95)
    rating_info=(
        ratings_df
        .groupby("movieId")
        .agg(rate_mean =("rating","mean"),rate_count=("rating","count"))
        .reset_index()
    )
    rating_info["rate_bayes"]= ((rating_info.rate_mean * rating_info.rate_count) + (c * m)) / (rating_info.rate_count + c)
    
    rating_info = rating_info.merge(names_df,how="left")
    if n_top < movies_df.shape[0] :
        rating_info =rating_info.nlargest(n_top,"rate_bayes")
    return round(rating_info[["title","genres","rate_bayes","rate_mean","rate_count"]].reset_index().drop(columns="index"),3)
    
n_top_movies_bayesian()

Unnamed: 0,title,genres,rate_bayes,rate_mean,rate_count
0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.915,3.921,215
1,Jumanji (1995),Adventure|Children|Fantasy,3.434,3.432,110
2,Grumpier Old Men (1995),Comedy|Romance,3.273,3.260,52
3,Waiting to Exhale (1995),Comedy|Drama|Romance,2.700,2.357,7
4,Father of the Bride Part II (1995),Comedy,3.096,3.071,49
...,...,...,...,...,...
9719,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,3.626,4.000,1
9720,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,3.501,3.500,1
9721,Flint (2017),Drama,3.501,3.500,1
9722,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,3.501,3.500,1


## Compare all Basic

In [10]:
n_top_movies_bestscore().best_score

0       3.921
1       3.432
2       3.260
3       2.357
4       3.071
        ...  
9719    4.000
9720    3.500
9721    3.500
9722    3.500
9723    4.000
Name: best_score, Length: 9724, dtype: float64

In [11]:
scores_df = movies_df[["title","genres"]]
scores_df["best_score"] = n_top_movies_bestscore().best_score
scores_df["popularity"] = n_top_movies_popular().popularity
scores_df["weighted_score"] = n_top_movies_weighted().weighted_score
scores_df["rate_bayes"] = n_top_movies_bayesian().rate_bayes

In [12]:
(
    scores_df
    .query(f"best_score.isnull() != True")
    .nlargest(20,"weighted_score")
)


Unnamed: 0,title,genres,best_score,popularity,weighted_score,rate_bayes
277,"Shawshank Redemption, The (1994)",Crime|Drama,4.429,317.0,1.392,4.403
314,Forrest Gump (1994),Comedy|Drama|Romance|War,4.164,329.0,1.359,4.146
257,Pulp Fiction (1994),Comedy|Crime|Drama|Thriller,4.197,307.0,1.278,4.177
1938,"Walk on the Moon, A (1999)",Drama|Romance,4.192,278.0,1.156,4.171
510,"Silence of the Lambs, The (1991)",Crime|Horror|Thriller,4.161,279.0,1.151,4.141
224,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi,4.231,251.0,1.053,4.206
97,Braveheart (1995),Action|Drama|War,4.032,237.0,0.948,4.012
2224,Home Alone 2: Lost in New York (1992),Children|Comedy,4.273,218.0,0.924,4.242
461,Schindler's List (1993),Drama|War,4.225,220.0,0.922,4.197
418,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller,3.75,238.0,0.885,3.741
