In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("data/demographic.csv")
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


### Determine the Filter

In [3]:
genre = ['Animation']
duration = (60, 150)
year = (2000, 2019)
topk = 20

In [13]:
# df = df[df.release_year.between(year[0], year[1]) &
#         df.runtime.between(duration[0], duration[1]) &
#         df[genre].all(axis=1)
#        ]
# df.head()

In [30]:
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values('vote_average', ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
3354,The Road to El Dorado,Adventure; Animation; Comedy; Family,89.0,7.0,892.0,2000
3484,Dinosaur,Animation; Family,82.0,6.2,563.0,2000
3619,Chicken Run,Animation; Comedy; Family,84.0,6.5,1190.0,2000
4168,Shrek,Adventure; Animation; Comedy; Family; Fantasy,90.0,7.3,4183.0,2001
4228,Atlantis: The Lost Empire,Animation; Family; Adventure; Science Fiction,95.0,6.7,1257.0,2001
...,...,...,...,...,...,...
41660,Ballerina,Animation; Family; Adventure; Comedy,89.0,7.1,415.0,2016
41714,The Lego Batman Movie,Action; Animation; Comedy; Family; Fantasy,104.0,7.2,1473.0,2017
41967,The Boss Baby,Animation; Comedy; Family,97.0,6.1,2336.0,2017
43011,Cars 3,Family; Comedy; Animation; Adventure,109.0,6.6,718.0,2017


### Filtering

In [6]:
def imdb_score(df, q=0.82):
    df = df.copy()
    
    m = df.vote_count.quantile(q)
    c = (df.vote_count * df.vote_average).sum() / df.vote_count.sum()
    
    df = df[df.vote_count >= m]
    df['score'] = df.apply(lambda x: (x.vote_average * x.vote_count + c*m) / (x.vote_count + m), axis=1)
    return df

In [7]:
df = imdb_score(df)
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values('vote_average', ascending=False).head(10)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
5471,Spirited Away,Fantasy; Adventure; Animation; Family,125.0,8.3,3968.0,2001
9687,Howl's Moving Castle,Fantasy; Animation; Adventure,119.0,8.2,2049.0,2004
24971,Song of the Sea,Family; Animation; Fantasy,93.0,8.1,420.0,2014
20729,Wolf Children,Animation; Drama; Family; Fantasy,117.0,8.0,483.0,2012
20187,"Batman: The Dark Knight Returns, Part 2",Action; Animation,78.0,7.9,426.0,2013
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
12693,WALL·E,Animation; Family,98.0,7.8,6439.0,2008
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
14294,Mary and Max,Animation; Comedy; Drama,92.0,7.8,596.0,2009


In [8]:
class RecommenderSystem:
    def __init__(self,data):
        self.df = pd.read_csv(data)
        
    def recommend(self, genre=None, duration=None, year=None, topk = 10):
        df = self.df.copy()
        df = self.demographFilter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)
        
        result = df.loc[:, "title":"release_year"]
        result = result.sort_values('vote_average', ascending=False).head(10)
        return result
        
    @staticmethod # fungsi yg tdk ada hubunganya dengan class kita, karena tdk ada self tetapi ingin berada dlm class
    def demographFilter(df, genre=None, duration=None, year=None): # fungsi yg tdk ada hubunganya dengan class kita karena tdk ada self
        df = df.copy()
        
        if genre is not None:
            df = df[df[genre].all(axis=1)]
            
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
            
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]
            
        return df
        
    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()
    
        m = df.vote_count.quantile(q)
        c = (df.vote_count * df.vote_average).sum() / df.vote_count.sum()
    
        df = df[df.vote_count >= m]
        df['score'] = df.apply(lambda x: (x.vote_average * x.vote_count + c*m) / (x.vote_count + m), axis=1)
        return df

In [9]:
recSys = RecommenderSystem(data='data/demographic.csv')

In [10]:
recSys.recommend(genre = ['Animation'], duration = (60, 150), year = (2000, 2019))

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
40018,Your Name.,Romance; Animation; Drama,106.0,8.5,1030.0,2016
5471,Spirited Away,Fantasy; Adventure; Animation; Family,125.0,8.3,3968.0,2001
9687,Howl's Moving Castle,Fantasy; Animation; Adventure,119.0,8.2,2049.0,2004
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
12693,WALL·E,Animation; Family,98.0,7.8,6439.0,2008
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
15328,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010
23489,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,7.6,3163.0,2014
