In [1]:
import pandas as pd

# Three Most Common Recommended System


- **demographic filtering**:
    - tidak personalized / secara general -> **Top 50 Movies Of The Year**
    - filter berdasarkan fitur demographic seperti genre,durasi,dll.
    - sistem rekomendasi yang simpel
    - Ide : Direkomendasikan berdasarkan apa yang disukai orang secara umum
- **content-based filtering**:
    - rekomendasi barang yang serupa -> **Other movies you like**
    - filter menggunakan fitur yang lebih spesifik
    - Ide : Jika seseorang menonton film X, maka dia akan direkomendasikan film yang serupa dengan X
- **collaborative filtering**
    - Mencocokan orang dengan preferensi serupa -> **Other people also watched**
    - tidak memerlukan filter data apapun, hanya memerlukan kemiripan dengan orang lain

# Simple Demographic Filtering: Filter -> Scoring -> Sort

In [43]:
df = pd.read_csv("data/demographic.csv")
df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Toy Story,Animation; Comedy; Family,81.0,7.7,5415.0,1995,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,Jumanji,Adventure; Fantasy; Family,104.0,6.9,2413.0,1995,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Grumpier Old Men,Romance; Comedy,101.0,6.5,92.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,Waiting to Exhale,Comedy; Drama; Romance,127.0,6.1,34.0,1995,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
4,Father of the Bride Part II,Comedy,106.0,5.7,173.0,1995,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# df.sort_values("vote_average", ascending=False)

# Step 1: Filter

In [44]:
genre = ["Animation"]
duration = (60, 150)
year = (2007, 2019)
topk = 20

In [45]:
df = df[df.release_year.between(year[0], year[1]) &
        df.runtime.between(duration[0], duration[1]) &
        df[genre].all(axis=1)]

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
11556,Ratatouille,Animation; Comedy; Family; Fantasy,111.0,7.5,4510.0,2007,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
11683,TMNT,Adventure; Animation; Comedy; Family,90.0,6.0,349.0,2007,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
11716,Meet the Robinsons,Animation; Comedy; Family,95.0,6.7,787.0,2007,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
11737,Aqua Teen Hunger Force Colon Movie Film for Th...,Animation; Comedy,86.0,6.5,35.0,2007,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
11814,Shrek the Third,Fantasy; Adventure; Animation; Comedy; Family,93.0,6.0,2355.0,2007,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


# Step 2: Scoring

Dikasus ini tinggal pakai vote average sebagai score

# Step 3: Sort

In [34]:
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values("vote_average", ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015


# Evaluate Demographic Filtering dengan menggunakan IMDB Scoring

# IMDB Weighted Rating

Film dengan 100 orang yang rate 9.5 seakan-akan lebih baik dari pada 10000 orang yang rate 9.0.<br>
Harusnya dibobot dengan jumlah orang yang vote juga.

\begin{equation}
WR = \frac{Rv+Cm}{v+m}
\end{equation}

- $v$: Jumlah voting film tersebut -> (vote_count)
- $m$: Syarat minimum vote -> kita akan pakai quantile
- $R$: rata-rata rating film itu -> (vote_average)
- $C$: rata-rata rating semua film -> bisa dihitung

In [35]:
def imdb_score(df, q=0.9):
    df = df.copy()
    
    m = df.vote_count.quantile(q)
    C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() 
    
    df = df[df.vote_count >= m]
    df["score"] = df.apply(lambda x : ((x.vote_average * x.vote_count) + (C * m)) / (x.vote_count + m), axis=1)
    return df

In [46]:
df = imdb_score(df)

In [48]:
recommendation = df.loc[:, "title":"release_year"]
recommendation = recommendation.sort_values("vote_average", ascending=False).head(topk)
recommendation

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
12693,WALL·E,Animation; Family,98.0,7.8,6439.0,2008
24383,Big Hero 6,Adventure; Family; Animation; Action; Comedy,102.0,7.8,6289.0,2014
13710,Up,Animation; Comedy; Family; Adventure,96.0,7.8,7048.0,2009
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
23489,How to Train Your Dragon 2,Fantasy; Action; Adventure; Animation; Comedy;...,102.0,7.6,3163.0,2014
15328,Toy Story 3,Animation; Family; Comedy,103.0,7.6,4710.0,2010
11556,Ratatouille,Animation; Comedy; Family; Fantasy,111.0,7.5,4510.0,2007
22656,The Lego Movie,Adventure; Animation; Comedy; Family; Fantasy,100.0,7.5,3127.0,2014
14965,How to Train Your Dragon,Fantasy; Adventure; Animation; Family,98.0,7.5,4319.0,2010


# ML Engineering : Wrap them all up

In [54]:
class RecommenderSystem:
    def __init__(self, data):
        self.df = pd.read_csv(data)
    
    def recommend(self, genre=None, duration=None, year=None, topk=10):
        df = self.df.copy()
        df = self.demographic_filter(df, genre=genre, duration=duration, year=year)
        df = self.compute_imdb_score(df)
        
        result = df.loc[:, "title":"release_year"]
        result = result.sort_values("vote_average", ascending=False)
        result = result.head(topk)
        return result
    
    @staticmethod
    def demographic_filter(df,  genre=None, duration=None, year=None):
        df = df.copy()
        
        if genre is not None:
            df = df[df[genre].all(axis=1)]
        
        if duration is not None:
            df = df[df.runtime.between(duration[0], duration[1])]
        
        if year is not None:
            df = df[df.release_year.between(year[0], year[1])]         
        return df
    
    @staticmethod
    def compute_imdb_score(df, q=0.9):
        df = df.copy()
    
        m = df.vote_count.quantile(q)
        C = (df.vote_average * df.vote_count).sum() / df.vote_count.sum() 
    
        df = df[df.vote_count >= m]
        df["score"] = df.apply(lambda x : ((x.vote_average * x.vote_count) + (C * m)) / (x.vote_count + m), axis=1)
        return df

In [58]:
recsys = RecommenderSystem(data="data/demographic.csv")

In [60]:
recsys.recommend(genre=["Animation", "Family"], duration=(60, 150), year=(2015, 2020))

Unnamed: 0,title,genres,runtime,vote_average,vote_count,release_year
30208,Inside Out,Drama; Comedy; Animation; Family,94.0,7.9,6737.0,2015
36082,Zootopia,Animation; Adventure; Family; Comedy,108.0,7.7,4961.0,2016
41203,Moana,Adventure; Animation; Family,107.0,7.3,3471.0,2016
37969,Finding Dory,Adventure; Animation; Comedy; Family,97.0,6.8,4333.0,2016
41433,Sing,Animation; Comedy; Drama; Family; Music,108.0,6.8,2363.0,2016
30588,Minions,Family; Animation; Adventure; Comedy,91.0,6.4,4729.0,2015
30388,The Secret Life of Pets,Animation; Family,87.0,5.9,3536.0,2016
