In [11]:
import os
import re
import sys
import pandas as pd
from datetime import datetime
from google.colab import files
import matplotlib.pyplot as plt
import matplotlib
from cycler import cycler

Carregando os processos do dataset



In [12]:
%%time
_ = files.upload()

Saving ratings.parquet to ratings (1).parquet
CPU times: user 2.81 s, sys: 303 ms, total: 3.11 s
Wall time: 3min 4s


In [13]:
def convert_timestamp_to_date(timestamp:int):
    return datetime.fromtimestamp(timestamp).date()

df_ratings = pd.read_parquet('ratings.parquet')
df_ratings['date'] = df_ratings['timestamp'].apply(convert_timestamp_to_date)
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,date
1000204,6040,1091,1,956716541,2000-04-26
1000205,6040,1094,5,956704887,2000-04-25
1000206,6040,562,5,956704746,2000-04-25
1000207,6040,1096,4,956715648,2000-04-26
1000208,6040,1097,4,956715569,2000-04-26


Abaixo temos os metadados dos itens selecionados

In [14]:
%%time
_ = files.upload()

Saving movies.parquet to movies (1).parquet
CPU times: user 802 ms, sys: 96.7 ms, total: 898 ms
Wall time: 54.8 s


In [15]:
def extract_year_from_title(title:str, regex='(\d{4})'):
    match = re.search(regex, title)
    return None if match is None else match.group()

def convert_genres_to_list(genres:str, separator='|'):
    return genres.split(separator)

df_items = pd.read_parquet('movies.parquet')
df_items['genres'] = df_items['genres'].apply(convert_genres_to_list)
df_items['year'] = df_items['title'].apply(extract_year_from_title)
df_items.tail()

Unnamed: 0,item_id,title,genres,year
3878,3948,Meet the Parents (2000),[Comedy],2000
3879,3949,Requiem for a Dream (2000),[Drama],2000
3880,3950,Tigerland (2000),[Drama],2000
3881,3951,Two Family House (2000),[Drama],2000
3882,3952,"Contender, The (2000)","[Drama, Thriller]",2000


Criando o cálculo do trending

In [16]:
def extract_year_month(date):
    return '{:04d}-{:02d}'.format(date.year, date.month)

df_ratings['window'] = df_ratings['date'].apply(extract_year_month)
df_ratings.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,date,window
1000204,6040,1091,1,956716541,2000-04-26,2000-04
1000205,6040,1094,5,956704887,2000-04-25,2000-04
1000206,6040,562,5,956704746,2000-04-25,2000-04
1000207,6040,1096,4,956715648,2000-04-26,2000-04
1000208,6040,1097,4,956715569,2000-04-26,2000-04


Consumo por janela temporal. Agrupando por item e por janela

In [17]:
df_window_consumptions = (
    df_ratings
    .groupby(['item_id', 'window'])
    .agg({'user_id': 'count'})
    .reset_index()
    .rename({'user_id': 'count'}, axis=1)
    .sort_values(by=['item_id', 'window'])
)
df_window_consumptions

Unnamed: 0,item_id,window,count
0,1,2000-04,17
1,1,2000-05,165
2,1,2000-06,128
3,1,2000-07,203
4,1,2000-08,386
...,...,...,...
65635,3952,2002-09,1
65636,3952,2002-11,1
65637,3952,2002-12,3
65638,3952,2003-01,2


Shift temporal

In [18]:
df_window_consumptions.sort_values(by=['item_id', 'window'], inplace=True)

df_window_consumptions['count_previous'] = (
    df_window_consumptions
    .groupby(['item_id'])['count']
    .shift(1)
)
df_window_consumptions

Unnamed: 0,item_id,window,count,count_previous
0,1,2000-04,17,
1,1,2000-05,165,17.0
2,1,2000-06,128,165.0
3,1,2000-07,203,128.0
4,1,2000-08,386,203.0
...,...,...,...,...
65635,3952,2002-09,1,2.0
65636,3952,2002-11,1,1.0
65637,3952,2002-12,3,1.0
65638,3952,2003-01,2,3.0


Lift

In [19]:
df_window_consumptions['lift'] = (df_window_consumptions['count'] - df_window_consumptions['count_previous'])/df_window_consumptions['count_previous']
df_window_consumptions

Unnamed: 0,item_id,window,count,count_previous,lift
0,1,2000-04,17,,
1,1,2000-05,165,17.0,8.705882
2,1,2000-06,128,165.0,-0.224242
3,1,2000-07,203,128.0,0.585938
4,1,2000-08,386,203.0,0.901478
...,...,...,...,...,...
65635,3952,2002-09,1,2.0,-0.500000
65636,3952,2002-11,1,1.0,0.000000
65637,3952,2002-12,3,1.0,2.000000
65638,3952,2003-01,2,3.0,-0.333333


In [20]:
prediction_window = '2003-01'
(
    df_window_consumptions
    .query('window == @prediction_window')
    .rename({'lift': 'score'}, axis=1)
    .sort_values(by='score', ascending=False)
)

Unnamed: 0,item_id,window,count,count_previous,score
32959,2011,2003-01,7,1.0,6.00
42100,2502,2003-01,7,1.0,6.00
782,32,2003-01,6,1.0,5.00
64914,3897,2003-01,6,1.0,5.00
25525,1527,2003-01,5,1.0,4.00
...,...,...,...,...,...
64221,3847,2003-01,1,4.0,-0.75
21114,1266,2003-01,1,5.0,-0.80
18836,1179,2003-01,1,5.0,-0.80
18426,1136,2003-01,1,5.0,-0.80


Agora vamos reagrupar toda a logica construida acima para criar uma função que vai receber todo o dataframe e criar a recomendação de trending

In [21]:
def recommend_trending_n(ratings:pd.DataFrame, n:int, prediction_window:str=None, min_evaluations:int=None) -> pd.DataFrame:

    prediction_window = max(ratings['window']) if prediction_window is None else prediction_window

    ratings = ratings[['item_id', 'window', 'user_id']]
    # Calculo de janela
    df_window_consumptions = (
        ratings
        .groupby(['item_id', 'window'])['user_id']
        .count()
        .reset_index()
        .rename({'user_id': 'count'}, axis=1)
        .sort_values(by=['item_id', 'window'])
    )

    # Shift temporal
    df_window_consumptions['count_previous'] = (
        df_window_consumptions
        .groupby(['item_id'])['count']
        .shift(1)
    )

    # Calculo do lift
    df_window_consumptions['lift'] = (df_window_consumptions['count'] - df_window_consumptions['count_previous'])/df_window_consumptions['count_previous']

    # Selecao de janela
    recommendations = (
      df_window_consumptions
      .query('window == @prediction_window')
      .rename({'lift': 'score'}, axis=1)
      .sort_values(by='score', ascending=False)
    )

    if min_evaluations is not None:
        recommendations = recommendations.query('count_previous >= @min_evaluations')

    return recommendations.head(n)

df_trending = recommend_trending_n(df_ratings, n=10, prediction_window='2002-12')
df_trending.merge(df_items, on='item_id', how='inner')

Unnamed: 0,item_id,window,count,count_previous,score,title,genres,year
0,1722,2002-12,7,1.0,6.0,Tomorrow Never Dies (1997),"[Action, Romance, Thriller]",1997
1,595,2002-12,7,1.0,6.0,Beauty and the Beast (1991),"[Animation, Children's, Musical]",1991
2,3503,2002-12,6,1.0,5.0,Solaris (Solyaris) (1972),"[Drama, Sci-Fi]",1972
3,3639,2002-12,6,1.0,5.0,"Man with the Golden Gun, The (1974)",[Action],1974
4,2990,2002-12,6,1.0,5.0,Licence to Kill (1989),[Action],1989
5,1266,2002-12,5,1.0,4.0,Unforgiven (1992),[Western],1992
6,1179,2002-12,5,1.0,4.0,"Grifters, The (1990)","[Crime, Drama, Film-Noir]",1990
7,3882,2002-12,5,1.0,4.0,Bring It On (2000),[Comedy],2000
8,2966,2002-12,4,1.0,3.0,"Straight Story, The (1999)",[Drama],1999
9,2942,2002-12,7,2.0,2.5,Flashdance (1983),"[Drama, Romance]",1983


In [22]:
prediction_window = '2002-12'
min_evaluations = 2
n = 10

df_trending = recommend_trending_n(
    df_ratings,
    n=n,
    prediction_window=prediction_window,
    min_evaluations=min_evaluations
)
df_trending.merge(df_items, on='item_id', how='inner')

Unnamed: 0,item_id,window,count,count_previous,score,title,genres,year
0,2942,2002-12,7,2.0,2.5,Flashdance (1983),"[Drama, Romance]",1983
1,3791,2002-12,10,3.0,2.333333,Footloose (1984),[Drama],1984
2,2926,2002-12,6,2.0,2.0,Hairspray (1988),"[Comedy, Drama]",1988
3,3635,2002-12,6,2.0,2.0,"Spy Who Loved Me, The (1977)",[Action],1977
4,1032,2002-12,5,2.0,1.5,Alice in Wonderland (1951),"[Animation, Children's, Musical]",1951
5,1097,2002-12,7,3.0,1.333333,E.T. the Extra-Terrestrial (1982),"[Children's, Drama, Fantasy, Sci-Fi]",1982
6,2054,2002-12,6,3.0,1.0,"Honey, I Shrunk the Kids (1989)","[Adventure, Children's, Comedy, Fantasy, Sci-Fi]",1989
7,1375,2002-12,4,2.0,1.0,Star Trek III: The Search for Spock (1984),"[Action, Adventure, Sci-Fi]",1984
8,1258,2002-12,4,2.0,1.0,"Shining, The (1980)",[Horror],1980
9,1517,2002-12,4,2.0,1.0,Austin Powers: International Man of Mystery (1...,[Comedy],1997


Filtrando por Gênero

In [23]:
genre = "Children's"
item_ids = df_items[df_items['genres'].apply(lambda x: genre in x)]['item_id']
df_ratings_filtered = df_ratings[df_ratings['item_id'].isin(item_ids)]
df_ratings_filtered.tail()

Unnamed: 0,user_id,item_id,rating,timestamp,date,window
999888,6040,919,5,956704191,2000-04-25,2000-04
1000014,6040,34,4,956704584,2000-04-25,2000-04
1000153,6040,2384,4,956703954,2000-04-25,2000-04
1000191,6040,3751,4,964828782,2000-07-28,2000-07
1000208,6040,1097,4,956715569,2000-04-26,2000-04


In [24]:
df_trending = recommend_trending_n(df_ratings_filtered, n=10, prediction_window='2002-12', min_evaluations=2)
df_trending.merge(df_items, on='item_id', how='inner')

Unnamed: 0,item_id,window,count,count_previous,score,title,genres,year
0,1032,2002-12,5,2.0,1.5,Alice in Wonderland (1951),"[Animation, Children's, Musical]",1951
1,1097,2002-12,7,3.0,1.333333,E.T. the Extra-Terrestrial (1982),"[Children's, Drama, Fantasy, Sci-Fi]",1982
2,2054,2002-12,6,3.0,1.0,"Honey, I Shrunk the Kids (1989)","[Adventure, Children's, Comedy, Fantasy, Sci-Fi]",1989
3,317,2002-12,7,4.0,0.75,"Santa Clause, The (1994)","[Children's, Comedy, Fantasy]",1994
4,596,2002-12,3,2.0,0.5,Pinocchio (1940),"[Animation, Children's]",1940
5,1,2002-12,3,2.0,0.5,Toy Story (1995),"[Animation, Children's, Comedy]",1995
6,594,2002-12,2,2.0,0.0,Snow White and the Seven Dwarfs (1937),"[Animation, Children's, Musical]",1937
7,2090,2002-12,2,2.0,0.0,"Rescuers, The (1977)","[Animation, Children's]",1977
8,2761,2002-12,2,2.0,0.0,"Iron Giant, The (1999)","[Animation, Children's]",1999
9,1028,2002-12,2,2.0,0.0,Mary Poppins (1964),"[Children's, Comedy, Musical]",1964
