# Organizando dados no Pandas

In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('ratings.csv')
df.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
6,1,101,5.0,964980868
7,1,110,4.0,964982176
8,1,151,5.0,964984041
9,1,157,5.0,964984100


In [3]:
df = df.drop(columns=["timestamp"], axis=1)
df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [4]:
movie_titles = pd.read_csv('movies.csv')
movie_titles.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


Adicionando usuario teste

In [5]:
lista_ismenia = [[611,3994,3.0],[611,77846,4.0],[611,48780,5.0],[611,4995,5.0],[611,1246,4.0],
                 [611,140174,5.0],[611,356,5],[611,58047,4],[611,5791,4],[611,33145,5],[611,88405,4],
                 [611,7254,4.0],[611,2706,2.0],[611,69406, 4.0],[611,115667,4.0],[611,1721,5.0],[611,8529,5]]
df_ism = pd.DataFrame(data=lista_ismenia, columns=['userId','movieId','rating'])
df_ism.head()

Unnamed: 0,userId,movieId,rating
0,611,3994,3.0
1,611,77846,4.0
2,611,48780,5.0
3,611,4995,5.0
4,611,1246,4.0


In [6]:
df = pd.concat([df,df_ism])
df.tail(20)

Unnamed: 0,userId,movieId,rating
100833,610,168250,5.0
100834,610,168252,5.0
100835,610,170875,3.0
0,611,3994,3.0
1,611,77846,4.0
2,611,48780,5.0
3,611,4995,5.0
4,611,1246,4.0
5,611,140174,5.0
6,611,356,5.0


Adicionando as informações dos filmes

In [7]:
df = pd.merge(df, movie_titles, on='movieId')
df.sort_values('userId').tail(20)

Unnamed: 0,userId,movieId,rating,title,genres
89768,610,120635,1.0,Taken 3 (2015),Action|Crime|Thriller
31649,610,377,5.0,Speed (1994),Action|Romance|Thriller
98430,610,3973,2.5,Book of Shadows: Blair Witch 2 (2000),Crime|Horror|Mystery|Thriller
78283,611,140174,5.0,Room (2015),Drama
35202,611,3994,3.0,Unbreakable (2000),Drama|Sci-Fi
36253,611,4995,5.0,"Beautiful Mind, A (2001)",Drama|Romance
99891,611,115667,4.0,"Love, Rosie (2014)",Comedy|Romance
92286,611,77846,4.0,12 Angry Men (1997),Crime|Drama
70667,611,5791,4.0,Frida (2002),Drama|Romance
42174,611,58047,4.0,"Definitely, Maybe (2008)",Comedy|Drama|Romance


# Método SVD (Mesmo do netflix)

In [17]:
from surprise import KNNBasic #Outro metodo
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
import random
import time

def MetodoSVD(Usuario):

    SEED = 42
    random.seed(SEED)
    np.random.seed(SEED)

    reader = Reader(rating_scale=(0.5,5)) #Definindo o range das notas
    data = Dataset.load_from_df(df[['userId','movieId','rating']], reader) #Carregando o DataFrame
    trainset = data.build_full_trainset() #Treinando com todo o dataset, para poder recomendar o que está faltando

    '''
    #Parametros para o método KNNBasic

    sim_options = {'name':'pearson',
                   'user_based':False,
                   'min_support':1    
                    }
    '''

    pred_algo = SVD()
    pred_algo.fit(trainset)
    
    #Metodo para avaliar cada item(filme) da lista
    recomendados= list() #lista onde terão os mais recomendados 
    cursos_fora = list() #lista onde teram os os cursos que o usuario gostou e não entrarão no algoritimo

    for key,item in df.iterrows():
        usuario = item['userId']
        if usuario == Usuario:
            cursos_fora.append(item['movieId'])
    
    for key,item in df.iterrows():
        if item['movieId'] not in cursos_fora:
            recomendados.append(pred_algo.predict(Usuario, item['movieId']))
            cursos_fora.append(item['movieId'])
            
    tabela_recomendacoes = pd.DataFrame(recomendados)
    tabela_recomendacoes = tabela_recomendacoes.sort_values('est', ascending=False)
    
    # adicionando informações para a tabela de recomendações
    tabela_recomendacoes = tabela_recomendacoes.rename(columns={'iid': 'movieId'})
    tabela_recomendacoes = pd.merge(tabela_recomendacoes, movie_titles, on='movieId')
    
    return tabela_recomendacoes

In [18]:
tic = time.time()
recomendacoes = MetodoSVD(611)
tac = time.time()
tempo_passado = tac - tic
print("Tempo %.2f segundos" % tempo_passado)

recomendacoes.head(10)

Tempo 25.07 segundos


Unnamed: 0,uid,movieId,r_ui,est,details,title,genres
0,611,7153,,5.0,{'was_impossible': False},"Lord of the Rings: The Return of the King, The...",Action|Adventure|Drama|Fantasy
1,611,1198,,5.0,{'was_impossible': False},Raiders of the Lost Ark (Indiana Jones and the...,Action|Adventure
2,611,858,,5.0,{'was_impossible': False},"Godfather, The (1972)",Crime|Drama
3,611,1204,,4.955483,{'was_impossible': False},Lawrence of Arabia (1962),Adventure|Drama|War
4,611,1213,,4.93749,{'was_impossible': False},Goodfellas (1990),Crime|Drama
5,611,260,,4.931011,{'was_impossible': False},Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
6,611,2160,,4.921108,{'was_impossible': False},Rosemary's Baby (1968),Drama|Horror|Thriller
7,611,4993,,4.908207,{'was_impossible': False},"Lord of the Rings: The Fellowship of the Ring,...",Adventure|Fantasy
8,611,1178,,4.889072,{'was_impossible': False},Paths of Glory (1957),Drama|War
9,611,1196,,4.883568,{'was_impossible': False},Star Wars: Episode V - The Empire Strikes Back...,Action|Adventure|Sci-Fi


# Validação do modelo de recomendação

In [26]:
def ValidacaoModelo(numero_splits):
    
    
    reader = Reader(rating_scale=(0.5,5))
    data = Dataset.load_from_df(df[['userId','movieId','rating']], reader)
    pred_algo = SVD()
    
    
    trainset = data.build_full_trainset()
    kf = KFold(n_splits=numero_splits)
    acuracia = list()

    for trainset, testset in kf.split(data):

        # train and test algorithm.
        pred_algo.fit(trainset)
        predictions = pred_algo.test(testset)

        # Compute and print Root Mean Squared Error
        acuracia.append(accuracy.rmse(predictions, verbose=False))
    
    acuracia = pd.DataFrame(acuracia)
    return acuracia

In [29]:
tic = time.time()
acuracia = ValidacaoModelo(5)
tac = time.time()
tempo_passado = tac - tic
print("Tempo %.2f segundos" % tempo_passado)

print("Erro de %.2f na nota" % acuracia.mean())

Tempo 28.14 segundos
Erro de 0.87 na nota
