In [94]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
from mlxtend.preprocessing import TransactionEncoder

In [95]:
# Connect to the movies SQLite database and open a cursor on that connection
conn = sql.connect('data/db_movies')
cur = conn.cursor()

## 1. Sistema de recomendación basado en popularidad


In [96]:
# List the names of all tables stored in the database
cur.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

[('ratings',),
 ('movies',),
 ('usuarios_sel',),
 ('movies_sel',),
 ('ratings_final',),
 ('movie_final',),
 ('full_ratings',)]

In [97]:
# Load the complete ratings table into a DataFrame
full_ratings = pd.read_sql_query('select * from full_ratings', conn)
full_ratings.info()  # prints dtypes and non-null counts
full_ratings.head()  # kept as the last expression so the preview is actually rendered

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13168 entries, 0 to 13167
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   13168 non-null  int64  
 1   movieId  13168 non-null  int64  
 2   rating   13168 non-null  float64
 3   title    13168 non-null  object 
 4   genres   13168 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 514.5+ KB


In [71]:
#conn.close()

### a. Películas mejor calificadas que tengan más de 20 calificaciones

In [98]:
# Split the release year out of the title,
# e.g. "Toy Story (1995)" -> title "Toy Story", year 1995.
# Guarded so that re-running the cell is a no-op: once the "(YYYY)" group has
# been stripped from the titles, a second extraction would find no year at all.
if 'year' not in full_ratings.columns:
    years = pd.to_numeric(
        full_ratings['title'].str.extract(r'\((\d{4})\)', expand=False),
        errors='coerce',
    )
    # Titles without a "(YYYY)" group produce NaN. Use the nullable Int64 dtype
    # in that case instead of crashing inside astype(int).
    full_ratings['year'] = years.astype(int if years.notna().all() else 'Int64')
    full_ratings['title'] = (
        full_ratings['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
    )
full_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,6,4.0,Heat,Action|Crime|Thriller,1995
2,1,47,5.0,Seven (a.k.a. Se7en),Mystery|Thriller,1995
3,1,50,5.0,"Usual Suspects, The",Crime|Mystery|Thriller,1995
4,1,110,4.0,Braveheart,Action|Drama|War,1995


In [99]:
full_ratings.info() # check that the new 'year' column has a numeric dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13168 entries, 0 to 13167
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   13168 non-null  int64  
 1   movieId  13168 non-null  int64  
 2   rating   13168 non-null  float64
 3   title    13168 non-null  object 
 4   genres   13168 non-null  object 
 5   year     13168 non-null  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 617.4+ KB


In [100]:
# Popularity-based recommendations, computed with SQL queries.
#
# Query 1: the 10 titles with the highest average rating, keeping only titles
# with more than 20 ratings so the average is more reliable. Rows whose rating
# is 0 are excluded before aggregating.
pd.read_sql(
    sql="""select title, 
            avg(rating) as avg_rat,
            count(*) as seen_num
            from full_ratings
            where rating<>0
            group by title
            having seen_num > 20
            order by avg_rat desc
            limit 10
            """,
    con=conn,
)

Unnamed: 0,title,avg_rat,seen_num
0,"Shawshank Redemption, The (1994)",4.420382,157
1,"Godfather, The (1972)",4.318548,124
2,Star Wars: Episode V - The Empire Strikes Back...,4.248344,151
3,Schindler's List (1993),4.235294,119
4,Star Wars: Episode IV - A New Hope (1977),4.234568,162
5,"Departed, The (2006)",4.223684,76
6,Monty Python and the Holy Grail (1975),4.222222,108
7,"Princess Bride, The (1987)",4.214286,105
8,Apocalypse Now (1979),4.210227,88
9,Raiders of the Lost Ark (Indiana Jones and the...,4.203008,133


### b. Las películas más vistas, pero con calificación promedio mayor o igual a 4

In [102]:
# The 10 most-seen titles among those whose average rating is at least 4.
# Ratings equal to 0 are turned into NULL so they do not count towards the average.
pd.read_sql(
    sql="""select title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(*) as seen_num
            from full_ratings
            group by title
            having avg_rat >= 4
            order by seen_num desc
            limit 10
            """,
    con=conn,
)

Unnamed: 0,title,avg_rat,seen_num
0,Forrest Gump (1994),4.101093,183
1,"Matrix, The (1999)",4.115385,169
2,Pulp Fiction (1994),4.2,165
3,Star Wars: Episode IV - A New Hope (1977),4.234568,162
4,"Shawshank Redemption, The (1994)",4.420382,157
5,Star Wars: Episode V - The Empire Strikes Back...,4.248344,151
6,"Silence of the Lambs, The (1991)",4.183333,150
7,Star Wars: Episode VI - Return of the Jedi (1983),4.088652,141
8,Fight Club (1999),4.184783,138
9,Raiders of the Lost Ark (Indiana Jones and the...,4.203008,133


### c. Películas más calificadas con puntuación de mínimo 4 y que traten sobre Comedia y Romance 

In [103]:
# One-hot encode the genres of full_ratings so they can be filtered on in SQL.
genres = full_ratings['genres'].str.split('|')  # "A|B|C" -> ['A', 'B', 'C']
te = TransactionEncoder()  # turns the genre lists into one boolean column per genre
genres_encoded = te.fit_transform(genres)
genres_full_ratings = pd.DataFrame(
    genres_encoded, columns=te.columns_, index=full_ratings.index
)

# Append the dummy columns and drop the raw 'genres' string (no in-place mutation).
full_ratings_dum = pd.concat([full_ratings, genres_full_ratings], axis=1).drop(columns=['genres'])

# Store the result in an in-memory SQLite database used by the queries that follow.
# The module is imported as `sql`; the bare name `sqlite3` is not defined in this
# notebook and raised NameError on a fresh kernel.
# NOTE: this rebinds `conn`, which from here on no longer points at 'data/db_movies'.
conn = sql.connect(':memory:')
full_ratings_dum.to_sql('full_ratings_dum', conn, index=False, if_exists='replace')

13168

In [104]:
# The most-seen titles tagged as both Comedy and Romance whose average rating
# is at least 4 (ratings equal to 0 are ignored in the average). At most 10 rows.
pd.read_sql(
    sql="""select title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(*) as seen_num
            from full_ratings_dum
            where Comedy = 1 and Romance = 1
            group by title
            having avg_rat >= 4
            order by seen_num desc
            limit 10
            """,
    con=conn,
)

Unnamed: 0,title,avg_rat,seen_num
0,Forrest Gump,4.101093,183
1,"Princess Bride, The",4.214286,105
2,"Amelie (Fabuleux destin d'Amélie Poulain, Le)",4.066265,83


### d. Películas más recientes estrenadas con mejor calificación y que traten sobre Animación

In [105]:
# Animation titles ordered by release year (newest first) and then by average
# rating; only the first 5 rows are returned. rat_numb counts the non-zero
# ratings, seen_num counts every row for the title.
pd.read_sql(
    sql="""select year, title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(iif(rating = 0, Null, rating)) as rat_numb,
            count(*) as seen_num
            from full_ratings_dum
            where Animation = 1 
            group by  year, title
            order by year desc, avg_rat desc limit 5
            """,
    con=conn,
)

Unnamed: 0,year,title,avg_rat,rat_numb,seen_num
0,2009,Up,3.949275,69,69
1,2008,WALL·E,4.086957,69,69
2,2004,"Incredibles, The",3.739796,98,98
3,2003,Finding Nemo,3.917476,103,103
4,2001,"Monsters, Inc.",3.845361,97,97


## 2.1 Sistema de recomendación basado en contenido (KNN) para un solo producto visto

In [26]:
# `conn` was rebound to an in-memory database earlier in the notebook, and that
# database has no 'movie_final' table. Reconnect to the file database so this
# cell also works when the notebook is run top to bottom.
conn = sql.connect('data/db_movies')

# Load the movie catalogue; movieId is an identifier, so keep it as object dtype.
df = pd.read_sql('select * from movie_final', conn).astype({'movieId': 'object'})
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  134 non-null    object
 1   title    134 non-null    object
 2   genres   134 non-null    object
dtypes: object(3)
memory usage: 3.3+ KB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,6,Heat (1995),Action|Crime|Thriller
3,10,GoldenEye (1995),Action|Adventure|Thriller
4,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [32]:
# One-hot encode the genres of the movie catalogue.
te = TransactionEncoder()  # converts lists of genres into boolean dummy columns
genres = df['genres'].str.split('|')  # "A|B|C" -> ['A', 'B', 'C']
genres_encoded = te.fit_transform(genres)
genres_df = pd.DataFrame(genres_encoded, columns=te.columns_)

# Join the dummies to the catalogue, then drop the original 'genres' and 'title'
# columns so only movieId and the genre flags remain.
df_dum = pd.concat([df, genres_df], axis=1).drop(columns=['genres', 'title'])

# Preview the first rows of the encoded frame
df_dum.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Drama,Fantasy,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,False,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False
1,2,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False
2,6,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
3,10,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,32,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,False,False


In [40]:
# Fit the nearest-neighbours model on the genre dummies only. movieId is an
# identifier, not a feature: leaving it in the matrix lets its magnitude
# dominate the cosine distance, so neighbours would not reflect genre similarity.
genre_features = df_dum.drop(columns=['movieId']).astype(float)
model = neighbors.NearestNeighbors(n_neighbors=6, metric='cosine')
model.fit(genre_features)
# dist: distances to the 6 nearest rows of every movie; idlist: their row positions
dist, idlist = model.kneighbors(genre_features)

In [41]:
# Tabular views of the KNN output, one row per movie
distancias = pd.DataFrame(data=dist)  # ranking of the nearest distances for each row (movie)
id_list = pd.DataFrame(data=idlist)   # which item each of those distances corresponds to

In [42]:
def MovieRecommender(movie_name = list(df['title'].value_counts().index)):
    """Return the titles of the movies closest to ``movie_name`` in the KNN model.

    Parameters
    ----------
    movie_name : str
        Title to look up in ``df['title']``. The default is the list of all
        titles, which ``interact`` turns into a dropdown widget.

    Returns
    -------
    list of str
        Titles of the nearest neighbours stored in ``idlist`` for that movie,
        closest first, without the selected movie itself. Empty when the title
        is not present in ``df``.
    """
    # `idlist` holds row positions, so look the title up by position, not by label.
    matches = np.flatnonzero((df['title'] == movie_name).to_numpy())
    if matches.size == 0:
        return []
    movie_pos = matches[0]

    neighbour_row = idlist[movie_pos]
    # The queried movie is normally its own nearest neighbour (distance 0);
    # recommending it back is useless, so drop it and keep the remaining ones.
    similar = [pos for pos in neighbour_row if pos != movie_pos][:len(neighbour_row) - 1]
    return [df['title'].iloc[pos] for pos in similar]


# interact() renders the widget itself; printing its return value only showed
# the function repr. The trailing ';' suppresses that repr in the cell output.
interact(MovieRecommender);

interactive(children=(Dropdown(description='movie_name', options=('Toy Story (1995)', 'Truman Show, The (1998)…

<function MovieRecommender at 0x000001BCAE512340>
