In [1]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
from mlxtend.preprocessing import TransactionEncoder
import a_funciones as fn

In [24]:
# Open the project's SQLite database and create a cursor for ad-hoc queries.
conn = sql.connect('data/db_movies')
cur = conn.cursor()

## 1. Sistema de recomendación basado en popularidad 


In [25]:
# List every table stored in the database.
# Cursor.execute returns the cursor itself, so the fetch can be chained.
cur.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()

[('ratings',),
 ('movies',),
 ('usuarios_sel',),
 ('movies_sel',),
 ('ratings_final',),
 ('movie_final',),
 ('full_ratings',)]

In [26]:
# Load the whole `full_ratings` table into a DataFrame and preview the first rows.
ratings_query = 'select * from full_ratings'
full_ratings = pd.read_sql_query(ratings_query, con=conn)
full_ratings.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,6,4.0,Heat (1995),Action|Crime|Thriller
2,1,47,5.0,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
3,1,50,5.0,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
4,1,110,4.0,Braveheart (1995),Action|Drama|War


In [27]:
# Inspect column dtypes and non-null counts of the loaded ratings.
full_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13168 entries, 0 to 13167
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   13168 non-null  int64  
 1   movieId  13168 non-null  int64  
 2   rating   13168 non-null  float64
 3   title    13168 non-null  object 
 4   genres   13168 non-null  object 
dtypes: float64(1), int64(2), object(2)
memory usage: 514.5+ KB


In [28]:
# Move the "(YYYY)" release year out of `title` into its own integer column.
# The guard makes the cell safe to re-run: once the year has been stripped from
# `title`, extracting it again would yield NaN and `astype(int)` would raise.
if 'year' not in full_ratings.columns:
    # expand=False returns a Series (one capture group) instead of a one-column DataFrame.
    full_ratings['year'] = full_ratings['title'].str.extract(r'\((\d{4})\)', expand=False).astype(int)
    full_ratings['title'] = full_ratings['title'].str.replace(r'\(\d{4}\)', '', regex=True).str.strip()
full_ratings.head()

Unnamed: 0,userId,movieId,rating,title,genres,year
0,1,1,4.0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,1,6,4.0,Heat,Action|Crime|Thriller,1995
2,1,47,5.0,Seven (a.k.a. Se7en),Mystery|Thriller,1995
3,1,50,5.0,"Usual Suspects, The",Crime|Mystery|Thriller,1995
4,1,110,4.0,Braveheart,Action|Drama|War,1995


In [29]:
full_ratings.info() # check that `year` is now stored as a numeric dtype

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13168 entries, 0 to 13167
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   userId   13168 non-null  int64  
 1   movieId  13168 non-null  int64  
 2   rating   13168 non-null  float64
 3   title    13168 non-null  object 
 4   genres   13168 non-null  object 
 5   year     13168 non-null  int32  
dtypes: float64(1), int32(1), int64(2), object(2)
memory usage: 565.9+ KB


In [75]:
# Switch to a throw-away in-memory SQLite database holding the transformed ratings.
# Close the previous connection first so that rebinding `conn` does not leak it.
conn.close()
conn = sql.connect(':memory:')
full_ratings.to_sql('full_ratings', conn, index=False, if_exists='replace')

13168

### a. Las películas más vistas con calificación promedio mayor o igual a 4 (mejor calificadas)

In [76]:
# Most-watched movies among those whose average rating is at least 4.
# Rows are grouped by movieId as well as title: the release year was stripped
# from `title` earlier, so distinct films sharing a name would otherwise be
# merged into a single row. Ratings equal to 0 are ignored in the average.
pd.read_sql("""select title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(*) as seen_num
            from full_ratings
            group by movieId, title
            having avg_rat >= 4
            order by seen_num desc
            limit 10
            """, conn)



Unnamed: 0,title,avg_rat,seen_num
0,Forrest Gump,4.101093,183
1,"Matrix, The",4.115385,169
2,Pulp Fiction,4.2,165
3,Star Wars: Episode IV - A New Hope,4.234568,162
4,"Shawshank Redemption, The",4.420382,157
5,Star Wars: Episode V - The Empire Strikes Back,4.248344,151
6,"Silence of the Lambs, The",4.183333,150
7,Star Wars: Episode VI - Return of the Jedi,4.088652,141
8,Fight Club,4.184783,138
9,Raiders of the Lost Ark (Indiana Jones and the...,4.203008,133


### b. Películas más vistas y mejor calificadas que sean para toda la familia

In [77]:
# One-hot encode the pipe-separated `genres` column of full_ratings.
genres = full_ratings['genres'].str.split('|')  # each row becomes a list of genre names
te = TransactionEncoder()
genres_encoded = te.fit_transform(genres)
# Reuse full_ratings' index so the column-wise concat below aligns row by row
# even if full_ratings does not carry a default 0..n-1 index.
genres_full_ratings = pd.DataFrame(genres_encoded, columns=te.columns_, index=full_ratings.index)
# Append the dummy columns and drop the raw `genres` string (no in-place mutation).
full_ratings_dum = pd.concat([full_ratings, genres_full_ratings], axis=1).drop(columns=['genres'])

# Fresh in-memory SQLite database for the encoded table. Close the previous
# connection first so that rebinding `conn` does not leak it.
conn.close()
conn = sql.connect(':memory:')
full_ratings_dum.to_sql('full_ratings_dum', conn, index=False, if_exists='replace')

13168

In [78]:
# Most-watched family movies (Children genre flag set) with an average rating of at least 3.8.
# Rows are grouped by movieId as well as title: the release year was stripped
# from `title` earlier, so distinct films sharing a name would otherwise be
# merged into a single row. Ratings equal to 0 are ignored in the average.
pd.read_sql("""select title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(*) as seen_num
            from full_ratings_dum
            where Children = 1 
            group by movieId, title
            having avg_rat >= 3.8
            order by seen_num desc
            limit 10
            """, conn)

Unnamed: 0,title,avg_rat,seen_num
0,Toy Story,3.847328,131
1,Shrek,3.822581,124
2,"Lion King, The",3.914414,111
3,Aladdin,3.825688,109
4,Finding Nemo,3.917476,103
5,"Monsters, Inc.",3.845361,97
6,Willy Wonka & the Chocolate Factory,3.947674,86
7,WALL·E,4.086957,69
8,Up,3.949275,69


### c. Top de películas mejor calificadas por año de estreno

In [79]:
# Rebuild an in-memory database that contains `full_ratings` for the next query.
# Close the previous connection first so that rebinding `conn` does not leak it.
conn.close()
conn = sql.connect(':memory:')
full_ratings.to_sql('full_ratings', conn, index=False, if_exists='replace')

13168

In [80]:

# Per (year, title): average rating, number of non-zero ratings and number of views,
# sorted by release year (newest first) and then by average rating.
# NOTE(review): `limit 10` truncates the whole result, not each year — confirm
# whether a per-year top-N was intended.
top_by_year_sql = """select year, title, 
            avg(iif(rating = 0, Null, rating)) as avg_rat,
            count(iif(rating = 0, Null, rating)) as rat_numb,
            count(*) as seen_num
            from full_ratings
            group by  year, title
            order by year desc, avg_rat desc limit 10
            """
pd.read_sql(top_by_year_sql, conn)

Unnamed: 0,year,title,avg_rat,rat_numb,seen_num
0,2010,Inception,4.042169,83,83
1,2009,Up,3.949275,69,69
2,2008,"Dark Knight, The",4.151042,96,96
3,2008,WALL·E,4.086957,69,69
4,2006,"Departed, The",4.223684,76,76
5,2005,Batman Begins,3.853261,92,92
6,2004,Eternal Sunshine of the Spotless Mind,4.152941,85,85
7,2004,Kill Bill: Vol. 2,3.851648,91,91
8,2004,"Incredibles, The",3.739796,98,98
9,2003,"Lord of the Rings: The Return of the King, The",4.129167,120,120


## 2.1 Sistema de recomendación basado en contenido (KNN) para un solo producto seleccionado

In [81]:
# Reconnect to the on-disk project database for the content-based section.
# Close the in-memory connection first so that rebinding `conn` does not leak it.
conn.close()
conn = sql.connect('data/db_movies')
cur = conn.cursor()

In [82]:
# Load the movie catalogue, storing movieId with object dtype, then inspect it.
df = pd.read_sql('select * from movie_final', conn).astype({'movieId': 'object'})
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 134 entries, 0 to 133
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  134 non-null    object
 1   title    134 non-null    object
 2   genres   134 non-null    object
dtypes: object(3)
memory usage: 3.3+ KB


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,6,Heat (1995),Action|Crime|Thriller
3,10,GoldenEye (1995),Action|Adventure|Thriller
4,32,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller


In [83]:
# One-hot encode the pipe-separated genres of each movie.
genres = df['genres'].str.split('|')
te = TransactionEncoder()
genres_encoded = te.fit_transform(genres)
genres_df = pd.DataFrame(genres_encoded, columns=te.columns_)

# Keep movieId plus the genre dummies; the raw `genres` and `title` columns are dropped.
df_dum = pd.concat([df.drop(columns=['genres', 'title']), genres_df], axis=1)

# Preview the encoded frame.
df_dum.head()

Unnamed: 0,movieId,Action,Adventure,Animation,Children,Comedy,Crime,Drama,Fantasy,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,False,True,True,True,True,False,False,True,False,False,False,False,False,False,False,False,False
1,2,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False
2,6,True,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False
3,10,True,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False
4,32,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,False,False


In [84]:
# Fit a cosine-distance nearest-neighbour model on the genre dummies only.
# movieId is an arbitrary identifier: leaving it in the feature matrix lets its
# magnitude dominate the cosine distance, so it is excluded here.
genre_features = df_dum.drop(columns=['movieId'])
model = neighbors.NearestNeighbors(n_neighbors=6, metric='cosine')
model.fit(genre_features)
# For every movie: distances to, and row positions of, its 6 closest movies.
# A movie is normally among its own neighbours, at distance 0.
dist, idlist = model.kneighbors(genre_features)

In [85]:
# Tabular views of the KNN output: one row per movie.
distancias = pd.DataFrame(data=dist)   # distances to the nearest neighbours of each movie
id_list = pd.DataFrame(data=idlist)    # row positions of the movies those distances refer to

In [86]:
def MovieRecommender(movie_name = list(df['title'].value_counts().index)):
    """Return the titles of the movies most similar to `movie_name`.

    The default value is the list of all titles so that `interact` renders the
    parameter as a dropdown; at call time `movie_name` is a single title string.

    The selected movie itself is left out of the result, so one title fewer
    than the number of neighbours stored in `idlist` is returned.

    Raises:
        IndexError: if no row of `df` has the given title.
    """
    movie_id = df[df['title'] == movie_name].index[0]
    neighbor_ids = idlist[movie_id]
    # Skip the query movie: recommending the film the user just picked is useless.
    recommended = [df.loc[newid, 'title'] for newid in neighbor_ids if newid != movie_id]
    # If ties pushed the query movie out of its own neighbour list, nothing was
    # filtered above; trim so the result length stays the same either way.
    return recommended[:len(neighbor_ids) - 1]

# Render the dropdown widget. The trailing semicolon suppresses the function
# repr that wrapping the call in print() used to emit.
interact(MovieRecommender);

interactive(children=(Dropdown(description='movie_name', options=('Toy Story (1995)', 'Truman Show, The (1998)…

<function MovieRecommender at 0x00000245C40C6480>
