## Librerías

In [1]:
import numpy as np
import pandas as pd
import sqlite3 as sql
from sklearn.preprocessing import MinMaxScaler
from ipywidgets import interact ## para análisis interactivo
from sklearn import neighbors ### basado en contenido un solo producto consumido
import joblib
from mlxtend.preprocessing import TransactionEncoder
import a_funciones as fn

In [2]:
#### Conectar #######
conn=sql.connect('data/db_movies')
cur=conn.cursor()

In [3]:
#ver tablas
cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
cur.fetchall()

[('ratings',),
 ('movies',),
 ('reco',),
 ('usuarios_sel',),
 ('movies_sel',),
 ('ratings_final',),
 ('movie_final',),
 ('full_ratings',),
 ('full_ratings_dum',)]

## 1. Sistema de recomendación basado en popularidad 


In [4]:
full_ratings_dum = pd.read_sql_query('select * from full_ratings_dum', conn)
full_ratings_dum.head()

Unnamed: 0,userId,movieId,rating,title,year,year_sc,action,adventure,animation,children,...,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,1,1,4.0,Toy Story,1995,0.734177,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,Grumpier Old Men,1995,0.734177,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,Heat,1995,0.734177,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,Seven (a.k.a. Se7en),1995,0.734177,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,"Usual Suspects, The",1995,0.734177,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [5]:
full_ratings_dum.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34779 entries, 0 to 34778
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userId       34779 non-null  int64  
 1   movieId      34779 non-null  int64  
 2   rating       34779 non-null  float64
 3   title        34779 non-null  object 
 4   year         34779 non-null  int64  
 5   year_sc      34779 non-null  float64
 6   action       34779 non-null  int64  
 7   adventure    34779 non-null  int64  
 8   animation    34779 non-null  int64  
 9   children     34779 non-null  int64  
 10  comedy       34779 non-null  int64  
 11  crime        34779 non-null  int64  
 12  documentary  34779 non-null  int64  
 13  drama        34779 non-null  int64  
 14  fantasy      34779 non-null  int64  
 15  film-noir    34779 non-null  int64  
 16  horror       34779 non-null  int64  
 17  imax         34779 non-null  int64  
 18  musical      34779 non-null  int64  
 19  myst

### a. Las películas más vistas, pero con calificación mayor a 4 (mejor calificadas)

In [7]:
pd.read_sql("""
    select year, title, 
           ROUND(avg(iif(rating = 0, Null, rating)), 2) as avg_rat,
           count(*) as seen_num
    from full_ratings_dum
    group by year, title
    order by year desc, avg_rat desc
    limit 10
""", conn)

Unnamed: 0,year,title,avg_rat,seen_num
0,2016,Deadpool,3.75,42
1,2014,Guardians of the Galaxy,4.06,48
2,2014,Interstellar,3.9,54
3,2014,"Grand Budapest Hotel, The",3.78,47
4,2013,"Wolf of Wall Street, The",3.85,43
5,2012,Django Unchained,3.98,58
6,2012,"Dark Knight Rises, The",3.93,63
7,2012,"Avengers, The",3.89,59
8,2012,The Hunger Games,3.48,50
9,2010,Toy Story 3,4.06,47


### b. Películas más vistas con una calificación mayor a 3.5 que sean para toda la familia

In [22]:
## Las películas más vistas, con calificación mínima de 3.5, y que sean para toda la familia
pd.read_sql("""select title, 
            ROUND(avg(iif(rating = 0, Null, rating)), 2) as avg_rat,
            count(*) as seen_num
            from full_ratings_dum
            where Children = 1 
            group by title
            having avg_rat >= 3.5
            order by seen_num desc
            limit 10
            """, conn)

Unnamed: 0,title,avg_rat,seen_num
0,Toy Story,3.88,169
1,Shrek,3.85,152
2,"Lion King, The",3.91,147
3,Aladdin,3.84,146
4,Finding Nemo,3.93,121
5,Beauty and the Beast,3.78,116
6,"Monsters, Inc.",3.85,114
7,E.T. the Extra-Terrestrial,3.76,112
8,"Incredibles, The",3.81,111
9,Babe,3.64,110


### c. Top mejores películas calificadas por año de estreno

In [19]:

pd.read_sql("""select year, title, 
            ROUND(avg(iif(rating = 0, Null, rating)), 2) as avg_rat,
            count(*) as seen_num
            from full_ratings_dum
            group by  year, title
            order by year desc, avg_rat desc limit 10
            """, conn)

Unnamed: 0,year,title,avg_rat,seen_num
0,2016,Deadpool,3.75,42
1,2014,Guardians of the Galaxy,4.06,48
2,2014,Interstellar,3.9,54
3,2014,"Grand Budapest Hotel, The",3.78,47
4,2013,"Wolf of Wall Street, The",3.85,43
5,2012,Django Unchained,3.98,58
6,2012,"Dark Knight Rises, The",3.93,63
7,2012,"Avengers, The",3.89,59
8,2012,The Hunger Games,3.48,50
9,2010,Toy Story 3,4.06,47


### d. Top 10 de las películas con una calificación mayor a 3.5 y que son del 2010 en adelante

In [24]:
# Las películas más recientes (desde el 2021) y mejor calificadas
recent_best_movies = pd.read_sql("""
                                    SELECT title, 
                                        year,
                                        ROUND(avg(iif(rating = 0, Null, rating)), 2) as avg_rat,
                                        count(*) AS seen_num
                                    FROM full_ratings_dum
                                    WHERE year >= 2010
                                    GROUP BY title, year
                                    HAVING avg_rat >= 3.5
                                    ORDER BY avg_rat DESC
                                    LIMIT 10
                                """, conn)

# Muestra el resultado
recent_best_movies



Unnamed: 0,title,year,avg_rat,seen_num
0,Guardians of the Galaxy,2014,4.06,48
1,Toy Story 3,2010,4.06,47
2,Inception,2010,4.02,108
3,Shutter Island,2010,3.99,57
4,Django Unchained,2012,3.98,58
5,"King's Speech, The",2010,3.96,47
6,"Dark Knight Rises, The",2012,3.93,63
7,How to Train Your Dragon,2010,3.91,44
8,Interstellar,2014,3.9,54
9,"Avengers, The",2012,3.89,59


## 2.1 Sistema de recomendación basado en contenido KNN un solo producto seleccionado 

In [11]:
df=pd.read_sql('select * from full_ratings_dum',conn)
df['movieId'] = df['movieId'].astype('object')
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34779 entries, 0 to 34778
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   userId       34779 non-null  int64  
 1   movieId      34779 non-null  object 
 2   rating       34779 non-null  float64
 3   title        34779 non-null  object 
 4   year         34779 non-null  int64  
 5   year_sc      34779 non-null  float64
 6   action       34779 non-null  int64  
 7   adventure    34779 non-null  int64  
 8   animation    34779 non-null  int64  
 9   children     34779 non-null  int64  
 10  comedy       34779 non-null  int64  
 11  crime        34779 non-null  int64  
 12  documentary  34779 non-null  int64  
 13  drama        34779 non-null  int64  
 14  fantasy      34779 non-null  int64  
 15  film-noir    34779 non-null  int64  
 16  horror       34779 non-null  int64  
 17  imax         34779 non-null  int64  
 18  musical      34779 non-null  int64  
 19  myst

Unnamed: 0,userId,movieId,rating,title,year,year_sc,action,adventure,animation,children,...,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,1,1,4.0,Toy Story,1995,0.734177,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,Grumpier Old Men,1995,0.734177,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,Heat,1995,0.734177,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,Seven (a.k.a. Se7en),1995,0.734177,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,"Usual Suspects, The",1995,0.734177,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0


In [12]:
list_col_del = ['title', 'year', 'year_sc']
df_dum = df
df_dum = df_dum.drop(columns=list_col_del)
df_dum.head()

Unnamed: 0,userId,movieId,rating,action,adventure,animation,children,comedy,crime,documentary,...,film-noir,horror,imax,musical,mystery,romance,sci-fi,thriller,war,western
0,1,1,4.0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,4.0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,6,4.0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,1,47,5.0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
4,1,50,5.0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0


In [13]:
model = neighbors.NearestNeighbors(n_neighbors=6, metric='cosine') 
model.fit(df_dum)
dist, idlist = model.kneighbors(df_dum)

In [14]:
distancias=pd.DataFrame(dist) ## devuelve un ranking de la distancias más cercanas para cada fila (pelicula)
id_list=pd.DataFrame(idlist) ## para saber esas distancias a que item corresponde

In [15]:
def MovieRecommender(movie_name = list(df['title'].value_counts().index)):
    """
    Esta función tiene como objetivo recomendar películas similares basadas en una película seleccionada por el usuario. 
    Para ello, recibe como argumento movie_name, que es el título de la película para la cual se desean obtener recomendaciones.
    """
    
    movie_list_name = []
    movie_id = df[df['title'] == movie_name].index[0]
    
    # Recomendaciones de películas similares
    for newid in idlist[movie_id]:
        movie_list_name.append(df.loc[newid].title)
    
    # Eliminar la película seleccionada de las recomendaciones
    if movie_name in movie_list_name:
        movie_list_name.remove(movie_name)
    
    return movie_list_name

In [16]:
# Interfaz interactiva
print(interact(MovieRecommender))

interactive(children=(Dropdown(description='movie_name', options=('Forrest Gump', 'Pulp Fiction', 'Shawshank R…

<function MovieRecommender at 0x0000024E9E34AB60>
