# Sistema de Recomendación Basado en Conocimiento

In [1]:
import pandas as pd
import numpy as np

import warnings

# Globally ignore all warnings: this installs an 'ignore' filter that matches every
# warning category, so pandas/numpy warnings will not be shown in later cells.
warnings.filterwarnings('ignore')

In [2]:
# Read the raw movie metadata from disk and list the columns it provides.
df = pd.read_csv(filepath_or_buffer='data/movies_metadata.csv')
df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

Seleccionamos las variables que necesitamos:
* `title` 
* `genres` 
* `release_date` 
* `runtime`
* `vote_average` 
* `vote_count`

In [3]:
# Keep only the columns the recommender needs. The explicit .copy() yields an independent
# frame, so the column assignments made in later cells do not trigger
# SettingWithCopyWarning.
df = df[['title','genres', 'release_date', 'runtime', 'vote_average', 'vote_count']].copy()

# info() prints its report and returns None, so it is called on its own instead of
# passing its return value to display() (which would render a stray "None").
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         45460 non-null  object 
 1   genres        45466 non-null  object 
 2   release_date  45379 non-null  object 
 3   runtime       45203 non-null  float64
 4   vote_average  45460 non-null  float64
 5   vote_count    45460 non-null  float64
dtypes: float64(3), object(3)
memory usage: 2.1+ MB


Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0


None

Convertimos la variable `release_date` a formato ***datetime*** y extraemos el año de la fecha.

In [4]:
# Parse the release dates; values that cannot be parsed become NaT instead of raising.
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Take the year through the datetime accessor. Rows whose date is NaT get NaN here.
# (A `!= np.nan` comparison cannot be used to detect missing values: NaN never compares
# equal to anything, so such a test is always True.)
df['year'] = df['release_date'].dt.year

In [5]:
# Preview the first five rows, now including the `year` column.
df.head(n=5)

Unnamed: 0,title,genres,release_date,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",1995-10-30,81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",1995-12-15,104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",1995-12-22,101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",1995-12-22,127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",1995-02-10,106.0,5.7,173.0,1995


Función auxiliar para convertir ***NaT*** a 0 y todos los años a enteros.

In [6]:
def convert_int(x):
    """Convert ``x`` to an ``int``, falling back to 0 when that is not possible.

    Parameters
    ----------
    x : object
        Value to convert (e.g. a year as a string or float, or a missing value).

    Returns
    -------
    int
        ``int(x)`` when the conversion succeeds; 0 when ``int`` rejects the value
        (non-numeric strings such as ``'NaT'``, ``NaN``, infinities, ``None``).
    """
    try:
        return int(x)
    # Catch only the errors int() raises for unconvertible values, so unrelated
    # exceptions (e.g. KeyboardInterrupt) are no longer silently swallowed.
    except (ValueError, TypeError, OverflowError):
        return 0

Aplicamos ***convert_int*** a la variable `year`.

In [7]:
# Coerce every year to an integer element by element; values convert_int cannot
# convert come back as 0.
df['year'] = df['year'].map(convert_int)

Eliminamos la columna `release_date`.

In [8]:
# The year has already been extracted, so the full date column is dropped.
df = df.drop(columns=['release_date'])
df.head(n=5)

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",81.0,7.7,5415.0,1995
1,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",127.0,6.1,34.0,1995
4,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",106.0,5.7,173.0,1995


Veamos los géneros de la primera película.

In [9]:
# Raw `genres` value of the first movie (select the column, then the first position).
df['genres'].iloc[0]

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

Importamos la función `literal_eval` del módulo ***ast***.

In [10]:
from ast import literal_eval  # parses a string holding a Python literal into the object it describes

# The same data before and after parsing: first as text, then as a real list.
a = "[1,2,3]"
b = literal_eval(a)

for value in (a, b):
    print(type(value))

<class 'str'>
<class 'list'>


1. Convertimos todos los ***NaN*** en ***strings*** de listas vacías.
1. Convertimos todos los valores a listas.
1. Convertimos la lista de diccionarios en una lista de ***strings***.

In [12]:
def _genre_names(raw):
    """Return the lower-cased genre names held in one cell of the ``genres`` column.

    Parameters
    ----------
    raw : object
        Either the raw string representation of a list of ``{'id', 'name'}`` dicts,
        an already-parsed list, or a missing value.

    Returns
    -------
    list
        Lower-cased genre names; an empty list when the cell is missing or does not
        contain a list.
    """
    # Strings are parsed into Python objects first.
    if isinstance(raw, str):
        raw = literal_eval(raw)
    # Missing values (NaN) and any non-list literal yield no genres.
    if not isinstance(raw, list):
        return []
    # Items that are already plain names are passed through unchanged, which lets this
    # cell be re-run on an already-converted column without raising.
    return [g['name'].lower() if isinstance(g, dict) else g for g in raw]


df['genres'] = df['genres'].apply(_genre_names)

df.head()

Unnamed: 0,title,genres,runtime,vote_average,vote_count,year
0,Toy Story,"[animation, comedy, family]",81.0,7.7,5415.0,1995
1,Jumanji,"[adventure, fantasy, family]",104.0,6.9,2413.0,1995
2,Grumpier Old Men,"[romance, comedy]",101.0,6.5,92.0,1995
3,Waiting to Exhale,"[comedy, drama, romance]",127.0,6.1,34.0,1995
4,Father of the Bride Part II,[comedy],106.0,5.7,173.0,1995


1. Creamos una nueva variable que separa los géneros de las películas.
1. Nombramos la nueva variable como `genre`.
1. Creamos un nuevo ***Data Frame*** eliminando la variable `genres`.

In [13]:
# One row per (movie, genre): explode() repeats the original index for each element of
# the list, avoiding the construction of a separate Series for every row. An empty list
# becomes a single NaN, so movies without genres stay in the result with a missing genre.
s = df['genres'].explode()

s.name = 'genre'

# Joining on the (repeated) index duplicates each movie once per genre.
gen_df = df.drop('genres', axis=1).join(s)

gen_df.head()

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre
0,Toy Story,81.0,7.7,5415.0,1995,animation
0,Toy Story,81.0,7.7,5415.0,1995,comedy
0,Toy Story,81.0,7.7,5415.0,1995,family
1,Jumanji,104.0,6.9,2413.0,1995,adventure
1,Jumanji,104.0,6.9,2413.0,1995,fantasy


Definimos la función basada en conocimiento (cuestiones realizadas al usuario).

In [14]:
def build_chart(gen_df, percentile=0.8, genre=None, low_time=None,
                high_time=None, low_year=None, high_year=None):
    """Rank the movies that match the user's preferences with a weighted rating.

    Any preference left as ``None`` is asked for interactively with ``input()``;
    passing all of them makes the call reproducible without user interaction.

    Parameters
    ----------
    gen_df : pandas.DataFrame
        One row per (movie, genre) with the columns ``genre``, ``runtime``, ``year``,
        ``vote_average`` and ``vote_count``. It is not modified.
    percentile : float, default 0.8
        Quantile of ``vote_count`` (among the matching movies) used as the minimum
        number of votes ``m`` required to enter the chart.
    genre : str, optional
        Genre to match; compared case-insensitively and ignoring surrounding spaces.
    low_time, high_time : int, optional
        Inclusive bounds on ``runtime``.
    low_year, high_year : int, optional
        Inclusive bounds on ``year``.

    Returns
    -------
    pandas.DataFrame
        The qualifying movies with an extra ``score`` column, sorted by ``score`` in
        descending order. Empty when no movie matches the preferences.

    Raises
    ------
    ValueError
        If a value typed for a duration or a year is not an integer.
    """
    # Prompt only for the preferences that were not supplied, in the original order.
    if genre is None:
        genre = input("Introduce tu género favorito, ")
    if low_time is None:
        low_time = int(input("Introduce la duración más corta deseada, "))
    if high_time is None:
        high_time = int(input("Introduce la duración más larga deseada, "))
    if low_year is None:
        low_year = int(input("Introduce el año más antiguo, "))
    if high_year is None:
        high_year = int(input("Introduce el año más reciente, "))

    # Normalize the genre so that inputs such as "Comedy" or "comedy " still match.
    genre = genre.strip().lower()

    matches = (
        (gen_df['genre'].str.lower() == genre)
        & gen_df['runtime'].between(low_time, high_time)
        & gen_df['year'].between(low_year, high_year)
    )
    movies = gen_df[matches]

    # C: mean rating of the matching movies; m: vote-count threshold.
    C = movies['vote_average'].mean()
    m = movies['vote_count'].quantile(percentile)

    # Keep only the movies with at least m votes (copy so `score` can be added safely).
    q_movies = movies.loc[movies['vote_count'] >= m].copy()

    # IMDB weighted rating, computed on whole columns:
    #   score = v / (v + m) * R + m / (v + m) * C
    votes = q_movies['vote_count']
    rating = q_movies['vote_average']
    q_movies['score'] = (votes / (votes + m) * rating) + (m / (m + votes) * C)

    # Best-scored movies first.
    return q_movies.sort_values('score', ascending=False)

Generamos la lista del ***top*** 10 según las preferencias del cliente.

In [15]:
# Build the chart from the user's answers and show its ten best-scored movies.
build_chart(gen_df=gen_df).head(n=10)

Unnamed: 0,title,runtime,vote_average,vote_count,year,genre,score


Guardamos nuestro ***Data Frame*** limpio.

In [None]:
# Persist the cleaned frame to disk without writing the index as a column.
df.to_csv(path_or_buf='data/metadata_clean.csv', index=False)