### 0. Imports

In [1]:
# Data transformation
# ---------------------------------------
import pandas as pd
import polars as pl

# API calls
# ---------------------------------------
import requests

# Scraping
# ---------------------------------------
from bs4 import BeautifulSoup


# Aynchronicity
# ---------------------------------------
import asyncio
import aiohttp


import json


import time

# 1. Introducción - Extracción datos Movies Database

Este notebook detalla la exploración acarreada para la extracción de datos a través de la API Movies Database.

# 2. Extraccion

- Películas desde 1990 hasta la actualidad.
- Géneros: Drama, Comedy, Action, Fantasy, Horror, Mystery, Romance, Thriller.
- Información necesaria:
  - Tipo (corto o película).
  - Nombre.
  - Año y mes de estreno.
  - ID de la película.

Request de ejemplo:

In [2]:
url = "https://moviesdatabase.p.rapidapi.com/titles"

querystring = {"genre":"Drama","startYear":"1990","titleType":"movie","page":"70","limit":"50"}

headers = {
	"x-rapidapi-key": "c09b699b97mshc664835db7a2ca3p1f803fjsn67c91d35ded1",
	"x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}

response = requests.get(url, headers=headers, params=querystring)

print(response.json())

{'page': 1, 'next': '/titles?titleType=movie&genre=Drama&page=2&limit=50&startYear=1990', 'entries': 50, 'results': [{'_id': '662feb6b06e751c276d800b8', 'id': 'tt32230923', 'primaryImage': {'id': 'rm3377102337', 'width': 768, 'height': 1344, 'url': 'https://m.media-amazon.com/images/M/MV5BNzA0Zjc3MzMtZWZkYi00OWI3LWJiOGQtYTQwYjgxZjBiNjg2XkEyXkFqcGdeQXVyMTgwNjM0NzY3._V1_.jpg', 'caption': {'plainText': 'Sardar Soleimani (2027)', '__typename': 'Markdown'}, '__typename': 'Image'}, 'titleType': {'displayableProperty': {'value': {'plainText': '', '__typename': 'Markdown'}, '__typename': 'DisplayableTitleTypeProperty'}, 'text': 'Movie', 'id': 'movie', 'isSeries': False, 'isEpisode': False, 'categories': [{'value': 'movie', '__typename': 'TitleTypeCategory'}], 'canHaveEpisodes': False, '__typename': 'TitleType'}, 'titleText': {'text': 'Sardar Soleimani', '__typename': 'TitleText'}, 'originalTitleText': {'text': 'Sardar Soleimani', '__typename': 'TitleText'}, 'releaseYear': {'year': 2027, 'endYe

Hay paginación. No se conoce el límite de páginas por género y tipo de cinta.

Dado que hay 1000 consultas por hora, que hay que hacer consultas separadas para 'short' y para 'movie', así como para los géneros, se puede tratar de intentar un approach rápido por asincronía de hacer todas las consultas en lugar usar un bucle que trate de encontrar el límite de páginas para cada uno, con el consiguiente tiempo de diseño de la lógica para capturarlo.

Extraer campos de requeridos de cada pelicula.

In [3]:
# page
response.json()
# id 
response.json()["results"][0]["id"]
# type
response.json()["results"][0]["titleType"]["id"]

# nombre
response.json()["results"][0]["titleText"]["text"]

# año y mes de estreno
response.json()["results"][0]["releaseDate"]["year"]
response.json()["results"][0]["releaseDate"]["month"]


12

In [13]:

# miguellv96@gmail.com
headers = {
    "x-rapidapi-key": "c09b699b97mshc664835db7a2ca3p1f803fjsn67c91d35ded1",
    "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}


# miguel.lopezvirues@alum.uca.es
headers = {
    'x-rapidapi-key': "5027380e12msh3259c6d3e08bbe5p1b31bbjsn6f9e42d6d53b",
    'x-rapidapi-host': "moviesdatabase.p.rapidapi.com"
}

# miguel.lopezvirues@gmail.com
headers = {
    'x-rapidapi-key': "98ab80c541msh548e16e3c595c79p101433jsn6131790668a3",
    'x-rapidapi-host': "moviesdatabase.p.rapidapi.com"
}

# hackio projects 1
headers = {
    "x-rapidapi-key": "997763c3bamsheeaaefa4d7efc72p100dcfjsnc6ba0f0812da",
    "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}

# hackio projects 2 
headers = {
	"x-rapidapi-key": "d42d6bd155mshfbdc1f0c83f26f5p12fca4jsnf8aa6552f0e5",
	"x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}

In [25]:
async def request_movies_async(querystring):
    
    url = "https://moviesdatabase.p.rapidapi.com/titles"

    headers = {
        "x-rapidapi-key": "c09b699b97mshc664835db7a2ca3p1f803fjsn67c91d35ded1",
        "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
    }

    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=headers, params=querystring) as response:
            if response.status == 500:
                print(f"Max operation time for querystring {querystring}. Retrying...")
                await asyncio.sleep(3)  # Wait for 3 seconds before retrying
                return await request_movies_async(querystring)  # Retry the same request
            elif response.status == 200:
                try:
                    results = await response.json()
                    pagina = results["page"]
                    results = results["results"]
                    for result in results:
                        result["pagina"] = pagina
                        result["genre"] = querystring["genre"]

                except KeyError as e:
                    print(f"KeyError with querystring {querystring}: {e}")
                    return []  # returning an empty list for consistency
                except TypeError as e:
                    print(f"TypeError with querystring {querystring}: {e}")
                    return []
                except Exception as e:
                    print(f"Unexpected error with querystring {querystring}: {e}")
                    return []
            else:
                print(f"Request failed with status {response.status} for {querystring}")
                return []
                raise ValueError(f"HTTP Error: {response.status}")
            

    return results

async def get_movies_info(querystring_list): 

    tasks = [request_movies_async(querystring) for querystring in querystring_list]
    results = await asyncio.gather(*tasks)

    return results

In [26]:
def create_querystrings(end_page, title_type_list, genres_list, start_year):
    querystring_list = []

    for page in range(1,end_page+1):
        for title_type in title_type_list:
            for genre in genres_list:
                base_querystring = {"genre":None,"startYear":str(start_year),"titleType":None,"page":None,"limit":"50","sort":"year.incr"}

                base_querystring["genre"] = genre
                base_querystring["titleType"] = title_type
                base_querystring["page"] = page

                querystring_list.append(base_querystring)

    return querystring_list
                



Creación de función de extracción de datos:

In [32]:
def generar_lista_tuplas(lista_jsons):

    lista_tuplas_peliculas = []
    for response in lista_jsons:
        # page
        page = response["pagina"]
        # id 
        id = response["id"]
        # type
        title_type = response["titleType"]["id"]

        # nombre

        title = response["titleText"]["text"]

        # año y mes de estreno

        try:
            
            year = response["releaseDate"]["year"]
            month = response["releaseDate"]["month"]
        except:

            year = response["releaseYear"]["year"]
            month = None

        # genre
        genre = response["genre"]

        pelicula_tupla = (page, id, title_type, title, genre, year, month)

        lista_tuplas_peliculas.append(pelicula_tupla)

    return lista_tuplas_peliculas



In [33]:
test_querystring_list = create_querystrings(1, ["movie"],["Drama"],1990)

results = await get_movies_info(test_querystring_list)

resultados_flat = [sublista for lista in results for sublista in lista]

generar_lista_tuplas(resultados_flat)

[('1', 'tt0059325', 'movie', 'Jahrgang 45', 'Drama', 1990, 10),
 ('1',
  'tt0059900',
  'movie',
  'Wenn du groß bist, lieber Adam',
  'Drama',
  1990,
  10),
 ('1', 'tt0065188', 'movie', 'Vojtech, receny sirotek', 'Drama', 1990, 10),
 ('1', 'tt0068494', 'movie', 'Domo Arigato', 'Drama', 1990, 8),
 ('1', 'tt0075259', 'movie', 'Spy Story', 'Drama', 1990, 3),
 ('1', 'tt0081721', 'movie', 'Vincent et moi', 'Drama', 1990, 12),
 ('1', 'tt0090665', 'movie', 'Asfour Stah', 'Drama', 1991, 3),
 ('1', 'tt0093210', 'movie', 'La hora 24', 'Drama', 1990, None),
 ('1', 'tt0093989', 'movie', 'Xian sha lu', 'Drama', 1990, 11),
 ('1', 'tt0093662', 'movie', 'Object of Desire', 'Drama', 1990, None),
 ('1', 'tt0094666', 'movie', "Any Man's Death", 'Drama', 1990, 5),
 ('1', 'tt0094073', 'movie', 'The Sun and the Moon', 'Drama', 1990, 10),
 ('1', 'tt0094857', 'movie', 'Chi se da feng bao', 'Drama', 1990, 3),
 ('1', 'tt0095840', 'movie', 'Payback', 'Drama', 1990, 4),
 ('1', 'tt0096469', 'movie', 'Wu ye tian 

Realizar la llamada global:

In [28]:
genre_list = ["Drama", "Comedy", "Action", "Fantasy", "Horror", "Mystery", "Romance", "Thriller"]
title_type_list =  ["movie","short"]
end_page = 55
end_year = 1990

querystring_list = create_querystrings(end_page, title_type_list,genre_list,end_year)
print("Len of querystring list" ,len(querystring_list))
querystring_list[:10]

Len of querystring list 880


[{'genre': 'Drama',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Comedy',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Action',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Fantasy',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Horror',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Mystery',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Romance',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Thriller',
  'startYear': '1990',
  'titleType': 'movie',
  'page': 1,
  'limit': '50',
  'sort': 'year.incr'},
 {'genre': 'Drama',


In [30]:
results_400 = await get_movies_info(querystring_list[:400])

In [31]:
results_480 = await get_movies_info(querystring_list[400:])

Request failed with status 429 for {'genre': 'Romance', 'startYear': '1990', 'titleType': 'movie', 'page': 33, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Mystery', 'startYear': '1990', 'titleType': 'short', 'page': 33, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Action', 'startYear': '1990', 'titleType': 'movie', 'page': 34, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Action', 'startYear': '1990', 'titleType': 'short', 'page': 53, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Fantasy', 'startYear': '1990', 'titleType': 'short', 'page': 48, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Drama', 'startYear': '1990', 'titleType': 'short', 'page': 36, 'limit': '50', 'sort': 'year.incr'}
Request failed with status 429 for {'genre': 'Action', 'startYear': '1990', 'titleType': 'short', 'page': 38, 'limit': '50

In [34]:
resultados_400_flat = [sublista for lista in results_400 for sublista in lista]

lista_tuplas_400 = generar_lista_tuplas(resultados_400_flat)
df_400 = pd.DataFrame(lista_tuplas_400)
df_400.to_csv("../data/movies_400.csv")
df_400

Unnamed: 0,0,1,2,3,4,5,6
0,1,tt0059325,movie,Jahrgang 45,Drama,1990,10.0
1,1,tt0059900,movie,"Wenn du groß bist, lieber Adam",Drama,1990,10.0
2,1,tt0065188,movie,"Vojtech, receny sirotek",Drama,1990,10.0
3,1,tt0068494,movie,Domo Arigato,Drama,1990,8.0
4,1,tt0075259,movie,Spy Story,Drama,1990,3.0
...,...,...,...,...,...,...,...
19995,25,tt0446926,short,Be Still My Heart,Thriller,2004,
19996,25,tt0444735,short,Will,Thriller,2004,12.0
19997,25,tt0445019,short,Jill,Thriller,2004,4.0
19998,25,tt0446543,short,Black Jack,Thriller,2004,


In [35]:
resultados_480_flat = [sublista for lista in results_480 for sublista in lista]

lista_tuplas_480 = generar_lista_tuplas(resultados_480_flat)
df_480 = pd.DataFrame(lista_tuplas_480)
df_480.to_csv("../data/movies_480.csv")

df_480

Unnamed: 0,0,1,2,3,4,5,6
0,26,tt10883110,movie,Aji Pamungkas,Drama,1990,
1,26,tt1090298,movie,Ali va ghool-e jangal,Drama,1990,
2,26,tt11016436,movie,Bing leng de tai yang,Drama,1990,
3,26,tt1102282,movie,Naghsh-e eshgh,Drama,1990,
4,26,tt11162952,movie,Ricky (Nakalnya Anak Muda),Drama,1990,
...,...,...,...,...,...,...,...
18595,1,tt30185433,short,Hello Kitty no Mahô no Mori no Ohimesama,Fantasy,1991,7.0
18596,1,tt32513765,short,1991=HERE,Fantasy,1991,1.0
18597,1,tt0101777,short,Drum Struck,Fantasy,1992,4.0
18598,1,tt0101292,short,Alcibíades,Fantasy,1992,


In [40]:
df_total = pd.concat([df_400, df_480],axis=0)
df_total.columns = ["pagina","id","tipo_titulo","titulo","genero","año","mes"]
df_total

Unnamed: 0,pagina,id,tipo_titulo,titulo,genero,año,mes
0,1,tt0059325,movie,Jahrgang 45,Drama,1990,10.0
1,1,tt0059900,movie,"Wenn du groß bist, lieber Adam",Drama,1990,10.0
2,1,tt0065188,movie,"Vojtech, receny sirotek",Drama,1990,10.0
3,1,tt0068494,movie,Domo Arigato,Drama,1990,8.0
4,1,tt0075259,movie,Spy Story,Drama,1990,3.0
...,...,...,...,...,...,...,...
18595,1,tt30185433,short,Hello Kitty no Mahô no Mori no Ohimesama,Fantasy,1991,7.0
18596,1,tt32513765,short,1991=HERE,Fantasy,1991,1.0
18597,1,tt0101777,short,Drum Struck,Fantasy,1992,4.0
18598,1,tt0101292,short,Alcibíades,Fantasy,1992,


In [43]:
df_total = df_total.drop_duplicates(subset="id")
df_total.to_csv("../data/movies_total.csv")