# Extracción de datos con la API MoviesDatabase

In [48]:
# Tratamiento de datos
# ------------------------
import pandas as pd
import numpy as np

# Cargamos el archivo env
# ------------------------
import dotenv
dotenv.load_dotenv()
import os

# Para hacer las llamadas a la API
# ------------------------
import requests

# Otros
# ------------------------
from time import sleep
import pickle

In [None]:
# Download title metadata from the MoviesDatabase API (RapidAPI) for every
# combination of genre, release year and title type, and accumulate one tuple
# per title in `lista_final`:
#     (id, title, type, genre in lower case, release year, release month)

# API credentials are read from environment variables (the .env file is loaded
# by dotenv in the imports cell).
key = os.getenv("key")
host = os.getenv("host")

url = "https://moviesdatabase.p.rapidapi.com/titles"

headers = {
    "x-rapidapi-key": key,
    "x-rapidapi-host": host,
}

# Checkpoint file. It is the same path the following cells use to save and
# reload the list, so a partial run can be recovered from there.
ruta_lista = "../datos/raw/lista.pkl"

lista_generos = ["Drama", "Comedy", "Action", "Fantasy", "Horror", "Mystery", "Romance", "Thriller"]
lista_tipos = ["movie", "short"]
lista_final = []


def guardar_lista():
    """Write everything collected so far to the checkpoint file."""
    with open(ruta_lista, "wb") as file:
        pickle.dump(lista_final, file)


for genero in lista_generos:
    for anio in range(1990, 2025):
        for tipo in lista_tipos:
            querystring = {
                "genre": genero,
                "titleType": tipo,
                "year": anio,
                "sort": "year.decr",
                "limit": "50",
            }
            try:
                # Without a timeout a single stalled connection would hang
                # the whole crawl indefinitely.
                res = requests.get(url, headers=headers, params=querystring, timeout=30)
            except requests.RequestException as error:
                print(f"Error en la peticion: {error}")
                guardar_lista()
                continue

            if res.status_code != 200:
                print(f"Error status code: {res.status_code}")
                continue

            # Reset before parsing so the error report below never fails with
            # a NameError nor shows a record left over from a previous page.
            pelicula = None
            try:
                resultados = res.json()["results"]
                # An empty page simply adds nothing; the remaining title
                # types for this genre/year are still queried.
                for pelicula in resultados:
                    id_peli = pelicula["id"]
                    nombre_peli = pelicula["titleText"]["text"]
                    anio_peli = pelicula["releaseYear"]["year"]
                    if pelicula["releaseDate"] is not None:
                        mes_peli = pelicula["releaseDate"]["month"]
                    else:
                        mes_peli = "Desconocido"
                    lista_final.append((id_peli, nombre_peli, tipo, genero.lower(), anio_peli, mes_peli))
            except (KeyError, TypeError, ValueError):
                # ValueError: the body is not valid JSON.
                # KeyError / TypeError: a record lacks an expected field or
                # the field is null.
                print(f"Error en la busqueda de peliculas: \nGenero: {genero}, \nAnio: {anio}, \nTipo: {tipo}")
                print(f"Pelicula: {pelicula}")
                guardar_lista()
        # Pause between batches of requests to go easy on the API.
        sleep(0.5)
    sleep(0.5)

In [56]:
# Persist the collected tuples so the API does not have to be queried again.
with open("../datos/raw/lista.pkl", mode="wb") as archivo_salida:
    pickle.dump(lista_final, archivo_salida)

In [57]:
# Reload the saved list from disk; the file is the one written by this notebook.
with open("../datos/raw/lista.pkl", mode="rb") as archivo_entrada:
    lista_leida = pickle.load(archivo_entrada)

In [59]:
# Show the first five records as a quick check of the save/load round trip.
lista_leida[0:5]

[('tt25424920', 'Loaded Deliberation', 'short', 'thriller', 2024, None),
 ('tt26237980', 'Bugbear III', 'short', 'thriller', 2024, 3),
 ('tt26687569', 'Showpiece', 'short', 'thriller', 2024, None),
 ('tt27053740', 'A Christmas Gift', 'short', 'thriller', 2024, 1),
 ('tt27420708', 'Long Way Out', 'short', 'thriller', 2024, None)]

### Web Scraping

In [7]:
# Imports web scraping
# ----------------------------------------------------------
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from time import sleep
from selenium import webdriver

In [43]:
# Scrape one IMDb title page: directors, writers, rating, plot and runtime.

url = "https://www.imdb.com/es-es/title/"
# NOTE(review): `id` shadows the builtin of the same name; it is kept because
# other cells may read it. Consider renaming it across the notebook.
id = "tt0059325"

driver = webdriver.Chrome()
driver.get(f"{url}{id}")
driver.maximize_window()
# Every find_element / find_elements call below waits up to 5 s for a match.
driver.implicitly_wait(5)
sleep(1)
# Presumably dismisses the cookie-consent dialog (second button) - TODO confirm.
driver.find_element(By.XPATH, '//*[@id="__next"]/div/div/div[2]/div/button[2]').click()
sleep(0.5)

# Credits list of the page header. The first row is read as the directors and
# the second row as the writers.
xpath_creditos = '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[2]/div/ul'


def leer_creditos(fila):
    """Return the text of every name listed in credits row `fila` (1-based).

    The <li> items are collected in a single query, so no index arithmetic is
    needed and a missing row yields an empty list instead of an exception.
    """
    nombres = driver.find_elements(By.XPATH, f"{xpath_creditos}/li[{fila}]/div/ul/li")
    return [nombre.text for nombre in nombres]


lista_directores = leer_creditos(1)
lista_guionistas = leer_creditos(2)

puntuacion = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[2]/div/div[1]/a/span/div/div[2]/div[1]/span[1]').text
argumento = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[9 - 4]/div[2]/div[1]/div/div/div'.replace("9 - 4", "5")).text

# The runtime is read from the page this link points to.
url_especificaciones = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div/section[1]/div/section/div/div[1]/section[9]/div[1]/div/a').get_attribute("href")
driver.get(url_especificaciones)
sleep(1)
duracion = driver.find_element(By.XPATH, '//*[@id="runtime"]/div/ul/li/span[2]').text
# Keep only the first token and drop the surrounding parentheses.
duracion = str(duracion).split(" ")[0].replace("(", "").replace(")", "")
duracion

'100'