### 0. Imports

In [1]:
# Data transformation
# ---------------------------------------
import pandas as pd
import polars as pl

# API calls
# ---------------------------------------
import requests

# Scraping
# ---------------------------------------
from bs4 import BeautifulSoup


# Aynchronicity
# ---------------------------------------
import asyncio
import aiohttp


import json


import time


import re

from tqdm import tqdm

from joblib import Parallel, delayed

import sys

if sys.platform == 'win32':
    loop = asyncio.ProactorEventLoop()
    asyncio.set_event_loop(loop)

# 1. Introducción - Extracción datos Movies Database

Este notebook detalla la exploración acarreada para la extracción de datos a través de la API Movies Database.

# 2. Extraccion

- Calificación de IMDB.
- Dirección (director o directores).
- Guionistas.
- Argumento.
- Duración (en minutos).

Ejemplo para una sola pelicula: 'tt4495760'

In [2]:
url = "https://www.imdb.com/es-es/title/tt4495760/?ref_=fn_all_ttl_1"

headers={"User-Agent": "Chrome"}

response = requests.get(url,headers=headers)

sopa = BeautifulSoup(response.content, "html.parser")

sopa

<!DOCTYPE html>
<html lang="es-ES" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/"><head><meta charset="utf-8"/><meta content="width=device-width" name="viewport"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><script>window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
                element: {
                    slotId: 'LoadTitle',
                    type: 'service-call'
                }
            });
            csaLatencyPlugin('mark', 'clickToBodyBegin', 1734359006562);
        }
    })</script><title>Shtemp (1991) - IMDb</title><meta content="Shtemp: Dirigido por Gennadiy Ivanov. Con Anatoliy Khostikoev, Yelena Kolchugina, Guram Pirtskhalava, Igor Shapovalov. Criminal Investigation Inspector Arkady Ershov is investigating a case related to the Moscow mafia." d

In [3]:

# Puntuacion
sopa.find("div",{"data-testid":"hero-rating-bar__aggregate-rating__score"}).text



'5,6/10'

In [4]:
puntuacion = json.loads(sopa.find('script', type='application/json').string)["props"]["pageProps"]["aboveTheFoldData"]["ratingsSummary"]["aggregateRating"]

Dirección

In [5]:
direccion = sopa.find_all("li",{"role":"presentation", "class":"ipc-metadata-list__item"})[0].find_all("a")
directores = [director_element.text for director_element in direccion]
direccion_formateado = ", ".join(directores)

Alternativamente, se podría hacer con:

In [6]:
json.loads(sopa.find('script', type='application/json').string)["props"]["pageProps"]["aboveTheFoldData"]["principalCredits"][0]["credits"][0]["name"]["nameText"]["text"]

'Gennadiy Ivanov'

Guion

In [7]:
guion = sopa.find_all("li",{"role":"presentation", "class":"ipc-metadata-list__item"})[1].find_all("a")
guionistas = [guion_element.text for guion_element in guion]
guion_formateado = ", ".join(guionistas)
guion_formateado

'Gennadiy Ivanov, Aleksandr Stashkov'

Argumento

In [8]:
plot = json.loads(sopa.find('script', type='application/json').string)["props"]["pageProps"]["aboveTheFoldData"]["plot"]["plotText"]["plainText"]

plot

'Criminal Investigation Inspector Arkady Ershov is investigating a case related to the Moscow mafia.'

Duración

In [9]:
# duracion
duracion = sopa.find("meta",property="og:description")["content"]

def convertir_horas_minutos(duracion):
    # extraer hora
    patron_hora = r"(\d{0,3})h"
    hora = int(re.findall(patron_hora, duracion)[0])

    # extraer minutos
    patron_minutos = r"(\d{0,2})m"
    minutos = int(re.findall(patron_minutos, duracion)[0])

    # duracion total minutos
    return hora*60 + minutos

convertir_horas_minutos(duracion)


92

In [10]:
sopa.find_all("meta")

# puntuacion alternativa
sopa.find_all("meta",property="og:title") # aplicar regex extract \d.\d

[<meta content="Shtemp (1991) ⭐ 5.6 | Acción" property="og:title"/>]

In [11]:
json.loads(sopa.find('script', type='application/json').string)["props"]["pageProps"]

{'tconst': 'tt4495760',
 'aboveTheFoldData': {'id': 'tt4495760',
  'productionStatus': {'currentProductionStage': {'id': 'released',
    'text': 'Estrenada',
    '__typename': 'ProductionStage'},
   'productionStatusHistory': [{'status': {'id': 'released',
      'text': 'Estrenada',
      '__typename': 'ProductionStatus'},
     '__typename': 'ProductionStatusHistory'}],
   'restriction': None,
   '__typename': 'ProductionStatusDetails'},
  'canHaveEpisodes': False,
  'series': None,
  'titleText': {'text': 'Shtemp', '__typename': 'TitleText'},
  'titleType': {'displayableProperty': {'value': {'plainText': '',
     '__typename': 'Markdown'},
    '__typename': 'DisplayableTitleTypeProperty'},
   'text': 'Película',
   'id': 'movie',
   'isSeries': False,
   'isEpisode': False,
   'categories': [{'value': 'movie', '__typename': 'TitleTypeCategory'}],
   'canHaveEpisodes': False,
   '__typename': 'TitleType'},
  'originalTitleText': {'text': 'Shtemp', '__typename': 'TitleText'},
  'certifi

In [12]:
sopa.find_all("div",{"class":"ipc-html-content ipc-html-content--base"})

[]

## 2.2 Realizar peticiones para los ids unicos:

Creacion de funciones asincronas para las consultas de los ids.

In [31]:
async def request_movies_imdb_async(url, headers):
    async with aiohttp.ClientSession() as session:
        async with session.get(url=url,headers=headers) as response:
            if response.status == 500:
                print(f"Max operation time for url {url}. Retrying...")
                await asyncio.sleep(3)  # Wait for 3 seconds before retrying
                return await request_movies_imdb_async(url, headers)  # Retry the same request
            elif response.status == 200:
                results = await response.text()
            #         results = await response.json()
            #         pagina = results["page"]
            #         results = results["results"]
            #         for result in results:
            #             result["pagina"] = pagina
            #             result["genre"] = querystring["genre"]

            #     except KeyError as e:
            #         print(f"KeyError with querystring {querystring}: {e}")
            #         return []  # returning an empty list for consistency
            #     except TypeError as e:
            #         print(f"TypeError with querystring {querystring}: {e}")
            #         return []
            #     except Exception as e:
            #         print(f"Unexpected error with querystring {querystring}: {e}")
            #         return []
            # else:
            #     print(f"Request failed with status {response.status} for {querystring}")
            #     return []
            #     raise ValueError(f"HTTP Error: {response.status}")
                return results


async def gather_with_concurrency(n, *coros):
    semaphore = asyncio.Semaphore(n)

    async def sem_coro(coro):
        async with semaphore:
            return await coro
    return await asyncio.gather(*(sem_coro(c) for c in coros))     

async def get_movies_info(url_list, headers): 

    tasks = [request_movies_imdb_async(url, headers) for url in url_list]
    results = await gather_with_concurrency(64,*tasks)

    return results

Prueba con 2 urls.

In [32]:
urls = ["https://www.imdb.com/es-es/title/tt4495760/?ref_=fn_all_ttl_1","https://www.imdb.com/es-es/title/tt0252196/"]
resultados = await get_movies_info(urls,headers)
resultados

[None, None]

Funciones de parseado y formateado de información.

In [18]:
def parse_with_bs(content):
    sopa = BeautifulSoup(content, "html.parser")

    try:
        page_props = json.loads(sopa.find('script', type='application/json').string)["props"]["pageProps"]
        puntuacion =  get_score(page_props)
        plot = get_plot(page_props)
    except:
        puntuacion =  None
        plot = None


    informacion_pelicula = {
        "id": page_props["tconst"],
        "puntuacion": puntuacion,
        "argumento": plot,
        "direccion": get_direccion(sopa),
        "guionistas": get_guionistas(sopa),
        "duracion": get_duracion(sopa)

    }
    
    return informacion_pelicula

def parse_all_contents(list_of_contents):
    results = Parallel(n_jobs=-1)(
        delayed(parse_with_bs)(content)
        for content in tqdm(list_of_contents)
    )

    return results 

def get_score(page_props):

    return page_props["aboveTheFoldData"]["ratingsSummary"]["aggregateRating"]

def get_direccion(sopa):
    try:
        direccion = sopa.find_all("li",{"role":"presentation", "class":"ipc-metadata-list__item"})[0].find_all("a")
        directores = [director_element.text for director_element in direccion]
        direccion_formateado = ", ".join(directores)
    except:
        direccion_formateado = None

    return direccion_formateado

def get_guionistas(sopa):
    try: 
        guion = sopa.find_all("li",{"role":"presentation", "class":"ipc-metadata-list__item"})[1].find_all("a")
        guionistas = [guion_element.text for guion_element in guion]
        guion_formateado = ", ".join(guionistas)
    
    except:
        guion_formateado = None

    return guion_formateado

def get_plot(page_props):

    return page_props["aboveTheFoldData"]["plot"]["plotText"]["plainText"]

def convertir_horas_minutos(duracion):
    # extraer hora
    patron_hora = r"(\d{0,3})h"
    hora = int(re.findall(patron_hora, duracion)[0])

    # extraer minutos
    patron_minutos = r"(\d{0,2})m"
    minutos = int(re.findall(patron_minutos, duracion)[0])

    # duracion total minutos
    return hora*60 + minutos

def get_duracion(sopa):
    duracion = sopa.find("meta",property="og:description")["content"]

    duracion_formateada = convertir_horas_minutos(duracion)

    return duracion_formateada


Prueba de resultados

In [19]:
pd.DataFrame(parse_all_contents(resultados))

100%|██████████| 2/2 [00:00<00:00, 220.94it/s]


Unnamed: 0,id,puntuacion,argumento,direccion,guionistas,duracion
0,tt4495760,5.6,Criminal Investigation Inspector Arkady Ershov...,Gennadiy Ivanov,"Gennadiy Ivanov, Aleksandr Stashkov",92
1,tt0252196,8.4,A group of children use a time machine to inve...,Singeetam Srinivasa Rao,"Guión, Jandhyala, Singeetam Srinivasa Rao,",140


### 2.2.2 Aplicación a todos los ids unicos

In [20]:
peliculas_df = pd.read_csv("../data/movies_total.csv",index_col=0)
peliculas_df.head()

Unnamed: 0,pagina,id,tipo_titulo,titulo,genero,año,mes
0,1,tt0059325,movie,Jahrgang 45,Drama,1990,10.0
1,1,tt0059900,movie,"Wenn du groß bist, lieber Adam",Drama,1990,10.0
2,1,tt0065188,movie,"Vojtech, receny sirotek",Drama,1990,10.0
3,1,tt0068494,movie,Domo Arigato,Drama,1990,8.0
4,1,tt0075259,movie,Spy Story,Drama,1990,3.0


Obtencion de lista de urls:

In [21]:
lista_de_ids = peliculas_df["id"].to_list()
lista_de_urls = []

for id in lista_de_ids:
    url = f"https://www.imdb.com/es-es/title/{id}/"
    lista_de_urls.append(url)

lista_de_urls[:5]

['https://www.imdb.com/es-es/title/tt0059325/',
 'https://www.imdb.com/es-es/title/tt0059900/',
 'https://www.imdb.com/es-es/title/tt0065188/',
 'https://www.imdb.com/es-es/title/tt0068494/',
 'https://www.imdb.com/es-es/title/tt0075259/']

Obtencion de resultados de consultas:

In [39]:
resultados_consultas = await get_movies_info(lista_de_urls[:100],headers)
resultados_consultas

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

Filtrando por peliculas de 'Horror'

In [45]:
peliculas_df

Unnamed: 0,pagina,id,tipo_titulo,titulo,genero,año,mes
9337,12,tt19797422,short,Resarcimiento,Action,2022,
16602,21,tt18849138,short,The Black Hand,Horror,2022,3.0
1556,2,tt14405434,short,Something Wonderful,Thriller,2021,11.0
11578,15,tt19466142,movie,Meatgrinder,Thriller,2021,11.0
1852,28,tt0312843,movie,Jisatsu sâkuru,Mystery,2021,5.0
...,...,...,...,...,...,...,...
19265,25,tt0132408,movie,Party Plane,Comedy,1989,2.0
73,1,tt0096075,movie,Sexpot,Comedy,1988,11.0
3065,4,tt27036639,short,Gloomy Sunday,Mystery,1988,10.0
12084,16,tt0092984,movie,Esto es un atraco,Comedy,1987,5.0


In [55]:
peliculas_df.sort_values(by="año", inplace=True, ascending=False)
lista_de_ids = peliculas_df.loc[(peliculas_df["genero"]=="Horror") & (peliculas_df["año"] == 2000),"id"].to_list()
lista_de_urls = []

for id in lista_de_ids:
    url = f"https://www.imdb.com/es-es/title/{id}/"
    lista_de_urls.append(url)

len(lista_de_urls)

286

In [56]:
resultados_consultas = await get_movies_info(lista_de_urls,headers)
resultados_consultas

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

Las consultas son demasiadas