In [1]:
import re
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup # class for web scraping. It allows you to interact with HTML in a similar way to how you would interact with a web page using developer tools.
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

In [100]:
def extract_movie_data(soup):

    movie_data = []
    #find_all busca todos los elementos que coincidan con los criterios de búsqueda y los guarda en una lista
    movie_names = soup.find_all('h3', class_='ipc-title__text')
    movie_cal = soup.find_all('span', class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating')
    movie_reviews = soup.find_all('span', class_='ipc-rating-star--voteCount')
    movie_numbers = soup.find_all('div', class_="sc-b0691f29-7 hrgukm cli-title-metadata")

    # Rest of the code

    for name, cal, reviews, numbers in zip(movie_names[1:], movie_cal, movie_reviews, movie_numbers):
        movie_name = name.text.strip()
        name_index=movie_name.find(" ")

        #Se procesan algunos datos
        if reviews == 0.0:
            movie_reviews = 0.0
        else:
            movie_reviews = reviews.text.strip().replace(')', '').replace('(', '')

        if cal == 0.0:
            movie_cal_str = 0.0
        else:
            movie_cal_str = cal.text.split()[0]

        movie_numbers = numbers.text.strip()
        #print(len(movie_numbers))

        year = movie_numbers[0:4]

        #procesar duracion de la película enfunción de la posicion de m, los minutos
        duration_index = movie_numbers.find("m")
        if duration_index != -1:
            duration = movie_numbers[4:duration_index+1]
            rating = movie_numbers[duration_index+1:].strip()

        #si no hay regsitrados minutos, es decir, si la película dura 1 o 2 horas exactas por ejemplo, no hay m así que
        #se procesa a partir de h y de la posición del "." de la puntuación de la película.
        #Aunque la puntuación de la película sea un numero entero se registra como decimal ("7.0") así que funciona siempre
        else:
            duration_index = movie_numbers.find("h")
            rate_index = movie_numbers.find(".")
            duration = movie_numbers[duration_index-1:rate_index-1]
            rating = movie_numbers[rate_index-1: ]


        try:
            movie_cal = float(movie_cal_str)
            movie_data.append((movie_name[name_index+1:], movie_cal, movie_reviews, year, duration, rating))
        except ValueError:
            print(f"Could not convert rating '{movie_cal_str}' for movie '{movie_name}' to a float.")

    return movie_data

In [101]:
browser = webdriver.Edge("msedgedriver.exe")
browser.maximize_window()
browser.get("https://www.imdb.com/")

# access the web and accept cookies automatically: //*[@id="__next"]/div/div/div[2]/div/button[2]
try:
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/div/div/div[2]/div/button[2]'))
    ).click()
    # Clicar en desplegable //*[@id="imdbHeader-navDrawerOpen"]/span
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]/span'))
    ).click()
    
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[2]/span'))
    ).click()
except TimeoutException:
    pass

my_url = browser.current_url
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

soup = BeautifulSoup(requests.get(my_url, headers=headers).content, "html.parser")
data = extract_movie_data(soup)
df = pd.DataFrame(data, columns=['Movie', 'Calification', 'N.Reviews', 'Year', 'Duration', 'Rating'])

In [104]:
df.to_csv('../Outputs/top_250_by_duration.csv', index=False)

In [105]:
# Pinchar otra vez en menú
WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]/span'))
    ).click()
# Pinchar en Calendario de lanzamientos
WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[1]/span'))
    ).click()
# Tabla de lanzamientos
element = WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/main/div/div[3]/section/section'))
)


table_html = element.get_attribute('outerHTML')
soup = BeautifulSoup(table_html, "html.parser")

# Buscar cada tabla: //*[@id="__next"]/main/div/div[3]/section/section/article[1]

# create a dataframe with movie and release date
df2 = pd.DataFrame(columns=['Date', 'Title', 'Genre'])

tables = soup.find_all('article')
for table in tables:
    date = table.find('h3', class_='ipc-title__text').text
    titles = [title.text for title in table.find_all('a', class_='ipc-metadata-list-summary-item__t')]
    genre = [genre.text for genre in table.find_all('span', class_='ipc-metadata-list-summary-item__li')[0]]
    # Add to dataframe
    for title in titles:
        df2.loc[len(df2)] = [date, title, genre]

In [106]:
df2.tail()

Unnamed: 0,Date,Title,Genre
124,17 ene 2025,M3GAN 2.0 (2025),[Terror]
125,31 ene 2025,Mickey 17 (2025),[Aventura]
126,14 feb 2025,Captain America: Brave New World (2025),[Acción]
127,14 feb 2025,Sigue mi voz (2025),[Acción]
128,22 mar 2025,Snow White (2025),[Aventura]


In [107]:
df2.to_csv('../Outputs/new_movies.csv', index=False)

In [109]:
# Pinchar otra vez en menú
WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]/span'))
    ).click()
# Pinchar en Calendario de lanzamientos
WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[3]/span/div/div/ul/a[1]/span'))
    ).click()

WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/main/div/div[3]/div[2]/div/div[2]/ul/a[2]/span '))
    ).click()
WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="sidebar"]/div[1]/div/div[2]/div[16]/span[6]/a'))
    ).click()

In [110]:
df_awards = pd.DataFrame(columns=['Year', 'Genres'])
for i in range(2005,2025):
    my_url = browser.current_url
    my_url = my_url[0:37]+str(i)+my_url[41:]
    print(my_url)
    # WebDriverWait(browser, 10).until(
    #     EC.element_to_be_clickable((By.XPATH, '//*[@id="center-3-react"]/div/div/div[1]/h3/div[1]/div[2]/div[1]/div[1]/div[2]/div[2]/div[1]/span/span/a'))
    # ).click()
    # TAbla best picture
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="center-3-react"]/div/div/div[1]/h3/div[1]'))
    )
    table_html = element.get_attribute('outerHTML')
    soup = BeautifulSoup(table_html, "html.parser")
    refs = []
    for a in soup.find_all('a', href=True):
        refs.append(a['href'])
    refs = list(set( [ref for ref in refs if 'title' in ref]))
    genres_year = []
    for ref in refs:
        new_url = my_url[:21]+ref
        browser.get(new_url)
        tab = WebDriverWait(browser, 100).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[1]/div[2]'))
        )
        tab_html = tab.get_attribute('outerHTML')
        soup = BeautifulSoup(tab_html, "html.parser")
        genres = [genre.text for genre in soup.find_all('span', class_='ipc-chip__text')]
        for g in genres:
            genres_year.append(g)
    browser.get(my_url)

    df_awards.loc[len(df_awards)] = [i, genres_year]

    

    

https://www.imdb.com/event/ev0000003/2005/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2006/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2007/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2008/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2009/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2010/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2011/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2012/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2013/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2014/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2015/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2016/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2017/1/?ref_=ev_eh
https://www.imdb.com/event/ev0000003/2018/1/?ref_=ev_eh


WebDriverException: Message: disconnected: Unable to receive message from renderer
  (failed to check if window was closed: disconnected: not connected to DevTools)
  (Session info: MicrosoftEdge=123.0.2420.65)


In [23]:
df_awards.tail()

Unnamed: 0,Year,Genres
15,2020,"[Drama, Biografía, Comedia, Drama, Biografía, ..."
16,2021,"[Acción, Biografía, Drama, Biografía, Crimen, ..."
17,2022,"[Drama, Misterio, Drama, Historia, Suspense, B..."
18,2023,"[Crimen, Drama, Musical, Comedia, Drama, Cienc..."
19,2024,"[Biografía, Drama, Música, Acción, Drama, Dram..."


In [26]:
df_awards.to_csv('../Outputs/awards.csv', index=False)