# Imports

In [1]:
import pandas as pd
# Selenium is a web testing library. It is used to automate browser activities. (Dynamic web pages)
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# BeautifulSoup is a Python library for pulling data out of HTML and XML files. It builds a parse tree that makes it easy to extract the data.
from bs4 import BeautifulSoup
import requests

from useful import *

# Connect to the IMDb website

In [2]:
# Start an Edge session through the msedgedriver.exe driver binary, maximise
# the window and load the IMDb landing page.
# NOTE(review): passing the driver path positionally depends on the installed
# Selenium version — confirm it matches the project's pinned Selenium release.
EDGE_DRIVER_PATH = "msedgedriver.exe"
IMDB_HOME_URL = "https://www.imdb.com/"

browser = webdriver.Edge(EDGE_DRIVER_PATH)
browser.maximize_window()
browser.get(IMDB_HOME_URL)

# 1. Top 250 Movies

1. Reject Cookies
2. Click on the "Drop-Down Menu"
3. Click on "Top 250 best movies"

Data Extracted
- Movie Title
- Calification (score)
- Number of reviews
- Year
- Duration
- Rating

In [10]:
# Navigate from the current page to the "Top 250" chart through the header menu.
#
# NOTE: a language-switch step (to the English version) used to live here but
# is disabled, so the site is scraped in whatever language IMDb serves.

# Reject cookies. Best effort: if no clickable consent button shows up within
# 10 seconds, the timeout is ignored and navigation proceeds.
try:
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/div/div/div[2]/div/button[1]'))
    ).click()
except TimeoutException:
    pass

# Remember where we are so we can verify that the clicks really navigated away.
url_before_navigation = browser.current_url

try:
    # Open the drop-down (navigation drawer) menu.
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]/span'))
    ).click()

    # Click the second link of the first menu group (the Top 250 entry).
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[2]/span'))
    ).click()

    # The next cell reads browser.current_url, so block until the URL has
    # actually changed instead of racing the page load.
    WebDriverWait(browser, 10).until(EC.url_changes(url_before_navigation))
except TimeoutException as exc:
    # Do not swallow this: continuing would scrape the wrong page silently.
    raise TimeoutException(
        "Could not navigate to the Top 250 page from {}".format(url_before_navigation)
    ) from exc

In [11]:
# Download the page the browser is currently on with `requests` and parse it
# into the Top 250 DataFrame.
my_url = browser.current_url

# Browser-like User-Agent sent with the plain HTTP request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from parsing an HTTP error page as if it were data.
response = requests.get(my_url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")
data = extract_top250movie_data(soup)
df = pd.DataFrame(data, columns=['Movie', 'Calification', 'N.Reviews', 'Year', 'Duration', 'Rating'])

In [12]:
# Write the Top 250 table to disk without the DataFrame index column.
TOP_250_CSV = '../Outputs/top_250.csv'
df.to_csv(TOP_250_CSV, index=False)

# 2. Upcoming Movie Release
1. Click on the "Drop-Down Menu"
2. Click on "Release Schedule"

In [40]:
# Click through the header menu to the release schedule: each element is
# clicked as soon as it becomes clickable (10 s timeout per step).
release_schedule_click_path = (
    # Menu
    '//*[@id="imdbHeader-navDrawerOpen"]/span',
    # Release Schedule
    '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[1]/span',
)
for release_xpath in release_schedule_click_path:
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, release_xpath))
    ).click()


__Information obtained:__
* Release Date
* Movie Title
* Genres

### Spain, Germany, Italy and United Kingdom

In [41]:
# Position of each country's <option> inside the "country-selector" element,
# keyed by country code.
COUNTRY_OPTION_INDEX = {'ES': 38, 'DE': 4, 'IT': 61, 'GB': 91}

# Country code -> XPath of the matching <option> in the country selector.
countries = {
    code: '//*[@id="country-selector"]/option[{}]'.format(position)
    for code, position in COUNTRY_OPTION_INDEX.items()
}

In [44]:
# For every configured country: pick it in the country selector, grab the
# releases section as HTML, parse it and save one CSV per country.
RELEASES_SECTION_XPATH = '//*[@id="__next"]/main/div/div[3]/section/section'

for country_code, option_xpath in countries.items():
    # Click this country's entry in the selector
    country_option = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, option_xpath))
    )
    country_option.click()

    # New releases table to html
    # NOTE(review): this only waits for the section to be present; if the
    # previous country's section is still in the DOM right after the click it
    # may be read instead — confirm the page has refreshed before trusting it.
    releases_section = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.XPATH, RELEASES_SECTION_XPATH))
    )
    soup = BeautifulSoup(releases_section.get_attribute('outerHTML'), "html.parser")

    df2 = extract_newreleases_data(soup)
    df2.to_csv('../Outputs/new_movies_{}.csv'.format(country_code), index=False)

In [27]:
# Preview the first five rows of the most recently extracted releases table.
# NOTE(review): in the saved output, rows that share a date show the same
# Genres list — presumably genres are collected per date rather than per
# title inside extract_newreleases_data; verify against that helper.
df2.head(n=5)

Unnamed: 0,Date,Title,Genres
0,05 abr 2024,La primera profecía (2024),"[Terror, Comedia, Crimen, Drama, Acción, Comed..."
1,05 abr 2024,Pequeñas cartas indiscretas (2023),"[Terror, Comedia, Crimen, Drama, Acción, Comed..."
2,05 abr 2024,Freelance (2023),"[Terror, Comedia, Crimen, Drama, Acción, Comed..."
3,05 abr 2024,Johnny Puff. Misión secreta (2024),"[Terror, Comedia, Crimen, Drama, Acción, Comed..."
4,05 abr 2024,El consentimento (2023),"[Terror, Comedia, Crimen, Drama, Acción, Comed..."


In [24]:
# Preview the last five rows of the most recently extracted releases table.
df2.tail(n=5)

Unnamed: 0,Date,Title,Genres
127,17 ene 2025,M3GAN 2.0 (2025),"[Terror, Suspense]"
128,31 ene 2025,Mickey 17 (2025),"[Aventura, Drama]"
129,14 feb 2025,Captain America: Brave New World (2025),"[Acción, Aventura, Romance]"
130,14 feb 2025,Sigue mi voz (2025),"[Acción, Aventura, Romance]"
131,22 mar 2025,Snow White (2025),"[Aventura, Drama, Familiar]"


In [29]:
# Save the releases table currently held in df2, without the index column.
# NOTE(review): df2 is reassigned on every pass of the per-country loop, so
# this file only contains the last frame assigned — confirm that is intended.
NEW_MOVIES_CSV = '../Outputs/new_movies.csv'
df2.to_csv(NEW_MOVIES_CSV, index=False)

# 3. Academy Awards: Best Motion Picture (Nominees and Winners)
1. Click on the "Drop-Down Menu"
2. Click on "Academy Awards"
3. Click on Winners

In [3]:
# Click through the header menu to the awards page: each element is clicked
# as soon as it becomes clickable (10 s timeout per step).
awards_click_path = (
    # Click on the menu again
    '//*[@id="imdbHeader-navDrawerOpen"]/span',
    # Click on Academy Awards
    '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[3]/span/div/div/ul/a[1]/span',
    # Second link of the list on the awards page (presumably "Winners" — confirm)
    '//*[@id="__next"]/main/div/div[3]/div[2]/div/div[2]/ul/a[2]/span ',
)
for awards_xpath in awards_click_path:
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, awards_xpath))
    ).click()


In [4]:
# Build the awards table from the page the browser is currently showing.
df_awards = extract_accademyawards_data(browser)

In [5]:
# Preview the first five rows of the awards table.
df_awards.head(n=5)

Unnamed: 0,Year,Genres
0,2005,"[Comedia, Drama, Crimen, Drama, Historia, Come..."
1,2006,"[Biografía, Drama, Drama, Deporte, Biografía, ..."
2,2007,"[Drama, Romance, Biografía, Drama, Historia, A..."
3,2008,"[Crimen, Drama, Suspense, Drama, Comedia, Dram..."
4,2009,"[Comedia, Drama, Crimen, Drama, Suspense, Dram..."


In [6]:
# Preview the last five rows of the awards table.
df_awards.tail(n=5)

Unnamed: 0,Year,Genres
15,2020,"[Acción, Aventura, Ciencia ficción, Drama, Mús..."
16,2021,"[Drama, Romance, Crimen, Drama, Suspense, Acci..."
17,2022,"[Biografía, Comedia, Drama, Crimen, Drama, Mis..."
18,2023,"[Comedia, Drama, Romance, Drama, Crimen, Drama..."
19,2024,"[Acción, Aventura, Fantasía, Drama, Comedia, D..."


In [7]:
# Write the awards table to disk without the DataFrame index column.
AWARDS_CSV = '../Outputs/awards.csv'
df_awards.to_csv(AWARDS_CSV, index=False)