# Imports

In [30]:
import os

import pandas as pd
# Selenium is a web testing library. It is used to automate browser activities. (Dynamic web pages)
from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


# BeautifulSoup is a Python library for pulling data out of HTML and XML files. It creates parse trees that is helpful to extract the data easily.
from bs4 import BeautifulSoup
import requests

from useful import *

# Connect to IMDb web

In [44]:
# Connect to the web: start an Edge session through the "msedgedriver.exe"
# driver, maximize the window and load the IMDb home page. The resulting
# `browser` object is reused by every navigation cell below.
# NOTE(review): recent Selenium releases expect the driver path to be supplied
# through a Service object instead of a positional string — confirm against
# the Selenium version installed for this notebook.
browser = webdriver.Edge("msedgedriver.exe")
browser.maximize_window()
browser.get("https://www.imdb.com/")

# 1. Top 250 Movies

1. Switch the site to its English version
2. Reject cookies (skipped if no cookie banner shows up)
3. Click on the "Drop-Down Menu"
4. Click on "Top 250 best movies"

Data Extracted
- Movie Title
- Calification (IMDb score, e.g. 9.3)
- Number of reviews
- Year
- Duration
- Rating (content certificate, e.g. R or PG-13)

In [45]:
# Bring the site into a known state and navigate to the Top 250 chart.
# Every step waits up to 10 seconds for its target element to become clickable.
try:
    # Change to English version: open the language selector in the header...
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/div[6]/label'))
    ).click()

    # ...and click the language entry. click() returns None, so the result is
    # deliberately not stored.
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '/html/body/div[2]/nav/div[2]/div[6]/div/div/div/span/ul[1]/li[3]/span[2]'))
    ).click()
    try:
        # Reject Cookies
        WebDriverWait(browser, 10).until(
            EC.element_to_be_clickable((By.XPATH, '//*[@id="__next"]/div/div/div[2]/div/button[1]'))
        ).click()
    except TimeoutException:
        # Best effort: no cookie button became clickable in time, carry on.
        pass
    # Click on the "dropdown menu" and select "Top 250 movies"
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader-navDrawerOpen"]/span'))
    ).click()

    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[2]/span'))
    ).click()
except TimeoutException:
    # Keep the notebook running, but make the failure visible: the following
    # cell scrapes whatever page the browser is currently on.
    print(
        "WARNING: navigation to the Top 250 chart did not complete "
        "(an element was not clickable within 10 s). Current page: {}".format(browser.current_url)
    )

In [46]:
# Download the page the browser is currently showing with a plain HTTP request
# and parse it into the Top 250 table.
my_url = browser.current_url
# Browser-like headers; Accept-Language asks for the English version of the page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 Edg/123.0.2420.65',
    'Accept-Language': 'en-US,en;q=0.9'}

# The timeout stops the cell from hanging forever on a stalled connection and
# raise_for_status() fails loudly on an HTTP error instead of parsing an error page.
response = requests.get(my_url, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.content, "html.parser")
data = extract_top250movie_data(soup)
df = pd.DataFrame(data, columns=['Movie', 'Calification', 'N.Reviews', 'Year', 'Duration', 'Rating'])

In [47]:
# Preview the first rows of the Top 250 table built in the previous cell.
df.head()

Unnamed: 0,Movie,Calification,N.Reviews,Year,Duration,Rating
0,The Shawshank Redemption,9.3,2.9M,1994,2h 22m,R
1,The Godfather,9.2,2M,1972,2h 55m,R
2,The Dark Knight,9.0,2.9M,2008,2h 32m,PG-13
3,The Godfather Part II,9.0,1.4M,1974,3h 22m,R
4,12 Angry Men,9.0,862K,1957,1h 36m,Approved


In [48]:
# Save CSV
# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs('../Outputs', exist_ok=True)
df.to_csv('../Outputs/top_250.csv', index=False)

# 2. Upcoming Movie Release
1. Click on the "Drop-Down Menu"
2. Click on "Release Schedule"

In [49]:
# Open the header drop-down menu, then follow its "Release Schedule" entry.
# Each target is clicked as soon as it becomes clickable (10 s limit per step).
release_schedule_clicks = (
    '//*[@id="imdbHeader-navDrawerOpen"]/span',  # Menu
    '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[1]/span/div/div/ul/a[1]/span',  # Release Schedule
)
for xpath in release_schedule_clicks:
    clickable = EC.element_to_be_clickable((By.XPATH, xpath))
    WebDriverWait(browser, 10).until(clickable).click()


__Information obtained:__
* Release Date
* Movie Title
* Genres

### Spain, Germany, Italy and the United Kingdom

In [50]:
# Two-letter country code -> XPath of its <option> inside the element with id
# "country-selector". Options are addressed by their position in the list.
countries = {
    code: '//*[@id="country-selector"]/option[{}]'.format(position)
    for code, position in (('ES', 96), ('DE', 44), ('IT', 56), ('GB', 105))
}

In [51]:
# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs('../Outputs', exist_ok=True)

# Scrape the release calendar once per configured country and write one CSV
# per country. `df2` is overwritten on every pass, so after the loop it holds
# the table of the last country in `countries`.
for country_code, option_xpath in countries.items():
    # Select the current country in the country drop-down
    WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, option_xpath))
    ).click()

    # New releases table to html
    # NOTE(review): this only waits for the section to be present. If the page
    # swaps its content asynchronously after the click, the previous country's
    # table could still be read here — confirm.
    element = WebDriverWait(browser, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/main/div/div[3]/section/section'))
    )
    table_html = element.get_attribute('outerHTML')
    soup = BeautifulSoup(table_html, "html.parser")

    df2 = extract_newreleases_data(soup)
    df2.to_csv('../Outputs/new_movies_{}.csv'.format(country_code), index=False)

In [52]:
# First rows of `df2`. The scraping loop reassigns `df2` on every pass, so
# this shows only the last country processed, not all of them.
df2.head()

Unnamed: 0,Date,Title,Genres
0,"Apr 11, 2024",Aavesham (2024),"[Action, Hipzster]"
1,"Apr 12, 2024",Civil War (2024),[Action]
2,"Apr 12, 2024",Back to Black (2024),"[Biography, Drama, Music]"
3,"Apr 12, 2024",Arcadian (2024),"[Action, Horror, Thriller]"
4,"Apr 12, 2024",Bleeding Love (2023),[Drama]


In [53]:
# Last rows of `df2` (the table of the last country processed by the loop).
df2.tail()

Unnamed: 0,Date,Title,Genres
77,"Feb 14, 2025",The Smurfs Musical (2025),"[Animation, Adventure, Comedy, Rihanna]"
78,"Feb 21, 2025",The Unbreakable Boy (2025),[Drama]
79,"Mar 22, 2025",Snow White (2025),"[Adventure, Drama, Family]"
80,"Apr 04, 2025",Minecraft (2025),"[Action, Adventure, Family]"
81,"Apr 04, 2025",Fast X: Part 2 (2025),"[Action, Crime, Thriller]"


# 3. Academy Awards: Best Motion Picture (Nominees and Winners)
1. Click on the "Drop-Down Menu"
2. Click on "Academy Awards"
3. Click on Winners

In [54]:
# Walk through the four clicks that lead to the awards page, in order. Each
# target is clicked as soon as it becomes clickable (10 s limit per step).
awards_click_path = (
    # Click on the menu again
    '//*[@id="imdbHeader-navDrawerOpen"]/span',
    # Click on Academy Awards
    '//*[@id="imdbHeader"]/div[2]/aside[1]/div/div[2]/div/div[3]/span/div/div/ul/a[1]/span',
    # Two further links on the pages that follow (not labelled in the original cell)
    '//*[@id="__next"]/main/div/div[3]/div[2]/div/div[2]/ul/a[2]/span ',
    '//*[@id="sidebar"]/div[1]/div/div[2]/div[4]/span[2]/a',
)
for xpath in awards_click_path:
    clickable = EC.element_to_be_clickable((By.XPATH, xpath))
    WebDriverWait(browser, 10).until(clickable).click()



In [None]:
# Extract the awards table with the live browser session, then close the
# browser. The finally clause guarantees the Edge process is shut down even
# when the extraction raises, so no orphaned browser/driver is left behind.
try:
    df_awards = extract_accademyawards_data(browser)
finally:
    browser.quit()

In [71]:
# Lift pandas' row limit so the frame below is rendered without row
# truncation. The option is session-wide and stays in effect for later cells.
pd.options.display.max_rows = None
df_awards

Unnamed: 0,Year,Genres
0,2005,"[Biography, Drama, Biography, Drama, Family, C..."
1,2006,"[Biography, Drama, Biography, Drama, Family, C..."
2,2007,"[Biography, Drama, History, Drama, Romance, Cr..."
3,2008,"[Action, Adventure, Drama, Crime, Drama, Thril..."
4,2009,"[Drama, Mystery, Romance, Drama, Comedy, Drama..."
5,2010,"[Crime, Drama, Romance, Biography, Drama, Hist..."
6,2011,"[Action, Adventure, Fantasy, Drama, Drama, Thr..."
7,2012,"[Comedy, Drama, Romance, Biography, Drama, Dra..."
8,2013,"[Drama, Drama, Fantasy, Comedy, Fantasy, Roman..."
9,2014,"[Drama, Musical, Romance, Adventure, Drama, Fa..."


In [74]:
# Save the awards table as CSV.
# to_csv() does not create missing folders, so make sure the target exists.
os.makedirs('../Outputs', exist_ok=True)
df_awards.to_csv('../Outputs/awards.csv', index=False)