# PROJECT 3
## Deliverable 1: Scrape Golden Globes movies from Golden Globe Award site

In [21]:
import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup as soup

async def scrape_website():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        for x in range(1, 25):
            print('page:', x, '-----------------------')

            # Navigate to the page
            await page.goto('https://goldenglobes.com/awards-database/')

            # Get the current page's HTML content
            html_content = await page.content()

            # Parse HTML content using BeautifulSoup
            goldenglobes_soup = soup(html_content, 'html.parser')

            # Find text elements on the current page
            text_elements = goldenglobes_soup.find_all('div', class_='c-award-database-item')

            # Extract data from text elements
            for movie in text_elements:
                title = movie.find('h4', class_='c-award-database-item__title u-type-h9').text.strip()
                print(title)

            # Click on the next page button (with waiting)
            try:
                await page.click('//button[@aria-label="Go to Next Page"]')
                await page.wait_for_url('https://goldenglobes.com/awards-database/')
            except Exception as e:
                print("Error:", e)
                break

        # After scraping, close the browser
        await browser.close()

# Run the scraping function
await scrape_website()

page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 2 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 3 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 4 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galax

In [23]:
import nest_asyncio
nest_asyncio.apply()

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup as soup

async def scrape_website():
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()

        for x in range(1, 7):
            print('page:', x, '-----------------------')

            # Navigate to the page
            await page.goto('https://goldenglobes.com/awards-database/')

            # Get the current page's HTML content
            html_content = await page.content()

            # Parse HTML content using BeautifulSoup
            goldenglobes_soup = soup(html_content, 'html.parser')

            # Find text elements on the current page
            text_elements = goldenglobes_soup.find_all('div', class_='c-award-database-item')

            # Extract data from text elements
            for movie in text_elements:
                title = movie.find('h4', class_='c-award-database-item__title u-type-h9').text.strip()
                print(title)

            # Click on the next page button (with waiting)
            try:
                # Assuming 'page' is the Playwright page object
                await page.click('#award-database > div > div.o-container > div > div.c-award-database__pagination > nav > ul > li:nth-child(7) > button')


            except Exception as e:
                print("Error:", e)
                break

        # After scraping, close the browser
        await browser.close()

# Run the scraping function
await scrape_website()


page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 2 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 3 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 4 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galax

In [25]:
# Import Splinter and BeautifulSoup
from urllib.request import urlopen as uReq
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd 
import time 

In [26]:
from splinter import Browser
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

my_service = Service()
browser = Browser('chrome', service=my_service)

### Step 1: Visit the Website

1. Use automated browsing to visit the [Golden Globe Awards site](https://goldenglobes.com/). Inspect the page to identify which elements to scrape.

      > **Hint** To identify which elements to scrape, you might want to inspect the page by using Chrome DevTools.

In [27]:
# Visit the Golden Globe Awards site (I. Vitkovych)
goldenglobes_url = 'https://goldenglobes.com/awards-database/'
browser.visit(goldenglobes_url)

In [28]:
goldenglobes_resp = requests.get(goldenglobes_url) #get the content of the url from the request package. (I. Vitkovych)
print(goldenglobes_resp.text[:250]) # look at the snapshot of the content


<!doctype html>
<html lang="en-US">

<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<link rel="profile" href="https://gmpg.org/xfn/11">
	<meta name='robots' content='index, follow, max-image-pre


### Step 2: Scrape the Website

Create a Beautiful Soup object and use it to extract text elements from the website.

In [29]:
# Create a Beautiful Soup object (I. Vitkovych)
html = browser.html
goldenglobes_soup = soup(html, 'html.parser')
goldenglobes_soup

<html lang="en-US"><head><meta content="Az520Inasey3TAyqLyojQa8MnmCALSEU29yQFW8dePZ7xQTvSt73pHazLFTK5f7SyLUJSo2uKLesEtEa9aUYcgMAAACPeyJvcmlnaW4iOiJodHRwczovL2dvb2dsZS5jb206NDQzIiwiZmVhdHVyZSI6IkRpc2FibGVUaGlyZFBhcnR5U3RvcmFnZVBhcnRpdGlvbmluZyIsImV4cGlyeSI6MTcyNTQwNzk5OSwiaXNTdWJkb21haW4iOnRydWUsImlzVGhpcmRQYXJ0eSI6dHJ1ZX0=" http-equiv="origin-trial"/>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="https://gmpg.org/xfn/11" rel="profile"/>
<meta content="index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1" name="robots"/>
<!-- This site is optimized with the Yoast SEO plugin v22.4 - https://yoast.com/wordpress/plugins/seo/ -->
<title>Awards Database - Golden Globes</title>
<link href="https://goldenglobes.com/awards-database/" rel="canonical"/>
<meta content="en_US" property="og:locale"/>
<meta content="article" property="og:type"/>
<meta content="Awards Database - Golden Globes" property="og:title"/>
<me

In [30]:

# Your existing code to initialize the browser and navigate to the page

# Iterate through pages and scrape data
for x in range(1, 5):
    print('page:', x, '-----------------------')
    
    # Get the current page's HTML content
    html = browser.html
    goldenglobes_soup = soup(html, 'html.parser')
    
    # Find text elements on the current page
    text_elements = goldenglobes_soup.find_all('div', class_='c-award-database-item')
    
    # Extract data from text elements
    for movie in text_elements:
        title = movie.find('h4', class_='c-award-database-item__title u-type-h9').text.strip()
        print(title)
    
    # Click on the next page button (with waiting)
    try:
        next_button = WebDriverWait(browser, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Go to Next Page"]')))
        next_button.click()
    except Exception as e:
        print("Error:", e)
        break

page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
Error: 'WebDriver' object has no attribute 'find_element'


In [31]:
# Iterate through pages and scrape data
for x in range(1, 25):
    print('page:', x, '-----------------------')
    
    # Get the current page's HTML content
    html = browser.html
    goldenglobes_soup = soup(html, 'html.parser')
    
    # Find text elements on the current page
    text_elements = goldenglobes_soup.find_all('div', class_='c-award-database-item')
    
    # Extract data from text elements
    for movie in text_elements:
        title = movie.find('h4', class_='c-award-database-item__title u-type-h9').text.strip()
        print(title)
    
    # Dismiss privacy compliance banner if present
    privacy_banner = browser.find_by_css('.c-disclaimer.js-privacy-compliance-banner')
    if privacy_banner:
        privacy_banner.first.click()
    
    # Click on the next page button (with waiting)
    try:
        next_button = WebDriverWait(browser.driver, 10).until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#award-database > div > div.o-container > div > div.c-award-database__pagination > nav > ul > li:nth-child(7) > button')))
        next_button.click()
    except Exception as e:
        print("Error:", e)
        break

page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
Error: Message: element click intercepted: Element <button class="c-paging__link" aria-label="Go to Next Page">...</button> is not clickable at point (727, 805). Other element would receive the click: <div class="c-disclaimer js-privacy-compliance-banner" style="display: block;">...</div>
  (Session info: chrome=123.0.6312.107)
Stacktrace:
0   chromedriver                        0x0000000104c64474 chromedriver + 4326516
1   chromedriver                        0x0000000104c5c93c chromedriver + 4294972
2   chromedriver                        0x0000000104888088 chromedriver + 278664
3   chromedriver                        0x00000001048d0554 chromedriver + 574804
4   chromedriver                        0x00000001048cead8 chromed

In [33]:
for x in range(1,7):
    text_elements = goldenglobes_soup.find_all('div', class_ ='c-award-database-item')
    print('page:',x,'-----------------------')
    for movie in text_elements:
        print(movie.find_all('h4', class_ ='c-award-database-item__title u-type-h9')[0].text)
    next_button = browser.find_by_xpath('//button[@aria-label="Go to Next Page"]')

# Wait for the button to be clickable
WebDriverWait(browser.driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//button[@aria-label="Go to Next Page"]')))

# Execute JavaScript to click the button
browser.execute_script("arguments[0].click();", next_button._element)

# Wait for the new page to load
time.sleep(2)  # Adjust sleep time as needed

page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 2 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 3 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie
page: 4 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galax

In [34]:
# Extract all the text elements (I. Vitkovych)
text_elements = goldenglobes_soup.find_all('div', class_ ='c-award-database-item')
text_elements

[<div class="c-award-database-item"><div class="c-award-database-item__header"><div class="c-award-database-item__thumbnail-wrapper c-award-database-item__thumbnail-wrapper--tv-show"><img class="c-award-database-item__thumbnail" height="88" src="https://goldenglobes.com/wp-content/uploads/2023/10/Succession.jpg?w=600" width="66"/></div><div><h4 class="c-award-database-item__title u-type-h9"><span><a href="https://goldenglobes.com/tv-show/succession/">Succession</a></span></h4><div class="c-award-database-item__status u-color-accent-02"><div class="c-award-database-item__winner"><svg height="11" viewbox="0 0 12 11" width="12" xmlns="http://www.w3.org/2000/svg"><path d="M2.49704 10.765L3.42826 6.87097L0.328125 4.24422L4.40663 3.91012L5.99795 0.234985L7.58927 3.91012L11.6678 4.24422L8.56764 6.87097L9.49886 10.765L5.99795 8.69125L2.49704 10.765Z" fill="currentColor"></path></svg>Winner</div></div><div class="c-award-database-item__year u-color-white-07">2024</div></div></div><div class="c-

### Step 3: Store the Results

Extract the titles, year and status text of the list of movies that you scraped. Store the scraping results in Python data structures as follows:

* Store each title-year-status in a Python dictionary. And, give each dictionary three keys: `title`, `yera` and  `status`. An example is the following:

  ```python
  {'title': 'Last of Us, The', 'year': '2024', 'status': 'Nominee'},
 {'title': 'Taylor Swift: The Eras Tour', 'year': '2024', 'status': 'Nominee'},
 {'title': 'John Wick: Chapter 4', 'year': '2024', 'status': 'Nominee'},
 {'title': 'The Super Mario Bros. Movie', 'year': '2024', 'status': 'Nominee'},
  ```

* Store all the dictionaries in a Python list.

* Print the list in your notebook.

In [35]:
# Create an empty list to store the dictionaries (I.Vitkovych)
goldenglobes_data = []

In [36]:
# Loop through the text elements
# Extract the title, year and status text from the elements
# Store each title and preview pair in a dictionary
# Add the dictionary to the list
for text_element in text_elements:
    title = text_element.find_all('h4', class_ ='c-award-database-item__title u-type-h9')[0].text
    year = text_element.find_all('div', class_ = 'c-award-database-item__year u-color-white-07')[0].text
    status = text_element.find_all('div', class_ = 'c-award-database-item__status')[0].text
    award = text_element.find_all('div', class_ = 'c-award-database-item__award')[0].text

# Store each title and preview pair in a dictionary
    movie_dict = {'title': title, 'year': year, 'status': status, 'award': award}

# Add the dictionary to the list
goldenglobes_data.append(movie_dict)



In [23]:
# Loop for clicking on the next page until there are no more pages left (I.Vitkovych) THIS CODE NOT WORKING YET
# Find the button by its class attribute
next_button = browser.find_by_css('.c-paging__link')

# Click on the button
next_button.click()

# After clicking, you can continue with your scraping or other actions

MaxRetryError: HTTPConnectionPool(host='localhost', port=54884): Max retries exceeded with url: /session/6dedb7d038641424a0cfa5809ec9800c/elements (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x309221e40>: Failed to establish a new connection: [Errno 61] Connection refused'))

In [37]:
# Print the list to confirm success (I. Vitkovych)
goldenglobes_data

[{'title': 'Barbie',
  'year': '2024',
  'status': 'Winner',
  'award': 'Cinematic and Box Office Achievement'}]

In [38]:

browser.quit() 

In [39]:
# Write the data to a CSV (I. Vitkovych)
goldenglobes_df = pd.DataFrame(goldenglobes_data)
goldenglobes_df.to_csv("goldenglobes.csv", index=False)
