# PROJECT 3
## Deliverable 1: Scrape Golden Globes movies from Golden Globe Award site

In [1]:
# Import Splinter and BeautifulSoup
from urllib.request import urlopen as uReq
import requests
from bs4 import BeautifulSoup as soup
import pandas as pd 

from splinter import Browser
from selenium import webdriver

In [2]:
browser = Browser('chrome')

### Step 1: Visit the Website

1. Use automated browsing to visit the [Golden Globe Awards site](https://goldenglobes.com/). Inspect the page to identify which elements to scrape.

      > **Hint** To identify which elements to scrape, you might want to inspect the page by using Chrome DevTools.

In [3]:
# Visit the Golden Globe Awards site (I. Vitkovych)
goldenglobes_url = 'https://goldenglobes.com/awards-database/'
browser.visit(goldenglobes_url)

In [4]:
goldenglobes_resp = requests.get(goldenglobes_url) #get the content of the url from the request package. (I. Vitkovych)
print(goldenglobes_resp.text[:250]) # look at the snapshot of the content


<!doctype html>
<html lang="en-US">

<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1">
	<link rel="profile" href="https://gmpg.org/xfn/11">
	<meta name='robots' content='index, follow, max-image-pre


### Step 2: Scrape the Website

Create a Beautiful Soup object and use it to extract text elements from the website.

In [13]:
for x in range(1,5):
    text_elements = goldenglobes_soup.find_all('div', class_ ='c-award-database-item')
    print('page:',x,'-----------------------')
    for movie in text_elements:
        print(movie.find_all('h4', class_ ='c-award-database-item__title u-type-h9')[0].text)
    browser.find_by_xpath('//button[@aria-label="Go to Next Page"]').click()

page: 1 -----------------------
Succession
Mission: Impossible - Dead Reckoning Part 1
Last of Us, The
Taylor Swift: The Eras Tour
John Wick: Chapter 4
The Super Mario Bros. Movie
Guardians of the Galaxy Vol. 3
Spider-Man: Across the Spider-Verse
Oppenheimer
Barbie


ElementClickInterceptedException: Message: element click intercepted: Element <button class="c-paging__link" aria-label="Go to Next Page">...</button> is not clickable at point (381, 629). Other element would receive the click: <div class="c-disclaimer js-privacy-compliance-banner" style="display: block;">...</div>
  (Session info: chrome=123.0.6312.106)
Stacktrace:
	GetHandleVerifier [0x00007FF7BDF37072+63090]
	(No symbol) [0x00007FF7BDEA2CC2]
	(No symbol) [0x00007FF7BDD3EC65]
	(No symbol) [0x00007FF7BDD8BB34]
	(No symbol) [0x00007FF7BDD89954]
	(No symbol) [0x00007FF7BDD87164]
	(No symbol) [0x00007FF7BDD85EF9]
	(No symbol) [0x00007FF7BDD7A708]
	(No symbol) [0x00007FF7BDDA6FDA]
	(No symbol) [0x00007FF7BDD7A00A]
	(No symbol) [0x00007FF7BDDA71F0]
	(No symbol) [0x00007FF7BDDC3412]
	(No symbol) [0x00007FF7BDDA6D83]
	(No symbol) [0x00007FF7BDD783A8]
	(No symbol) [0x00007FF7BDD79441]
	GetHandleVerifier [0x00007FF7BE3325CD+4238285]
	GetHandleVerifier [0x00007FF7BE36F72D+4488493]
	GetHandleVerifier [0x00007FF7BE367A0F+4456463]
	GetHandleVerifier [0x00007FF7BE0105B6+953270]
	(No symbol) [0x00007FF7BDEAE58F]
	(No symbol) [0x00007FF7BDEA9264]
	(No symbol) [0x00007FF7BDEA939B]
	(No symbol) [0x00007FF7BDE99BD4]
	BaseThreadInitThunk [0x00007FFBBC7A257D+29]
	RtlUserThreadStart [0x00007FFBBCF6AA58+40]


### Step 3: Store the Results

Extract the titles, year and status text of the list of movies that you scraped. Store the scraping results in Python data structures as follows:

* Store each title-year-status in a Python dictionary. And, give each dictionary three keys: `title`, `yera` and  `status`. An example is the following:

  ```python
  {'title': 'Last of Us, The', 'year': '2024', 'status': 'Nominee'},
 {'title': 'Taylor Swift: The Eras Tour', 'year': '2024', 'status': 'Nominee'},
 {'title': 'John Wick: Chapter 4', 'year': '2024', 'status': 'Nominee'},
 {'title': 'The Super Mario Bros. Movie', 'year': '2024', 'status': 'Nominee'},
  ```

* Store all the dictionaries in a Python list.

* Print the list in your notebook.

In [6]:
# Create empty list to store data
goldenglobes_data = []

# Loop through the pages
while True:
    # Parse the HTML with Beautiful Soup
    html = browser.html
    goldenglobes_soup = soup(html, 'html.parser')
    
    # Extract all the text elements
    text_elements = goldenglobes_soup.find_all('div', class_='c-award-database-item')
    
    # Loop through the text elements and extract data
    for text_element in text_elements:
        title = text_element.find('h4', class_='c-award-database-item__title u-type-h9').text
        year = text_element.find('div', class_='c-award-database-item__year u-color-white-07').text
        status = text_element.find('div', class_='c-award-database-item__status').text
        award = text_element.find('div', class_='c-award-database-item__award').text
        
        # Store each item in a dictionary
        movie_dict = {
            'title': title,
            'year': year,
            'status': status,
            'award': award
        }
        
        # Add the dictionary to the list
        goldenglobes_data.append(movie_dict)
    
    # Try to click the next page button, if not found, break the loop
    try:
        browser.find_by_xpath('//button[@aria-label="Go to Next Page"]').click()
    except:
        break

# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(goldenglobes_data)

# Display the DataFrame
print(df) 


                                            title  year   status  \
0                                      Succession  2024   Winner   
1     Mission: Impossible - Dead Reckoning Part 1  2024  Nominee   
2                                 Last of Us, The  2024  Nominee   
3                     Taylor Swift: The Eras Tour  2024  Nominee   
4                            John Wick: Chapter 4  2024  Nominee   
...                                           ...   ...      ...   
8575                                 Billy Wilder  1946   Winner   
8576                              Angela Lansbury  1946   Winner   
8577                             J. Carroll Naish  1946   Winner   
8578                               Ingrid Bergman  1946   Winner   
8579                          The House I Live In  1946   Winner   

                                                  award  
0                        Best Television Series - Drama  
1                  Cinematic and Box Office Achievement  
2        

In [7]:
browser.quit() 

In [8]:
# Write the data to a CSV (I. Vitkovych)
goldenglobes_df = pd.DataFrame(goldenglobes_data)
goldenglobes_df.to_csv("goldenglobes.csv", index=False)
