# Install Dependencies
```pip install bs4 selenium webdriver_manager numpy pandas -U```

In [4]:
a=[]
b=[1,2,4,6,7]

a.extend(b)

In [5]:
a

[1, 2, 4, 6, 7]

# Approach 1) Sitemap/Requests
    - visit rottentomatoes.com/robots.txt
        - Whats there, what each item means
        - Use sitemap if exists
    - XML parsing (sitemap)
        - use xmltree pacakge in python
        - task: get all MOVIE urls out of sitemap
        - show solution (use simple requests -> xmltree parse -> store in list)

# Step 1: Read Root Sitemap

In [None]:
# 1) Read Sitemap
import requests
sitemap_url = 'https://www.rottentomatoes.com/sitemaps/sitemap.xml'
sitemap = requests.get(sitemap_url).text

## 1-1) Parsing Sitemap

In [None]:
import xml.etree.ElementTree as ET
tree = ET.ElementTree(ET.fromstring(sitemap))
print(tree)

<xml.etree.ElementTree.ElementTree object at 0x7fc9f0264310>


## 1-2) Make XML tree iterable
### Note: <br>
    - getroot(): Returns the root element for this tree.<br>
    - iter(): Creates and returns a tree iterator for the root element. The iterator loops over all elements in this tree, in section order. tag is the tag to look for (default is to return all elements).<br>
    - pprint: Print result in a readable format

In [None]:
from pprint import pprint
parsed_tree = list(tree.getroot().iter())
pprint(parsed_tree[:5])
len(parsed_tree)

[<Element '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemapindex' at 0x7fc9f01a4310>,
 <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap' at 0x7fc9f01a43b0>,
 <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}loc' at 0x7fc9f01a4450>,
 <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}lastmod' at 0x7fc9f01a44a0>,
 <Element '{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap' at 0x7fc9f01a44f0>]


412

## 1-3) Get Secondary (Movie) Sitemaps from Root Sitemap

In [None]:
# Make an empty list to contain output
movie_sitemap_urls = list()
# Loop through parsed_tree
for element in parsed_tree:
    ele_text = element.text  # Get text from child element
    # Look for urls containing 'movie'
    if 'movie' in ele_text:
        movie_sitemap_urls.append(ele_text)  # Append each url to output
    continue

In [None]:
# Examine output
pprint(movie_sitemap_urls[:5])
len(movie_sitemap_urls)

['https://www.rottentomatoes.com/sitemaps/movie_0.xml',
 'https://www.rottentomatoes.com/sitemaps/movie_1.xml',
 'https://www.rottentomatoes.com/sitemaps/movie_2.xml',
 'https://www.rottentomatoes.com/sitemaps/movie_3.xml',
 'https://www.rottentomatoes.com/sitemaps/movie_4.xml']


33

# Step 2: Read Movie Sitemaps

## 2-1) Read each movie sitemaps (sample)

In [None]:
print(f'Example: {movie_sitemap_urls[0]}')
movie_urls = list()  # Empty list to store resuslts
response = requests.get(movie_sitemap_urls[0])  # Use request to get content
tree = ET.ElementTree(ET.fromstring(response.text))  # Get XML Tree
parsed_tree = list(tree.getroot().iter())  # Make tree iterable
for url in parsed_tree:
    # for each 'url' element, there's 2 other elements: 'loc' and 'image'
    # we need to loop through them and find each 'loc' for urls
    for element in url:
        if 'loc' in element.tag and 'rottentomatoes.com/m/' in element.text:
            movie_urls.append(element.text)
        continue
    # print(url)
    continue
# Examine Results
print('Results: ', end='')
pprint(movie_urls[:5])
len(movie_urls)

Example: https://www.rottentomatoes.com/sitemaps/movie_0.xml
Results: ['https://www.rottentomatoes.com/m/beg1994',
 'https://www.rottentomatoes.com/m/beg1994/pictures',
 'https://www.rottentomatoes.com/m/limo',
 'https://www.rottentomatoes.com/m/limo/pictures',
 'https://www.rottentomatoes.com/m/tejano']


17563

## 2-2) Loop through all movie sitemaps

In [None]:
# Empty list to store all results
all_movie_urls = list()
for url in movie_sitemap_urls:
    response = requests.get(url)
    tree = ET.ElementTree(ET.fromstring(response.text))
    parsed_tree = list(tree.getroot().iter())
    for url in parsed_tree:
        for element in url:
            if 'loc' in element.tag and 'rottentomatoes.com/m/' in element.text:
                all_movie_urls.append(element.text)
            continue
        continue
    continue
# De-duplicate
all_movie_urls = list(set(all_movie_urls))
# Examine Results
print(f'{len(all_movie_urls)} Total URLs: ', end='')
pprint(all_movie_urls[:5])

228263 Total URLs: ['https://www.rottentomatoes.com/m/meteora_2013',
 'https://www.rottentomatoes.com/m/night_creature/pictures',
 'https://www.rottentomatoes.com/m/fathers_and_daughters/pictures',
 'https://www.rottentomatoes.com/m/dracula_09',
 'https://www.rottentomatoes.com/m/a_perfect_match_2021']


# Step 3: Get Basic Info for each movie

## 3-1) Import Packages ##
| Packages               | Use                                                         |
|------------------------|-------------------------------------------------------------|
| bs4 (BeautifulSoup)    | HTML Parser, to find and extract info from HTML source code |
| pprint                 | PrettyPrint, to beautify print results                      |
| time/sleep             | Pause between loading pages                                 |
| requests               | Request web pages                                           |
| re (RegularExpression) | To clean scraped text                                       |
### RTM: ###
- bs4: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
- requests: https://requests.readthedocs.io/en/latest/
- re: https://docs.python.org/3/library/re.html

In [8]:
from bs4 import BeautifulSoup
from pprint import pprint
from time import sleep
import requests
import re


## 3-2) Example for scraping 1 movie

In [None]:
example_movie_url = 'https://www.rottentomatoes.com/m/top_gun_maverick'
# Make soup
soup = BeautifulSoup(requests.get(example_movie_url).content, 'html.parser')
print(soup.prettify())

### Note: ###
Before we proceed, we'll inspect HTML elements inside Chrome to get the target attributes, class names, etc. <br>
### Note: ###
For this exercise, we will get the below information from a movie's page:
- Synopsis
- Genre
- Language
- Director/Producer
- Release Date
- Runtime

In [10]:
# Create Empty Dictionary to hold all data
output = dict()

In [11]:
# Find parent element
target_class = 'panel panel-rt panel-box movie_info media'
p_block = soup.find(class_=target_class)
# pprint(p_block)

In [12]:
# Find Synopsis
movie_synopsis = p_block.find(class_='movie_synopsis clamp clamp-6 js-clamp')
movie_synopsis = movie_synopsis.text.strip()
print(movie_synopsis)

After more than thirty years of service as one of the Navy’s top aviators, Pete “Maverick” Mitchell (Tom Cruise) is where he belongs, pushing the envelope as a courageous test pilot and dodging the advancement in rank that would ground him. When he finds himself training a detachment of Top Gun graduates for a specialized mission the likes of which no living pilot has ever seen, Maverick encounters Lt. Bradley Bradshaw (Miles Teller), call sign: “Rooster,” the son of Maverick’s late friend and Radar Intercept Officer Lt. Nick Bradshaw, aka “Goose”.

Facing an uncertain future and confronting the ghosts of his past, Maverick is drawn into a confrontation with his own deepest fears, culminating in a mission that demands the ultimate sacrifice from those who will be chosen to fly it.


In [13]:
# Insert result into dictionary
output['Movie Synopsis'] = movie_synopsis

In [14]:
# Find All Other Info: Genre/Language/etc.
# Using find_all() to get all rows, then parse detail info from the root element
meta_values = p_block.find_all(class_='meta-row clearfix')
# print(meta_values)

In [15]:
# Inside each row, find its 'label' and 'value'
for mv in meta_values:
    label = mv.find(class_='meta-label subtle').text.strip()
    value = mv.find(class_='meta-value').text.strip()
    output[label] = value  # Insert into output dictionary
    print(f'{label}: {value}')
    continue

Rating:: PG-13 (Some Strong Language|Sequences of Intense Action)
Genre:: Action, 
                        
                        Adventure
Original Language:: English
Director:: Joseph Kosinski
Producer:: Jerry Bruckheimer, 
                            
                        
                            
                                Tom Cruise, 
                            
                        
                            
                                David Ellison, 
                            
                        
                            
                                Christopher McQuarrie
Writer:: Ehren Kruger, 
                            
                        
                            
                                Eric Warren Singer, 
                            
                        
                            
                                Christopher McQuarrie
Release Date (Theaters):: May 27, 2022
 wide
Box Office (Gross USA):: $715.8M
R

In [16]:
# Clean fields
for key in output:
    original_value = output[key]
    new_value = original_value.replace('\n', '')  # Remove all new-lines
    new_value = re.sub(' +', ' ', new_value)  # Remove consequtive white-spaces
    output[key] = new_value  # Re-assign into the output dictionary
    continue
# Examine Result
pprint(output)

{'Aspect Ratio:': 'Scope (2.35:1)',
 'Box Office (Gross USA):': '$715.8M',
 'Director:': 'Joseph Kosinski',
 'Distributor:': 'Paramount Pictures',
 'Genre:': 'Action, Adventure',
 'Movie Synopsis': 'After more than thirty years of service as one of the '
                   'Navy’s top aviators, Pete “Maverick” Mitchell (Tom Cruise) '
                   'is where he belongs, pushing the envelope as a courageous '
                   'test pilot and dodging the advancement in rank that would '
                   'ground him. When he finds himself training a detachment of '
                   'Top Gun graduates for a specialized mission the likes of '
                   'which no living pilot has ever seen, Maverick encounters '
                   'Lt. Bradley Bradshaw (Miles Teller), call sign: “Rooster,” '
                   'the son of Maverick’s late friend and Radar Intercept '
                   'Officer Lt. Nick Bradshaw, aka “Goose”.Facing an uncertain '
                   'future 

## 3-2) Let's loop through all movies!

In [17]:
# Create empty dictionary to hold them all
all_movies = dict()
for url in all_movie_urls[:10]:
    output = dict()
    # Step 1: Make soup
    soup = BeautifulSoup(requests.get(url).content, 'html.parser')
    # Step 2: Find parent element
    target_class = 'panel panel-rt panel-box movie_info media'
    p_block = soup.find(class_=target_class)
    # Step 3: Find child elements
    if not p_block:
        continue  # skip to next movie
    try:
        movie_synopsis = p_block.find(class_='movie_synopsis clamp clamp-6 js-clamp')
        movie_synopsis = movie_synopsis.text.strip()
        output['Movie Synopsis'] = movie_synopsis
    except AttributeError:
        output['Movie Synopsis'] = None
        pass
    # Find individual meta-values:
    # Using find_all() to get each row
    meta_values = p_block.find_all(class_='meta-row clearfix')
    # Inside each row, find its 'label' and 'value'
    for mv in meta_values:
        label = mv.find(class_='meta-label subtle').text.strip()
        value = mv.find(class_='meta-value').text.strip()
        output[label] = value
        continue
    # Clean fields
    for key in output:
        original_value = output[key]
        new_value = original_value.replace('\n', '')
        new_value = re.sub(' +', ' ', new_value)
        output[key] = new_value
        continue
    all_movies[url] = output
    pprint(output)
    # sleep(1)
    continue

{'Director:': 'Kamika Verma',
 'Genre:': 'Drama',
 'Movie Synopsis': "A former militant recognizes her kidnapper's voice, and "
                   'makes plans to extract a confession.',
 'Original Language:': 'Hindi',
 'Runtime:': '1h 43m',
 'Writer:': 'Anish'}
{'Director:': 'Raafi Rivero',
 'Genre:': 'Comedy, Drama, Romance',
 'Movie Synopsis': 'Two African-American college students get ready to '
                   'graduate from Princeton University.',
 'Original Language:': 'English',
 'Producer:': 'Jason Pollard, Roody Dorsainvil, Christopher Poindexter',
 'Runtime:': '24m'}
{'Director:': 'Jeannot Szwarc',
 'Genre:': 'Horror',
 'Movie Synopsis': "Edgar Allan Poe's Paris sleuth, C. Auguste Dupin (George "
                   'C. Scott), solves a macabre case of double murder.',
 'Original Language:': 'English',
 'Producer:': 'Robert A. Halmi',
 'Rating:': 'PG',
 'Release Date (Streaming):': 'Feb 4, 2020',
 'Runtime:': '1h 40m'}
{'Director:': 'Ben Bowie', 'Movie Synopsis': ''}
{'Dir

# Step 4: More Practice!
## 1) Hidden API ##
    - rottentomatoes.com
    - Read more: https://ianlondon.github.io/blog/web-scraping-discovering-hidden-apis/

## 2) Good websites to scrape: ##
    - rottentomatoes.com (user/critcs reviews, tv shows, etc.)
    - justwatch.com (movies/tv shows)
    - glassdoor.com
    - indeed.com
    - etc.
## 3) More advanced/Read more ##
    - yelp.com
        - User login
        - Captcha
        - Use proxies/VPNs/random delays/scrappy framework
    - airbnb.com
    - Multiprocessing/Threading/Async