# Set-up

In [1]:
# load packages
import requests
from bs4 import BeautifulSoup

In [2]:
# URL of the page to scrape (Rotten Tomatoes editorial guide; note the trailing "/2/" path segment)
base_site = (
    "https://editorial.rottentomatoes.com"
    "/guide/140-essential-action-movies-to-watch-now/2/"
)

In [3]:
# Send a GET request for the page.
# - timeout: requests waits indefinitely by default, so without it a stalled
#   server would hang this cell forever (30 seconds is a generous upper bound).
# - raise_for_status(): fail right here on a 4xx/5xx reply instead of letting
#   the later cells parse an error page and quietly produce empty results.
response = requests.get(base_site, timeout=30)
response.raise_for_status()
response

<Response [200]>

In [4]:
# Keep the raw body of the response; this is the HTML that gets parsed in the next cell
html = response.content

In [5]:
# Parse the HTML into a BeautifulSoup object, using lxml as the underlying parser
soup = BeautifulSoup(html, features='lxml')

In [6]:
# Collect the outer div of every movie entry: the ones whose class attribute is
# exactly "col-sm-18 col-full-xs countdown-item-content"
divs = soup.find_all("div", class_="col-sm-18 col-full-xs countdown-item-content")
divs

[<div class="col-sm-18 col-full-xs countdown-item-content">
 <div class="row countdown-item-title-bar">
 <div class="col-sm-20 col-full-xs" style="height: 100%;">
 <div class="article_movie_title" style="float: left;">
 <div><h2><a href="https://www.rottentomatoes.com/m/13_assassins_2011/">13 Assassins</a> <span class="subtle start-year">(2011)</span> <span class="icon tiny certified" title="Certified Fresh"></span> <span class="tMeterScore">95%</span></h2></div>
 </div>
 </div>
 <div class="col-sm-4 col-full-xs" style="height: 100%;">
 <div class="countdown-index">#70</div>
 </div>
 </div>
 <div class="row countdown-item-details">
 <div class="col-sm-24">
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placeme

# Extracting the rest of the information

## Adjusted score

In [7]:
# Every movie block keeps its adjusted score in a nested div whose class attribute
# is 'info countdown-adjusted-score'; grab that div for each movie
adj_scores = [
    movie.find("div", class_="info countdown-adjusted-score")
    for movie in divs
]
adj_scores

[<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>61.566% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>,
 <div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>93.242% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Ad

In [8]:
# Look at the first matched tag on its own to see how the score is nested inside it
adj_scores[0]

<div class="info countdown-adjusted-score"><span class="descriptor">Adjusted Score: </span>98.424% <span class="glyphicon glyphicon-question-sign" data-html="true" data-original-title="The Adjusted Score comes from a weighted formula (Bayesian) that we use that accounts for variation in the number of reviews per movie." data-placement="top" data-toggle="tooltip" rel="tooltip" title=""></span></div>

In [9]:
# The tag's children are, in order: the "Adjusted Score: " descriptor span, the bare
# score text, and the tooltip span, so the string we want is at index 1 of .contents
adj_scores[0].contents[1]  # Note the trailing space after the '%' sign

'98.424% '

In [10]:
# Pull the score text (second child of each tag) and trim any '%' and space
# characters from both ends, leaving only the number as a string
adj_scores_clean = [
    tag.contents[1].strip('% ')
    for tag in adj_scores
]
adj_scores_clean

['98.424',
 '61.566',
 '93.242',
 '96.081',
 '43.856',
 '104.639',
 '90.367',
 '110.355',
 '103.101',
 '102.299',
 '112.287',
 '92.779',
 '101.637',
 '86.566',
 '104.153',
 '91.175',
 '90.562',
 '98.253',
 '95.62',
 '102.453',
 '92.17',
 '101.95',
 '119.101',
 '97.823',
 '81.83',
 '97.304',
 '117.873',
 '84.548',
 '88.278',
 '74.441',
 '97.234',
 '93.011',
 '107.067',
 '100.434',
 '82.844',
 '86.338',
 '69.338',
 '94.309',
 '100.237',
 '103.837',
 '108.929',
 '77.178',
 '100.619',
 '86.264',
 '98.687',
 '72.011',
 '90.925',
 '105.058',
 '104.919',
 '83.141',
 '91.869',
 '76.862',
 '97.862',
 '87.788',
 '94.155',
 '101.879',
 '84.562',
 '100.908',
 '86.768',
 '98.816',
 '91.219',
 '91.589',
 '115.565',
 '101.091',
 '105.846',
 '96.061',
 '94.662',
 '99.241',
 '98.793',
 '113.244']

In [11]:
# Turn the cleaned strings into numbers; float (not int) because the adjusted
# scores carry decimal places
final_adj = list(map(float, adj_scores_clean))
final_adj

[98.424,
 61.566,
 93.242,
 96.081,
 43.856,
 104.639,
 90.367,
 110.355,
 103.101,
 102.299,
 112.287,
 92.779,
 101.637,
 86.566,
 104.153,
 91.175,
 90.562,
 98.253,
 95.62,
 102.453,
 92.17,
 101.95,
 119.101,
 97.823,
 81.83,
 97.304,
 117.873,
 84.548,
 88.278,
 74.441,
 97.234,
 93.011,
 107.067,
 100.434,
 82.844,
 86.338,
 69.338,
 94.309,
 100.237,
 103.837,
 108.929,
 77.178,
 100.619,
 86.264,
 98.687,
 72.011,
 90.925,
 105.058,
 104.919,
 83.141,
 91.869,
 76.862,
 97.862,
 87.788,
 94.155,
 101.879,
 84.562,
 100.908,
 86.768,
 98.816,
 91.219,
 91.589,
 115.565,
 101.091,
 105.846,
 96.061,
 94.662,
 99.241,
 98.793,
 113.244]

## Synopsis

In [12]:
# The synopsis sits in a nested div whose class attribute is 'info synopsis';
# matching on the single CSS class 'synopsis' is enough to find it in each movie block
synopsis = [movie.find('div', class_='synopsis') for movie in divs]
synopsis

[<div class="info synopsis"><span class="descriptor">Synopsis:</span> Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/13_assassins_2011/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> Another entry into the "cheer for the most likeable bad guy" series of Hong Kong action flicks, Full Contact tells...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/full_contact/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="descriptor">Synopsis:</span> The third installment in the Spielberg/Lucas Indiana Jones saga, Indiana Jones and the Last Crusade evokes many of the thrills...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/indiana_jones_and_the_last_crusade/" target="_top"> [More]</a></div>,
 <div class="info synopsis"><span class="

In [13]:
# Look at the first synopsis tag on its own to see where the text sits among its children
synopsis[0]

<div class="info synopsis"><span class="descriptor">Synopsis:</span> Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...<a class="" data-pageheader="" href="https://www.rottentomatoes.com/m/13_assassins_2011/" target="_top"> [More]</a></div>

In [14]:
# Children in order: the "Synopsis:" descriptor span, the synopsis text, and the
# "[More]" link, so the text is at index 1 of .contents (it starts with a space)
synopsis[0].contents[1]

" Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's..."

In [15]:
# Extract the synopsis text (second child of each tag).
# .contents yields NavigableString objects, which keep a reference to the whole
# parse tree; converting to plain str lets the text be used (stored, pickled,
# put in a DataFrame) independently of the soup. The displayed values are unchanged.
synopsis_text = [str(syn.contents[1]) for syn in synopsis]
synopsis_text

[" Cult director Takeshi Miike (Ichi the Killer, Audition) delivers a bravado period action film set at the end of Japan's...",
 ' Another entry into the "cheer for the most likeable bad guy" series of Hong Kong action flicks, Full Contact tells...',
 ' The third installment in the Spielberg/Lucas Indiana Jones saga, Indiana Jones and the Last Crusade evokes many of the thrills...',
 ' The work of international superstar Stephen Chow, Kung Fu Hustle is a humorous, special-effects-filled, action-packed martial arts epic set in...',
 " John Woo established himself as one of Hong Kong's premiere action directors with this ultra-hip, ultra-violent action classic. The film...",
 ' Billionaire industrialist and genius inventor Tony Stark is kidnapped and forced to build a devastating weapon. Instead, using his intelligence...',
 ' The Night Comes For Us follows Ito (played by Joe Taslim), a former triad enforcer, and his mission to protect...',
 " It's 2029. Mutants are gone--or very nearly 