In [5]:
import pandas as pd 
import requests
from bs4 import BeautifulSoup

# 1 Learning how to work with html pags

In [6]:
# 1st steep - create the html variable
html_doc="""
<!DOCTYPE html>
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
​
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
​
<p class="story">...</p>
</html>
"""

In [9]:
# parse the html
soup=BeautifulSoup(html_doc, 'html.parser')

In [10]:
# print the formatted html
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   The Dormouse's story
  </title>
 </head>
 <body>
  <p class="title">
   <b>
    The Dormouse's story
   </b>
  </p>
  ​
  <p class="story">
   Once upon a time there were three little sisters; and their names were
   <a class="sister" href="http://example.com/elsie" id="link1">
    Elsie
   </a>
   ,
   <a class="sister" href="http://example.com/lacie" id="link2">
    Lacie
   </a>
   and
   <a class="sister" href="http://example.com/tillie" id="link3">
    Tillie
   </a>
   ;
and they lived at the bottom of a well.
  </p>
  ​
  <p class="story">
   ...
  </p>
 </body>
</html>



# 2 Navigate using html

In [11]:
soup.title # from the soup, select class=title

<title>The Dormouse's story</title>

In [12]:
soup.title.string

"The Dormouse's story"

In [13]:
soup.p

<p class="title"><b>The Dormouse's story</b></p>

In [14]:
p_tags=soup.find_all('p')

In [51]:
p_tags

[<p class="title"><b>The Dormouse's story</b></p>,
 <p class="story">Once upon a time there were three little sisters; and their names were
 <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
 and they lived at the bottom of a well.</p>,
 <p class="story">...</p>]

In [16]:
for p in p_tags:
    print(p.get_text())
    
# warning to use a method to do a repetitive task    

The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...


In [17]:
a_tags=soup.find_all('a')

In [18]:
a_tags

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [19]:
for a in a_tags:
    print(a.get_text())

Elsie
Lacie
Tillie


In [20]:
for a in a_tags:
    print(a.get('href'))

http://example.com/elsie
http://example.com/lacie
http://example.com/tillie


In [21]:
# as a list
links=[]
for a in a_tags:
    links.append(a.get('href'))

In [22]:
links

['http://example.com/elsie',
 'http://example.com/lacie',
 'http://example.com/tillie']

In [23]:
soup.title


<title>The Dormouse's story</title>

In [24]:
soup.title.parent.string

"The Dormouse's story"

In [25]:
soup.title.parent.name

'head'

In [30]:
soup.a.parent.parent.parent.parent


<!DOCTYPE html>

<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
​
<p class="story">Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
​
<p class="story">...</p>
</body></html>

In [31]:
import re # regex

In [33]:
soup.find_all(string=re.compile('we'))
# to look for a specific word or letters

['Once upon a time there were three little sisters; and their names were\n',
 ';\nand they lived at the bottom of a well.']

In [34]:
soup.text.count('we')
# number of times we is mentioned

3

# 2.1 Eg searching one webpage and counting the number of mentions of a specific word (casee sensitive)
 · remember this is the raw html

In [36]:
re.findall(r'\w+', requests.get('http://ironhack.com/en').text).count('bootcamp')

60

In [38]:
response=requests.get('http://ironhack.com/en')

In [39]:
response.status_code

200

In [49]:
response=requests.get('https://www.firabarcelona.com/en')

In [50]:
response.status_code

200

# 3 CSS method to get data

In [43]:
soup.select('#link2') # # for css id

[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

In [44]:
soup.select('.sister') # . for css class

[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
 <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
 <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

In [45]:
# iteerate through the class sister and collect the text for each record
for s in soup.select('.sister'):
    print(s.get_text())

Elsie
Lacie
Tillie


In [47]:
print(soup.select('.sister')[2].get_text()) # index

Tillie


In [48]:
print(soup.select('.sister').get_text()) # this wont work because we have multiple results

AttributeError: ResultSet object has no attribute 'get_text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [None]:
# 5 Scrape 250 movies from IMDB
· get website url
· run request on url
· parse the html as soup
· pretty print to read the soup
· find the right html or css
· get movie title, rank, year, stars + director
· clean and turn the data into a data frame

In [63]:
# we get the page
url="https://www.imdb.com/chart/top"
page= requests.get(url, headers = {"Accept-Language": "en-US"})
# for the language. Otherwise picks the location

In [64]:
response.status_code

200

In [65]:
# we make the soup
soup=BeautifulSoup(page.content, 'html.parser')

In [56]:
print(soup.prettify())

<!DOCTYPE html>
<html class="no-js" lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   (window.NREUM||(NREUM={})).init={ajax:{deny_list:["bam.eu01.nr-data.net"]}};(window.NREUM||(NREUM={})).loader_config={licenseKey:"ffdb9806ac",applicationID:"9786299"};window.NREUM||(NREUM={}),__nr_require=function(t,e,n){function r(n){if(!e[n]){var i=e[n]={exports:{}};t[n][0].call(i.exports,function(e){var i=t[n][1][e];return r(i||e)},i,i.exports)}return e[n].exports}if("function"==typeof __nr_require)return __nr_require;for(var i=0;i<n.length;i++)r(n[i]);return r}({1:[function(t,e,n){function r(){}function i(t,e,n,r){return function(){return s.recordSupportability("API/"+e+"/called"),o(t+e,[u.now()].concat(c(arguments)),n?null:this,r),n?void 0:this}}var o=t("handle"),a=t(9),c=t(10),f=t("ee").get("tracer"),u=t("loader"),s=t(4),d=NREUM;"undefined"==typeof window.newrelic&&(newrelic=d);var p=["setPageViewName","setCustomAttribute","setErrorHandler","finished","addToTra

In [66]:
# main > div > span > div > div > div.lister > table > tbody > tr:nth-child(1) > td.titleColumn
soup.select('.lister-list > tr:nth-child(1) > td:nth-child(2)')

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>]

In [70]:
# for the title, year, director and the actors
soup.select('td.titleColumn')

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">12 Angry Men</a>
 <span class="secondaryInfo">

In [69]:
# for rating
soup.select('td.ratingColumn.imdbRating')

[<td class="ratingColumn imdbRating">
 <strong title="9.2 based on 2,589,809 user ratings">9.2</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.2 based on 1,785,082 user ratings">9.2</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 2,560,486 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 1,232,269 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 764,973 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,317,836 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,779,414 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,985,942 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 1,800,641 user ratings">8.8</strong>
 <

In [73]:
len(soup.select('td.titleColumn'))

250

# 5.1 Get it right for 1 result, then expand to all

In [71]:
#sample one movie. and get th information back
#iterate for all movies
title=soup.select('td.titleColumn a') [22].get_text()

In [74]:
title

'City of God'

In [75]:
dir_stars=soup.select('td.titleColumn a')[22]['title']

In [76]:
dir_stars

'Fernando Meirelles (dir.), Alexandre Rodrigues, Leandro Firmino'

In [77]:
year=soup.select('td.titleColumn span.secondaryInfo')[22].get_text()

In [78]:
year

'(2002)'

In [82]:
rating=soup.select('td.ratingColumn.imdbRating strong')[22].get_text()

In [83]:
rating

'8.6'

# 5.2 iterate to assemble all the information

In [85]:
imdb_position=[]
movie_title=[]
release_year=[]
imdb_rating=[]
dir_star_names=[]

len_movies=len(soup.select('td.ratingColumn.imdbRating strong'))
len_movies_check=len(soup.select('td.titleColumn span.secondaryInfo'))

In [88]:
#check both have the same length
print(len_movies, len_movies_check)

250 250


In [91]:
for i in tqdm(range(len_movies)):
    imdb_position.append(i+1)
    movie_title.append(soup.select('td.titleColumn a')[i].get_text())
    imdb_rating.append(soup.select('td.titleColumn span.secondaryInfo')[i].get_text())
    release_year.append(soup.select('td.ratingColumn.imdbRating strong')[i].get_text())
    dir_star_names.append(soup.select('td.titleColumn a')[i]['title'])

  0%|          | 0/250 [00:00<?, ?it/s]

In [90]:
from tqdm.notebook import tqdm

# 5.3 create a clean df of this information

In [93]:
clean_year=[yr.strip('(').strip(')') for yr in release_year]

In [94]:
movies_250=pd.DataFrame({'rank':imdb_position, 
                         'title':movie_title, 
                         'released':clean_year,
                         'rating':imdb_rating, 
                         'people': dir_star_names})

In [95]:
movies_250

Unnamed: 0,rank,title,released,rating,people
0,1,The Shawshank Redemption,9.2,(1994),"Frank Darabont (dir.), Tim Robbins, Morgan Fre..."
1,2,The Godfather,9.2,(1972),"Francis Ford Coppola (dir.), Marlon Brando, Al..."
2,3,The Dark Knight,9.0,(2008),"Christopher Nolan (dir.), Christian Bale, Heat..."
3,4,The Godfather: Part II,9.0,(1974),"Francis Ford Coppola (dir.), Al Pacino, Robert..."
4,5,12 Angry Men,8.9,(1957),"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb"
...,...,...,...,...,...
245,246,Aladdin,8.0,(1992),"Ron Clements (dir.), Scott Weinger, Robin Will..."
246,247,The Help,8.0,(2011),"Tate Taylor (dir.), Emma Stone, Viola Davis"
247,248,Beauty and the Beast,8.0,(1991),"Gary Trousdale (dir.), Paige O'Hara, Robby Benson"
248,249,Dances with Wolves,8.0,(1990),"Kevin Costner (dir.), Kevin Costner, Mary McDo..."
