In [7]:
# Web scraping is general term for technologies involving automating the gathering of data from a website
# lxml, bs4 and requests are the libraries for web scraping
import requests
import bs4

In [9]:
## Reaching out to the website
result = requests.get("https://www.iana.org/help/example-domains")

In [10]:
## Return all the details as a giant string
result.text

'\n<!doctype html>\n<html>\n<head>\n\t<title>Example Domains</title>\n\n\t<meta charset="utf-8" />\n\t<meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n\t<meta name="viewport" content="width=device-width, initial-scale=1" />\n\t\n\t<link rel="stylesheet" href="/_css/2022/iana_website.css"/>\n\t<link rel="shortcut icon" type="image/ico" href="/_img/bookmark_icon.ico"/>\n\n\t<script type="text/javascript" src="/_js/jquery.js"></script>\n\t<script type="text/javascript" src="/_js/iana.js"></script>\n\n\t\n</head>\n\n<body>\n\n<header>\n    <div id="header">\n        <div id="logo">\n            <a href="/"><img src="/_img/2022/iana-logo-header.svg" alt="Homepage"/></a>\n        </div>\n        <div class="navigation">\n            <ul>\n                <li><a href="/domains">Domains</a></li>\n                <li><a href="/protocols">Protocols</a></li>\n                <li><a href="/numbers">Numbers</a></li>\n<!--                <li><a href="/news">News</a></li>-->\n  

In [11]:
## To make the text readable bs4 library can be used
## Let's grab title 
soup = bs4.BeautifulSoup(result.text, 'lxml')
soup

<!DOCTYPE html>
<html>
<head>
<title>Example Domains</title>
<meta charset="utf-8"/>
<meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/_css/2022/iana_website.css" rel="stylesheet"/>
<link href="/_img/bookmark_icon.ico" rel="shortcut icon" type="image/ico"/>
<script src="/_js/jquery.js" type="text/javascript"></script>
<script src="/_js/iana.js" type="text/javascript"></script>
</head>
<body>
<header>
<div id="header">
<div id="logo">
<a href="/"><img alt="Homepage" src="/_img/2022/iana-logo-header.svg"/></a>
</div>
<div class="navigation">
<ul>
<li><a href="/domains">Domains</a></li>
<li><a href="/protocols">Protocols</a></li>
<li><a href="/numbers">Numbers</a></li>
<!--                <li><a href="/news">News</a></li>-->
<li><a href="/about">About</a></li>
</ul>
</div>
</div>
</header>
<div id="body">
<article class="hemmed sidenav">
<main>
<div class="help-article">
<h1>Example Domai

In [12]:
soup.select('title')

[<title>Example Domains</title>]

In [13]:
soup.select('title')[0].getText()

'Example Domains'

In [15]:
# Let's grab paragraph
site_paragraphs = soup.select('p')
site_paragraphs

[<p>As described in <a href="/go/rfc2606">RFC 2606</a> and <a href="/go/rfc6761">RFC 6761</a>, a
 number of domains such as example.com and example.org are maintained
 for documentation purposes. These domains may be used as illustrative
 examples in documents without prior coordination with us. They are not
 available for registration or transfer.</p>,
 <p>We provide a web service on the example domain hosts to provide basic
 information on the purpose of the domain. These web services are
 provided as best effort, but are not designed to support production
 applications. While incidental traffic for incorrectly configured
 applications is expected, please do not design applications that require
 the example domains to have operating HTTP service.</p>,
 <p>The IANA functions coordinate the Internet’s globally unique identifiers, and
                 are provided by <a href="http://pti.icann.org">Public Technical Identifiers</a>, an affiliate of
                 <a href="http://www.ica

In [16]:
site_paragraphs[0].getText()

'As described in RFC 2606 and RFC 6761, a\nnumber of domains such as example.com and example.org are maintained\nfor documentation purposes. These domains may be used as illustrative\nexamples in documents without prior coordination with us. They are not\navailable for registration or transfer.'

In [18]:
## Let's grab more elements by class
res = requests.get('https://en.wikipedia.org/wiki/Grace_Hopper')

In [20]:
soup = bs4.BeautifulSoup(res.text, 'lxml')

In [22]:
## Selecting element with class name 'vector-toc-text'
soup.select('.vector-toc-text')

[<div class="vector-toc-text">(Top)</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">1</span>Early life and education</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2</span>Career</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.1</span>World War II</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.2</span>UNIVAC</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.3</span>COBOL</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">2.4</span>Standards</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">3</span>Retirement</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">4</span>Post-retirement</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">5</span>Anecdotes</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">6</span>Death</div>,
 <div class="vector-toc-text">
 <span class="vector-toc-numb">7</spa

In [24]:
# Print text from all the sub elements in the class
for item in soup.select('.vector-toc-text'):
    text = item.getText()
    print(text)

(Top)

1Early life and education

2Career

2.1World War II

2.2UNIVAC

2.3COBOL

2.4Standards

3Retirement

4Post-retirement

5Anecdotes

6Death

7Dates of rank

8Awards and honors

8.1Military awards

8.2Other awards

9Legacy

9.1Places

9.2Programs

9.3In popular culture

9.3.1Grace Hopper Celebration of Women in Computing

10See also

11Notes

12References

13Obituary notices

14Further reading

15External links


In [28]:
# Lets grab an image

## This will grab all the img tags
soup.select('img')

[<img alt="" aria-hidden="true" class="mw-logo-icon" height="50" src="/static/images/icons/wikipedia.png" width="50"/>,
 <img alt="Wikipedia" class="mw-logo-wordmark" src="/static/images/mobile/copyright/wikipedia-wordmark-en.svg" style="width: 7.5em; height: 1.125em;"/>,
 <img alt="The Free Encyclopedia" class="mw-logo-tagline" height="13" src="/static/images/mobile/copyright/wikipedia-tagline-en.svg" style="width: 7.3125em; height: 0.8125em;" width="117"/>,
 <img class="mw-file-element" data-file-height="3000" data-file-width="2400" decoding="async" height="300" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/360px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28c

In [31]:
## Grab the class of the image needed
soup.select('.mw-file-element')

[<img class="mw-file-element" data-file-height="3000" data-file-width="2400" decoding="async" height="300" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/360px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/480px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg 2x" width="240"/>,
 <img alt="" class="mw-file-element" data-file-height="650" data-file-width="1235" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/23px-Flag_of_the_United_States.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/a/a4/Flag_of_the_United_States.svg/35px-Flag_of_the_United_States.svg.png 1.5x, //upload.w

In [33]:
final_img = soup.select('.mw-file-element')[0]
final_img

<img class="mw-file-element" data-file-height="3000" data-file-width="2400" decoding="async" height="300" src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/360px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/480px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg 2x" width="240"/>

In [36]:
## This final_img can be treated like a dictionary, so we can grab src, class and all other elements from the tag
final_img['class']

['mw-file-element']

In [37]:
final_img['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg'

<img src="//upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg">

In [44]:
## Download image
## Make a request to the image url with https
img_link = requests.get('https://upload.wikimedia.org/wikipedia/commons/thumb/a/ad/Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg/240px-Commodore_Grace_M._Hopper%2C_USN_%28covered%29.jpg')

In [45]:
## This is a binary file
img_link.content

b'\xff\xd8\xff\xe1\x00rExif\x00\x00MM\x00*\x00\x00\x00\x08\x00\x05\x01\x1a\x00\x05\x00\x00\x00\x01\x00\x00\x00J\x01\x1b\x00\x05\x00\x00\x00\x01\x00\x00\x00R\x01(\x00\x03\x00\x00\x00\x01\x00\x02\x00\x00\x01;\x00\x02\x00\x00\x00\x0f\x00\x00\x00Z\x02\x13\x00\x03\x00\x00\x00\x01\x00\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00H\x00\x00\x00\x01\x00\x00\x00H\x00\x00\x00\x01James S. Davis\x00\x00\xff\xe1\x0bwhttp://ns.adobe.com/xap/1.0/\x00<?xpacket begin=\'\xef\xbb\xbf\' id=\'W5M0MpCehiHzreSzNTczkc9d\'?>\n<x:xmpmeta xmlns:x=\'adobe:ns:meta/\' x:xmptk=\'Image::ExifTool 9.74\'>\n<rdf:RDF xmlns:rdf=\'http://www.w3.org/1999/02/22-rdf-syntax-ns#\'>\n\n <rdf:Description rdf:about=\'\'\n  xmlns:dc=\'http://purl.org/dc/elements/1.1/\'>\n  <dc:description>\n   <rdf:Alt>\n    <rdf:li xml:lang=\'x-default\'>Commodore Grace M. Hopper, USN (covered).</rdf:li>\n   </rdf:Alt>\n  </dc:description>\n </rdf:Description>\n</rdf:RDF>\n</x:xmpmeta>\n                                                                    

In [46]:
## Open a new file and write this image to that
## Mode is wb(write binary)
f = open('myImage.jpg', 'wb')

In [47]:
## Write the image content to the file
f.write(img_link.content)

24355

In [48]:
f.close()