# Lab | Web Scraping Multiple Pages

In [1]:
# import libraries
from bs4 import BeautifulSoup
import requests
import pandas as pd
from tqdm.notebook import tqdm


## Retrieve an arbitrary Wikipedia page of "Python" and create a list of links on that page

In [2]:
# store url in a variable
url ='https://en.wikipedia.org/wiki/Python'

In [3]:
# download html with a request, check response code 
response=requests.get(url)
response.status_code

200

In [4]:
# create a beautiful soup
soup = BeautifulSoup(response.content, 'html.parser')

In [5]:
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Python - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"d05e02a0-ffde-4e62-83b0-8c10bed671e6","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Python","wgTitle":"Python","wgCurRevisionId":1048703433,"wgRevisionId":1048703433,"wgArticleId":46332325,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Disambiguation pages with short descriptions","Short description is different from Wikidata","All article disambiguation pages","All disambiguation pages",

In [6]:
# select links

a_tags = soup.find_all("a")
a_tags

[<a id="top"></a>,
 <a class="mw-jump-link" href="#mw-head">Jump to navigation</a>,
 <a class="mw-jump-link" href="#searchInput">Jump to search</a>,
 <a class="extiw" href="https://en.wiktionary.org/wiki/Python" title="wiktionary:Python">Python</a>,
 <a class="extiw" href="https://en.wiktionary.org/wiki/python" title="wiktionary:python">python</a>,
 <a href="/wiki/Pythonidae" title="Pythonidae">Pythonidae</a>,
 <a href="/wiki/Python_(genus)" title="Python (genus)"><i>Python</i> (genus)</a>,
 <a href="#Computing"><span class="tocnumber">1</span> <span class="toctext">Computing</span></a>,
 <a href="#People"><span class="tocnumber">2</span> <span class="toctext">People</span></a>,
 <a href="#Roller_coasters"><span class="tocnumber">3</span> <span class="toctext">Roller coasters</span></a>,
 <a href="#Vehicles"><span class="tocnumber">4</span> <span class="toctext">Vehicles</span></a>,
 <a href="#Weaponry"><span class="tocnumber">5</span> <span class="toctext">Weaponry</span></a>,
 <a hre

In [7]:
# to get the link we need to: .get("href")
for a in a_tags:
    print(a.get("href"))

None
#mw-head
#searchInput
https://en.wiktionary.org/wiki/Python
https://en.wiktionary.org/wiki/python
/wiki/Pythonidae
/wiki/Python_(genus)
#Computing
#People
#Roller_coasters
#Vehicles
#Weaponry
#Other_uses
#See_also
/w/index.php?title=Python&action=edit&section=1
/wiki/Python_(programming_language)
/wiki/CMU_Common_Lisp
/wiki/PERQ#PERQ_3
/w/index.php?title=Python&action=edit&section=2
/wiki/Python_of_Aenus
/wiki/Python_(painter)
/wiki/Python_of_Byzantium
/wiki/Python_of_Catana
/wiki/Python_Anghelo
/w/index.php?title=Python&action=edit&section=3
/wiki/Python_(Efteling)
/wiki/Python_(Busch_Gardens_Tampa_Bay)
/wiki/Python_(Coney_Island,_Cincinnati,_Ohio)
/w/index.php?title=Python&action=edit&section=4
/wiki/Python_(automobile_maker)
/wiki/Python_(Ford_prototype)
/w/index.php?title=Python&action=edit&section=5
/wiki/Python_(missile)
/wiki/Python_(nuclear_primary)
/wiki/Colt_Python
/w/index.php?title=Python&action=edit&section=6
/wiki/PYTHON
/wiki/Python_(film)
/wiki/Python_(mythology)
/

## Find the number of titles that have changed in the United States Code since its last release point

In [8]:
# find url and store it in variable
url2 = 'http://uscode.house.gov/download/download.shtml'

In [9]:
# download html with a request, check response code 
response2=requests.get(url2)
response2.status_code

200

In [10]:
# create a beautiful soup
soup2 = BeautifulSoup(response2.content, 'html.parser')

In [11]:
print(soup2.prettify())

<?xml version='1.0' encoding='UTF-8' ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="IE=8" http-equiv="X-UA-Compatible"/>
  <meta content="no-cache" http-equiv="pragma"/>
  <!-- HTTP 1.0 -->
  <meta content="no-cache,must-revalidate" http-equiv="cache-control"/>
  <!-- HTTP 1.1 -->
  <meta content="0" http-equiv="expires"/>
  <link href="/javax.faces.resource/favicon.ico.xhtml?ln=images" rel="shortcut icon"/>
  <link href="/javax.faces.resource/cssLayout.css.xhtml?ln=css" rel="stylesheet" type="text/css"/>
  <script src="/javax.faces.resource/jsf.js.xhtml?ln=javax.faces" type="text/javascript">
  </script>
  <link href="/javax.faces.resource/static.css.xhtml?ln=css" rel="stylesheet" type="text/css"/>
 </head>
 <body>
  <script src="/javax.faces.resource/browserPreferences.

In [12]:
for bold in soup2.select('div.usctitlechanged '):
    print(bold.get_text())



          Title 18 - Crimes and Criminal Procedure ٭



          Title 34 - Crime Control and Law Enforcement

        


In [35]:
bold = []
for i in tqdm(range(2)):
    bold.append(soup2.select('div.usctitlechanged ')[i].get_text())
print('There are', len(bold), 'titles that have changed in the United States Code since its last release point')

  0%|          | 0/2 [00:00<?, ?it/s]

There are 2 titles that have changed in the United States Code since its last release point


## Create a Python list with the top ten FBI's Most Wanted names

In [15]:
# find url and store it in variable
url3 = 'https://www.fbi.gov/wanted/topten'

In [16]:
# download html with a request, check response code 
response3=requests.get(url3)
response3.status_code

200

In [17]:
# create a beautiful soup
soup3 = BeautifulSoup(response3.content, 'html.parser')

In [18]:
print(soup3.prettify())

<!DOCTYPE html>
<html data-gridsystem="bs3" lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="ie=edge" http-equiv="x-ua-compatible"/>
  <meta content="width=device-width, initial-scale=1.0" name="viewport"/>
  <link href="https://www.fbi.gov/wanted/topten" rel="canonical"/>
  <link href="https://www.fbi.gov/wanted/topten/RSS" rel="alternate" title="Ten Most Wanted Fugitives - RSS 1.0" type="application/rss+xml"/>
  <link href="https://www.fbi.gov/wanted/topten/rss.xml" rel="alternate" title="Ten Most Wanted Fugitives - RSS 2.0" type="application/rss+xml"/>
  <link href="https://www.fbi.gov/wanted/topten/atom.xml" rel="alternate" title="Ten Most Wanted Fugitives - Atom" type="application/rss+xml"/>
  <meta content="summary_large_image" name="twitter:card"/>
  <meta content="Ten Most Wanted Fugitives | Federal Bureau of Investigation" name="twitter:title"/>
  <meta content="Federal Bureau of Investigation" property="og:site_name"/>
  <meta content="Ten Most Wanted Fugitives |

In [19]:
for a in soup3.select('h3.title'):
    print(a.get_text())


OCTAVIANO JUAREZ-CORRO


RAFAEL CARO-QUINTERO


YULAN ADONAY ARCHAGA CARIAS


EUGENE PALMER


BHADRESHKUMAR CHETANBHAI PATEL


ALEJANDRO ROSALES CASTILLO


ARNOLDO JIMENEZ


JASON DEREK BROWN


ALEXIS FLORES


JOSE RODOLFO VILLARREAL-HERNANDEZ



## Display the 20 latest earthquakes info (date, time, latitude, longitude and region name) by the EMSC as a pandas dataframe:

In [40]:
# find url and store it in variable
url4 = 'https://www.emsc-csem.org/Earthquake/'

# download html with a request, check response code 
response4=requests.get(url4)
response4.status_code

# create a beautiful soup
soup4 = BeautifulSoup(response4.content, 'html.parser')

In [23]:
print(soup4.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta content="srFzNKBTd0FbRhtnzP--Tjxl01NfbscjYwkp4yOWuQY" name="google-site-verification"/>
  <meta content="BCAA3C04C41AE6E6AFAF117B9469C66F" name="msvalidate.01"/>
  <meta content="43b36314ccb77957" name="y_key"/>
  <!-- 5-Clk8f50tFFdPTU97Bw7ygWE1A -->
  <meta content="en" http-equiv="Content-Language"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="all" name="robots"/>
  <meta content="earthquake,earthquakes,last earthquake,earthquake today,earthquakes today,earth quake,earth quakes,real time seismicity,seismic,seismicity,seismicity map,seismology,sismologie,EMSC,CSEM,seismicity on google earth,sumatra,tsunami,tsunamis,map,maps,richter,mercalli,moment tensors,epicenter,magnitude,seismology,foreshock,aftersho

In [24]:
#date, time, latitude, longitude and region name)
date = []
lat = []
long = []
region = []

In [25]:
for i in tqdm(range(10)):
    date.append(soup4.select("tbody a")[i].text)
    lat.append(soup4.select(".tabev1")[i].text)
    region.append(soup4.select(".tb_region")[i].text)

  0%|          | 0/10 [00:00<?, ?it/s]

In [26]:
earthquakes = pd.DataFrame({'date_time':date,'lat&longitude':lat,'region':region})
earthquakes = earthquakes[0:10]
earthquakes

Unnamed: 0,date_time,lat&longitude,region
0,2021-11-29 21:23:49.5,28.56,"CANARY ISLANDS, SPAIN REGION"
1,2021-11-29 21:11:14.2,17.83,"CANARY ISLANDS, SPAIN REGION"
2,2021-11-29 21:07:47.5,28.55,"CANARY ISLANDS, SPAIN REGION"
3,2021-11-29 21:07:13.1,17.86,NORTHERN CALIFORNIA
4,2021-11-29 21:01:01.0,28.59,"CERAM SEA, INDONESIA"
5,2021-11-29 21:00:29.5,17.81,"CANARY ISLANDS, SPAIN REGION"
6,2021-11-29 20:55:27.2,39.49,"CANARY ISLANDS, SPAIN REGION"
7,2021-11-29 20:48:43.4,122.94,PYRENEES
8,2021-11-29 20:45:38.7,2.91,STRAIT OF GIBRALTAR
9,2021-11-29 20:45:07.6,130.33,UTAH


## List all language names and number of related articles in the order they appear in wikipedia.org

In [41]:
# find url and store it in variable
url5 = 'https://www.wikipedia.org/'

# download html with a request, check response code 
response5=requests.get(url5)
response5.status_code

# create a beautiful soup
soup5 = BeautifulSoup(response5.content, 'html.parser')

In [42]:
print(soup4.prettify())

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en" xml:lang="en" xmlns="http://www.w3.org/1999/xhtml" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta content="srFzNKBTd0FbRhtnzP--Tjxl01NfbscjYwkp4yOWuQY" name="google-site-verification"/>
  <meta content="BCAA3C04C41AE6E6AFAF117B9469C66F" name="msvalidate.01"/>
  <meta content="43b36314ccb77957" name="y_key"/>
  <!-- 5-Clk8f50tFFdPTU97Bw7ygWE1A -->
  <meta content="en" http-equiv="Content-Language"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="all" name="robots"/>
  <meta content="earthquake,earthquakes,last earthquake,earthquake today,earthquakes today,earth quake,earth quakes,real time seismicity,seismic,seismicity,seismicity map,seismology,sismologie,EMSC,CSEM,seismicity on google earth,sumatra,tsunami,tsunamis,map,maps,richter,mercalli,moment tensors,epicenter,magnitude,seismology,foreshock,aftersho

In [84]:
soup5.select(".central-featured ")
soup5.select(".langlist ")


[<div class="langlist langlist-large hlist" data-el-section="secondary links">
 <ul>
 <li><a href="//pl.wikipedia.org/" lang="pl">Polski</a></li>
 <li><a href="//ar.wikipedia.org/" lang="ar" title="Al-ʿArabīyah"><bdi dir="rtl">العربية</bdi></a></li>
 <li><a href="//de.wikipedia.org/" lang="de">Deutsch</a></li>
 <li><a href="//en.wikipedia.org/" lang="en" title="English">English</a></li>
 <li><a href="//es.wikipedia.org/" lang="es">Español</a></li>
 <li><a href="//fr.wikipedia.org/" lang="fr">Français</a></li>
 <li><a href="//it.wikipedia.org/" lang="it">Italiano</a></li>
 <li><a href="//arz.wikipedia.org/" lang="arz" title="Maṣrī"><bdi dir="rtl">مصرى</bdi></a></li>
 <li><a href="//nl.wikipedia.org/" lang="nl">Nederlands</a></li>
 <li><a href="//ja.wikipedia.org/" lang="ja" title="Nihongo">日本語</a></li>
 <li><a href="//pt.wikipedia.org/" lang="pt">Português</a></li>
 <li><a href="//ru.wikipedia.org/" lang="ru" title="Russkiy">Русский</a></li>
 <li><a href="//ceb.wikipedia.org/" lang="ceb

## A list with the different kind of datasets available in data.gov.uk

In [28]:
# find url and store it in variable
url = 'https://data.gov.uk/'

## Display the top 10 languages by number of native speakers stored in a pandas dataframe:

In [29]:
# find url and store it in variable
url = 'https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers'