# Webscraping

In [1]:
from bs4 import BeautifulSoup

import requests as req

import os

import pandas as pd

from urllib.parse import urljoin

### Simple request to retrieve webpage title

In [2]:
url = 'https://www.scwcsu.nhs.uk/about/our-values'

response = req.get(url)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    print('Webpage title:', soup.title.string)

else:
    print(f'Failed to fetch webpage: {response.status_code}')

Webpage title: Our values - NHS SCW Support and Transformation for Health and Care


### Display the HTML for a page

In [3]:
print(soup.prettify()) # i.e. accessing the BeautifulSoup object created above

<!DOCTYPE html>
<html dir="ltr" lang="en-gb">
 <head>
  <!-- Google tag (gtag.js) -->
  <script async="" src="https://www.googletagmanager.com/gtag/js?id=G-TGFSCM0BEC">
  </script>
  <script>
   window.dataLayer = window.dataLayer || [];
		function gtag(){dataLayer.push(arguments);}
		gtag('js', new Date());

		gtag('config', 'G-TGFSCM0BEC');
  </script>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <meta charset="utf-8"/>
  <meta content="NHS SCW values" name="description"/>
  <meta content="HELIX_ULTIMATE_GENERATOR_TEXT" name="generator"/>
  <title>
   Our values - NHS SCW Support and Transformation for Health and Care
  </title>
  <link href="/images/scwfavicon.ico" rel="icon" type="image/vnd.microsoft.icon"/>
  <link href="https://www.scwcsu.nhs.uk/scw-search?format=opensearch" rel="search" title="OpenSearch NHS SCW Support and Transformation for Health and Care" type="application/opensearchdescription+xml"/>
  <link href="/media/vendor

### Locate a .csv file on a webpage

In [4]:
url2 = ('https://digital.nhs.uk/data-and-information/publications/statistical/out-of-area-placements-in-mental-health-services/march-2024') 

response2 = req.get(url2)

if response2.status_code == 200:
    soup2 = BeautifulSoup(response2.content, 'html.parser')
    csv_link = soup2.find("a", href=lambda href: href and href.endswith('csv'))
    file_url = csv_link["href"]
    print("Found .csv file:", file_url)

else:
    print(f'Failed to fetch webpage: {response2.status_code}')

Found .csv file: https://files.digital.nhs.uk/32/0B358C/oaps-open-data-mar-2024.csv


### Check the response and download the file to the current directory

In [5]:
file_name = file_url.split("/")[-1]  # Extract the file name from the URL i.e. the bit after the last "/"
file_response = req.get(file_url)

if file_response.status_code == 200:
    # Save the file to the current directory
    with open(f'{file_name}', "wb") as file:
        file.write(file_response.content)
    print(f"Downloaded: {file_name}")
else:
    print(f"Failed to download: {file_url}")

Downloaded: oaps-open-data-mar-2024.csv


### Read the .csv directly into a Pandas dataframe

In [6]:
from io import StringIO                     # io is native to Python

csv_content = StringIO(file_response.text)  # convert the context to a file-like object

df = pd.read_csv(csv_content)

df.head()

  df = pd.read_csv(csv_content)


Unnamed: 0,Grouping,PublicationPeriod,PublicationDate,Question,Breakdown1,Breakdown1Code,Breakdown1Description,Breakdown2,Breakdown2Code,Breakdown2Description,Value
0,Month,01/03/2024-31/03/2024,2024/06,Average recorded daily cost over the period,BedType,10,Acute adult mental health care,,,,695
1,Month,01/03/2024-31/03/2024,2024/06,Lower quartile daily cost over the period,BedType,10,Acute adult mental health care,,,,576
2,Month,01/03/2024-31/03/2024,2024/06,Number of OAPs active during the period with a...,BedType,10,Acute adult mental health care,,,,280
3,Month,01/03/2024-31/03/2024,2024/06,Number of OAPs active during the period with a...,BedType,10,Acute adult mental health care,,,,195
4,Month,01/03/2024-31/03/2024,2024/06,Number of OAPs active during the period with a...,BedType,10,Acute adult mental health care,,,,50


### Download multiple files of a particular type from a webpage

In [7]:
url3 = ('https://www.england.nhs.uk/statistics/statistical-work-areas/serious-mental-illness-smi/')

response3 = req.get(url3)

if response3.status_code == 200:
    soup3 = BeautifulSoup(response3.content, "html.parser")
    
  
    for link in soup3.find_all("a", href=True):
        file_url = link["href"]
        if file_url.endswith(('.xlsx')):  # Check for .xlsx file extensions
            print("Found .xlsx file:", file_url)
            
            # Step 4: Download the file
            file_name = file_url.split("/")[-1]  # Extract the file name from the URL i.e. everything after the last /
            file_response = req.get(file_url)
            
            if file_response.status_code == 200:
                # Save the file to the current directory
                with open(file_name, "wb") as file:
                    file.write(file_response.content)
                print(f"Downloaded: {file_name}")
            else:
                print(f"Failed to download: {file_url}")
    
else:
    print(f"Failed to fetch webpage: {response.status_code}")

Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2024/05/PH-SMI-Q4-2023-24.xlsx
Downloaded: PH-SMI-Q4-2023-24.xlsx
Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2024/02/Physical-Health-Checks-SMI-Q3-2023-24.xlsx
Downloaded: Physical-Health-Checks-SMI-Q3-2023-24.xlsx
Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2023/11/Physical-Health-Checks-SMI-Q2-2023-24.xlsx
Downloaded: Physical-Health-Checks-SMI-Q2-2023-24.xlsx
Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2023/11/Physical-Health-Checks-SMI-Q1-2023-24-Revised-v2.xlsx
Downloaded: Physical-Health-Checks-SMI-Q1-2023-24-Revised-v2.xlsx
Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/uploads/sites/2/2023/05/Physical-Health-Checks-SMI-Q4-2022-23-1.xlsx
Downloaded: Physical-Health-Checks-SMI-Q4-2022-23-1.xlsx
Found .xlsx file: https://www.england.nhs.uk/statistics/wp-content/up

### Scrape information from a table on a webpage

For example, if you wanted to scrape a publication schedule, or in this case a submission schedule for Providers' MHSDS data

In [8]:
url4 = ('https://digital.nhs.uk/data-and-information/data-collections-and-data-sets/data-sets/mental-health-services-data-set/submit-data')

response4 = req.get(url4)

if response4.status_code == 200:
    soup4 = BeautifulSoup(response4.content, "html.parser")

tables = soup4.find_all('table') # 
print(type(tables))

table_df = pd.read_html(str(tables))[0] # get the first item in the beautifulsoup ResultSet, convert it into a string, and read the html into a pandas DataFrame

table_df

<class 'bs4.element.ResultSet'>


  table_df = pd.read_html(str(tables))[0] # get the first item in the beautifulsoup ResultSet, convert it into a string, and read the html into a pandas DataFrame


Unnamed: 0,Performance month,Performance month restrictive interventions,Updates/Resubmissions,Cut of data taken at 11:59:59pm
0,April,Not applicable,Not applicable,17 June 2024
1,May,April,April1,26 June 2024
2,June,May,"April, May1",24 July 2024
3,July,June,"April, May, June1",27 August 2024
4,August,July,"April, May, June, July1",25 September 2024
5,September,August,"April, May, June, July August1",24 October 2024
6,October,September and October,"April, May, June, July, August, September1",26 November 2024


Doesn't appear to cover the whole year. Is there another section to the table?

In [9]:
len(pd.read_html((str(tables)))) # check how many individual tables have been found

  len(pd.read_html((str(tables)))) # check how many individual tables have been found


2

In [10]:
table_df2 = pd.read_html(str(tables))[1] # return the second item from the ResultSet


table_df2 

  table_df2 = pd.read_html(str(tables))[1] # return the second item from the ResultSet


Unnamed: 0,Performance month,Updates/Resubmissions,Cut of data taken at 11:59:59pm
0,November,"April, May, June, July, August, September, Oct...",20 December 2024
1,December,"April, May, June, July, August, September, Oct...",27 January 2025
2,January,"April, May, June, July, August, September, Oct...",25 February 2025
3,February,"April, May, June, July, August, September, Oct...",25 March 2025
4,March,"April, May, June, July, August, September, Oct...",To be confirmed


### Using urljoin to construct URLs

This will extract any .csv files for the calendar year 2024, which are all saved to their individual web pages, meaning that urljoin is required to construct the URLs dynamically (i.e. so that you don't have to hard code all the indidual web pages).

The package "re" is imported so that regular expression logic can be used in the construction of the URLs i.e. anything matching the patterm of the regular expression will be considered a web page of interest. (NOTE: you do not need to install "re", it is native to Python)

It's been limited to 2024 files to reduce the amount of data being transferred, but you could use a different regular expression to cover more.

In [11]:
from urllib.parse import urljoin
import re

url5 = 'https://digital.nhs.uk/data-and-information/publications/statistical/learning-disabilities-health-check-scheme'

target_urls = []                           # empty list that will later get filled with target URLs in a for loop.

dynamic_section = r'^england-[a-z]+-2024$' # the regular expression for the URLs we are interested in. note that the $ implies that you don't want anything else to follow.

response5 = req.get(url5)                  # get the response from the base URL

if response5.status_code == 200:
    soup5 = BeautifulSoup(response5.content, "html.parser")     # if there is a successful response, create a BeautifulSoup object.

    for link in soup5.find_all('a', href = True):
        sublink = link["href"]
        if re.match(dynamic_section,sublink.split('/')[-1]):
            full_url = urljoin(url5, sublink)                   # for each of the instances of the pattern we are looking for
            target_urls.append(full_url)                        # add the constructed full URL to a list of target URLs
        
    for link in target_urls:                                    # check for a successful response (code 200) from each URL
        response = req.get(link)                                # and create a BeautifulSoup object for each.
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            for link in soup.find_all("a", href=True):          # for each URL found on each of the pages in target_urls...
                file_url = link['href']                         

                if file_url.endswith(('.csv')):                 # ... check for .csv file extensions
                    print("Found .csv file:", file_url)

                    file_name = file_url.split("/")[-1]         # extract the file name from the URL i.e. everything after the last /
                    file_response = req.get(file_url)           # check the response for each file
            
                    if file_response.status_code == 200:        # if there's a successful response
                        
                        with open(file_name, "wb") as file:     # save the file to the current directory
                            file.write(file_response.content)
                        print(f"Downloaded: {file_name}")
                    else:
                        print(f"Failed to download: {file_url}")

else:
    print(f'Failed to fetch webpage: {response.status_code}')   # this else statement pairs with the original response code check for the base URL
                                                                # (see the first "if" in this code block)

Found .csv file: https://files.digital.nhs.uk/1E/56812A/learning-disabilities-health-check-scheme-eng-Sep-2024.csv
Downloaded: learning-disabilities-health-check-scheme-eng-Sep-2024.csv
Found .csv file: https://files.digital.nhs.uk/1C/BF3E28/learning-disabilities-health-check-scheme-eng-Aug-2024.csv
Downloaded: learning-disabilities-health-check-scheme-eng-Aug-2024.csv
Found .csv file: https://files.digital.nhs.uk/0C/DC2F3D/learning-disabilities-health-check-scheme-eng-Jul-2024.csv
Downloaded: learning-disabilities-health-check-scheme-eng-Jul-2024.csv
Found .csv file: https://files.digital.nhs.uk/BF/E282F0/learning-disabilities-health-check-scheme-eng-Jun-2024.csv
Downloaded: learning-disabilities-health-check-scheme-eng-Jun-2024.csv
Found .csv file: https://files.digital.nhs.uk/71/2A817F/learning-disabilities-health-check-scheme-eng-May-2024.csv
Downloaded: learning-disabilities-health-check-scheme-eng-May-2024.csv
Found .csv file: https://files.digital.nhs.uk/A7/B01710/learning-disab

In [12]:
urlpop = ('https://www.worldometers.info/world-population/population-by-country/')

responsepop = req.get(urlpop)

if responsepop.status_code == 200:
    souppop = BeautifulSoup(responsepop.content, "html.parser")

poptable = souppop.find_all('table') # 
print(type(poptable))

poptable_df = pd.read_html(str(poptable))[0] # get the first item in the beautifulsoup ResultSet, convert it into a string, and read the html into a pandas DataFrame

poptable_df

<class 'bs4.element.ResultSet'>


  poptable_df = pd.read_html(str(poptable))[0] # get the first item in the beautifulsoup ResultSet, convert it into a string, and read the html into a pandas DataFrame


Unnamed: 0,#,Country (or dependency),Population (2024),Yearly Change,Net Change,Density (P/Km²),Land Area (Km²),Migrants (net),Fert. Rate,Med. Age,Urban Pop %,World Share
0,1,India,1450935791,0.89 %,12866195,488,2973190,-630830,2.0,28,37 %,17.78 %
1,2,China,1419321278,-0.23 %,-3263655,151,9388211,-318992,1.0,40,66 %,17.39 %
2,3,United States,345426571,0.57 %,1949236,38,9147420,1286132,1.6,38,82 %,4.23 %
3,4,Indonesia,283487931,0.82 %,2297864,156,1811570,-38469,2.1,30,59 %,3.47 %
4,5,Pakistan,251269164,1.52 %,3764669,326,770880,-1401173,3.5,20,34 %,3.08 %
...,...,...,...,...,...,...,...,...,...,...,...,...
229,230,Montserrat,4389,-0.70 %,-31,44,100,-7,1.4,42,11 %,0.00 %
230,231,Falkland Islands,3470,-0.20 %,-7,0,12170,-13,1.7,42,68 %,0.00 %
231,232,Tokelau,2506,4.55 %,109,251,10,72,2.6,27,0 %,0.00 %
232,233,Niue,1819,0.11 %,2,7,260,10,2.5,36,44 %,0.00 %


In [14]:
poptable_df.to_csv('population.csv')