# Webscraping

In [None]:
import os
import re
from io import StringIO
from urllib.parse import urljoin

import pandas as pd
import requests as req
from bs4 import BeautifulSoup

### Simple request to retrieve webpage title

In [None]:
# Fetch a single web page and print the text of its <title> tag.
url = 'https://www.scwcsu.nhs.uk/about/our-values'

# Without a timeout, requests waits indefinitely if the server never answers.
response = req.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')

    # soup.title is None when the page has no <title> tag, so guard the lookup
    # instead of letting `.string` raise an AttributeError.
    page_title = soup.title.string if soup.title else None
    print('Webpage title:', page_title)

else:
    print(f'Failed to fetch webpage: {response.status_code}')

### Display the HTML for a page

In [None]:
# Print the parsed page as indented HTML.
# NOTE: `soup` is only assigned when the request in the previous cell returned
# status 200, so this cell raises a NameError if that fetch failed.
print(soup.prettify()) # i.e. accessing the BeautifulSoup object created above

### Locate a .csv file on a webpage

In [None]:
# Find the first link on a publication page that points at a .csv file.
url2 = 'https://digital.nhs.uk/data-and-information/publications/statistical/out-of-area-placements-in-mental-health-services/march-2024'

response2 = req.get(url2, timeout=30)  # timeout so the cell cannot hang forever

if response2.status_code == 200:
    soup2 = BeautifulSoup(response2.content, 'html.parser')

    # Match on the '.csv' extension (including the dot, case-insensitively).
    # `href and ...` skips <a> tags that have no href attribute at all.
    csv_link = soup2.find("a", href=lambda href: href and href.lower().endswith('.csv'))

    if csv_link is None:
        # find() returns None when nothing matches; report it rather than
        # failing with a TypeError on csv_link["href"].
        print("No .csv file found on:", url2)
    else:
        # hrefs can be relative to the page, so resolve against the page URL
        # (absolute hrefs are returned unchanged by urljoin).
        file_url = urljoin(url2, csv_link["href"])
        print("Found .csv file:", file_url)

else:
    print(f'Failed to fetch webpage: {response2.status_code}')

### Check the response and download the file to the current directory

In [None]:
# Download the file located in the previous cell into the current directory.
file_name = file_url.split("/")[-1]  # the file name is the part of the URL after the last "/"

# Use a timeout so a stalled download cannot block the notebook indefinitely.
file_response = req.get(file_url, timeout=30)

if file_response.status_code == 200:
    # Write the raw bytes; "wb" avoids any newline or encoding translation.
    with open(file_name, "wb") as file:
        file.write(file_response.content)
    print(f"Downloaded: {file_name}")
else:
    print(f"Failed to download: {file_url}")

### Read the .csv directly into a Pandas dataframe

In [None]:
# Load the CSV fetched by the download cell straight from memory into pandas,
# without re-reading the copy that was saved to disk.

# Stop here if that download returned an HTTP error (4xx/5xx); otherwise the
# error page would be parsed as though it were CSV data.
file_response.raise_for_status()

csv_content = StringIO(file_response.text)  # wrap the decoded text in a file-like object

df = pd.read_csv(csv_content)

df.head()

### Download multiple files of a particular type from a webpage

In [None]:
# Download every .xlsx file linked from a page into the current directory.
url3 = 'https://www.england.nhs.uk/statistics/statistical-work-areas/serious-mental-illness-smi/'

response3 = req.get(url3, timeout=30)  # timeout so the cell cannot hang forever

if response3.status_code == 200:
    soup3 = BeautifulSoup(response3.content, "html.parser")

    for link in soup3.find_all("a", href=True):
        # hrefs can be relative, so resolve each one against the page URL
        # before requesting it (absolute hrefs pass through unchanged).
        file_url = urljoin(url3, link["href"])

        if not file_url.lower().endswith('.xlsx'):  # only keep .xlsx links
            continue

        print("Found .xlsx file:", file_url)

        file_name = file_url.split("/")[-1]  # everything after the last "/"
        file_response = req.get(file_url, timeout=30)

        if file_response.status_code == 200:
            # Save the file to the current directory
            with open(file_name, "wb") as file:
                file.write(file_response.content)
            print(f"Downloaded: {file_name}")
        else:
            print(f"Failed to download: {file_url}")

else:
    # Report the status of *this* page's request (response3).
    print(f"Failed to fetch webpage: {response3.status_code}")

### Scrape information from a table on a webpage

For example, if you wanted to scrape a publication schedule, or in this case a submission schedule for Providers' MHSDS data

In [None]:
# Scrape the HTML tables from a page and load the first one into a DataFrame.
url4 = 'https://digital.nhs.uk/data-and-information/data-collections-and-data-sets/data-sets/mental-health-services-data-set/submit-data'

response4 = req.get(url4, timeout=30)  # timeout so the cell cannot hang forever

# Fail with a clear message: everything below needs the parsed page, and
# carrying on after a failed fetch would only surface as a NameError on soup4.
if response4.status_code != 200:
    raise RuntimeError(f'Failed to fetch webpage: {response4.status_code}')

soup4 = BeautifulSoup(response4.content, "html.parser")

tables = soup4.find_all('table')  # a BeautifulSoup ResultSet holding every <table> element
print(type(tables))

# Convert the ResultSet to an HTML string and let pandas parse it. The string
# is wrapped in StringIO because passing literal HTML to read_html is
# deprecated (pandas >= 2.1). [0] selects the first table that was parsed.
table_df = pd.read_html(StringIO(str(tables)))[0]

table_df

Doesn't appear to cover the whole year. Is there another section to the table?

In [None]:
# Check how many individual tables pandas parses from the scraped HTML.
# StringIO is used because literal-HTML input to read_html is deprecated (pandas >= 2.1).
len(pd.read_html(StringIO(str(tables))))

In [None]:
# Return the second table parsed from the scraped HTML.
# StringIO is used because literal-HTML input to read_html is deprecated (pandas >= 2.1).
table_df2 = pd.read_html(StringIO(str(tables)))[1]

table_df2

### Using urljoin to construct URLs

This will extract any .csv files for the calendar year 2024. Each file is published on its own web page, so urljoin is used to construct the URLs dynamically (i.e. so that you don't have to hard-code all the individual web pages).

The package "re" is imported so that regular expression logic can be used to select the URLs, i.e. any link whose final path segment matches the pattern of the regular expression will be considered a web page of interest. (NOTE: you do not need to install "re"; it is part of Python's standard library.)

It's been limited to 2024 files to reduce the amount of data being transferred, but you could use a different regular expression to cover more.

In [None]:
# Crawl the sub-pages of a publication series and download every .csv file
# linked from the pages whose final URL segment matches `dynamic_section`.
url5 = 'https://digital.nhs.uk/data-and-information/publications/statistical/learning-disabilities-health-check-scheme'

target_urls = []                           # filled below with the full URL of each matching sub-page

dynamic_section = r'^england-[a-z]+-2024$' # the regular expression for the URLs we are interested in. note that the $ implies that you don't want anything else to follow.

response5 = req.get(url5, timeout=30)      # get the response from the base URL (timeout so the cell cannot hang)

if response5.status_code == 200:
    soup5 = BeautifulSoup(response5.content, "html.parser")

    # Pass 1: collect the sub-pages whose last path segment matches the pattern.
    for link in soup5.find_all('a', href=True):
        sublink = link["href"]
        if re.match(dynamic_section, sublink.split('/')[-1]):
            full_url = urljoin(url5, sublink)       # resolve the (possibly relative) href against the base URL
            if full_url not in target_urls:         # the same page may be linked more than once; fetch it only once
                target_urls.append(full_url)

    # Pass 2: visit each sub-page and download the .csv files it links to.
    # Distinct names (page_*/file_link) are used so that neither the outer
    # loop variable nor the `response`/`soup` objects created earlier in the
    # notebook are overwritten.
    for page_url in target_urls:
        page_response = req.get(page_url, timeout=30)

        if page_response.status_code != 200:
            # Report the failure instead of silently skipping the page.
            print(f'Failed to fetch webpage: {page_url} ({page_response.status_code})')
            continue

        page_soup = BeautifulSoup(page_response.content, "html.parser")

        for file_link in page_soup.find_all("a", href=True):
            # hrefs can be relative, so resolve against the page they came from.
            file_url = urljoin(page_url, file_link['href'])

            if not file_url.lower().endswith('.csv'):   # only keep .csv links
                continue

            print("Found .csv file:", file_url)

            file_name = file_url.split("/")[-1]         # everything after the last "/"
            file_response = req.get(file_url, timeout=30)

            if file_response.status_code == 200:
                with open(file_name, "wb") as file:     # save the file to the current directory
                    file.write(file_response.content)
                print(f"Downloaded: {file_name}")
            else:
                print(f"Failed to download: {file_url}")

else:
    # Pairs with the status check on the base URL, so report response5.
    print(f'Failed to fetch webpage: {response5.status_code}')