In [7]:
## Import relevant libraries:
from bs4 import BeautifulSoup
import requests, certifi
import pandas as pd
from datetime import datetime
import pytz
import os
import re
from requests.adapters import HTTPAdapter
from urllib3.util import Retry

def replaceValsWithNationalVals(variable_dict=None, localValsVar=None, nationalValsVar=None):
    """Backfill a missing local variable with its national equivalent, in place.

    ``variable_dict`` is a list of single-entry dicts of the form
    ``{variable_name: values}``. When no entry is keyed by ``localValsVar``
    but one is keyed by ``nationalValsVar``, a new entry
    ``{localValsVar: <national values>}`` is appended to the list.

    Args:
        variable_dict: List of dicts to inspect and extend. ``None`` or an
            empty list is accepted and leaves nothing to do.
        localValsVar: Key that should exist after the call.
        nationalValsVar: Key whose values are reused when the local key is
            missing.

    Returns:
        None. ``variable_dict`` is modified in place. The appended entry
        references the same values object as the national entry (no copy).
    """
    # The parameter defaults are None; iterating None would raise TypeError.
    if not variable_dict:
        return

    # A local entry already exists: never overwrite or duplicate it.
    if any(localValsVar in entry for entry in variable_dict):
        return

    # Use the first entry carrying the national key, if there is one.
    national_entry = next(
        (entry for entry in variable_dict if nationalValsVar in entry),
        None,
    )
    if national_entry is not None:
        variable_dict.append({localValsVar: national_entry[nationalValsVar]})

# setup working directory
base_path = '/content/'
print('*******')

# Site codes passed as the `org` query parameter; '[All]' is the all-sites page.
siteList = ['[All]', '7A2T', '7A5T', 'GWY', 'LLD', 'MOR', 'NEV']

# Site code -> value written to the "Healthboard" output column.
siteHBDict = {
    '7A2T': 'HDUHB',
    '7A5T': 'CTMUHB',
    'GWY': 'BCUHB',
    'LLD': 'CAVUHB',
    'MOR': 'SBUHB',
    'NEV': 'ABUHB',
    '[All]': 'All Wales'
}

# Chart page names; each one is scraped for every site and exported to its own CSV.
ChartList = ['DXAwithin12weeks', 'Investigationandtreatment']

# Accumulators, one per chart, extended with one frame per site.
DXA_df = pd.DataFrame()
Investigationandtreatment_df = pd.DataFrame()

retry_strategy = Retry(
    total=4,  # Maximum number of retries
    # Pause between attempts with an increasing delay. Without a backoff
    # factor the retries fire back-to-back, which does not help against a
    # server that is rate-limiting (429) or temporarily overloaded (5xx).
    backoff_factor=1,
    status_forcelist=[
        429,  # Too Many Requests
        500,  # Internal Server Error
        502,  # Bad Gateway
        503,  # Service Unavailable
        504,  # Gateway Timeout
    ],  # HTTP status codes to retry on
)
# Create an HTTP adapter with the retry strategy; it is mounted on the session below
adapter = HTTPAdapter(max_retries=retry_strategy)

# Timestamp of this run (UK local time), formatted as a string; it is written
# to every output row and used in the export directory name.
date_of_scrape = datetime.now(pytz.timezone("Europe/London"))
date_of_scrape = date_of_scrape.strftime("%Y-%m-%d %H:%M:%S")

# Create a new session object; the retry adapter applies to https:// URLs only
session = requests.Session()
session.mount('https://', adapter)

# Loop through each Site and Chart to scrape data

# JavaScript variable names captured from an individual site's chart page.
local_site_vars = (
    "var cats",
    "var dxah",
    "var bp",
    "var months",
    "var mnths",
    "var dxan",
    "var bpn",
    "var axa",
    "var falls",
    "var dxa75",
    "var axan",
    "var fallsn",
    "var dxa75n",
)

# For [All] sites, local variables are not available, only national equivalents.
national_only_vars = (
    "var mnths",
    "var dxan",
    "var bpn",
    "var axan",
    "var fallsn",
    "var dxa75n",
)

# (local variable, national variable) pairs used to fill the local columns
# of the [All] page from the national series.
national_fallbacks = (
    ('var cats', 'var mnths'),
    ('var dxah', 'var dxan'),
    ('var bp', 'var bpn'),
    ('var axa', 'var axan'),
    ('var falls', 'var fallsn'),
    ('var dxa75', 'var dxa75n'),
)

# JavaScript variable name -> output column name.
column_names = {
    "var cats": "Date Index Fracture Diagnosed Year & Month (Alt)",
    "var dxah": "Patients where a DXA was ordered or recommended and was completed within 12 weeks %",
    "var bp": "Patients offered Bone Protection medication %",
    "var months": "Date Index Fracture Diagnosed Year & Month",
    "var mnths": "Date Index Fracture Diagnosed Year & Month",
    "var dxan": "Patients where a DXA was ordered or recommended and was completed within 12 weeks National %",
    "var bpn": "Bone Protection Meds National %",
    "var axa": "FLS assessment <=90 days %",
    "var falls": "Patients offered/referred for falls risk assessment %",
    "var dxa75": "Patients<75 offered/undergone a DXA %",
    "var axan": "FLS assessment <=90 days National %",
    "var fallsn": "Falls assessment National %",
    "var dxa75n": "Patients<75 offered/undergone a DXA National %",
}

# Characters removed from the JavaScript array literal before splitting on commas.
array_punctuation = str.maketrans("", "", "[]'")

# Seconds to wait for the server; without a timeout a stalled connection
# would block the whole run indefinitely.
request_timeout = 30

for chart in ChartList:
    for site in siteList:
        url = f'https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/{chart}?opendocument&org={site}'
        print(url)

        # NOTE(review): verify=False disables TLS certificate validation.
        # certifi is imported at the top of the file but never used; consider
        # verify=certifi.where() once confirmed to work against this host.
        result = session.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            verify=False,
            timeout=request_timeout,
        )

        # An error response would otherwise be parsed as if it were a chart
        # page and silently contribute no data; report it and move on.
        if not result.ok:
            print(f"WARNING: HTTP {result.status_code} for {url} - skipping")
            print("---------------------------------------------------")
            continue

        ## Parse Content of Results Object
        doc = BeautifulSoup(result.text, "html.parser")

        # Find the Numeric data in the html code
        data = '\n'.join(script.text for script in doc.find_all('script', {'type': 'text/javascript'}))
        match_scripts = re.findall(r"(.*)( = )([^;]*)", data)

        # Convert List of Tuples to List of Dictionaries.
        # Each tuple has three parts:
        #   0 = Javascript variable name (i.e. var cats)
        #   1 = the assignment separator ' = '
        #   2 = the content of the variable.
        # Differentiate between [All] sites and individual sites
        wanted_vars = national_only_vars if site == '[All]' else local_site_vars
        variable_dict = []
        for name, _separator, content in match_scripts:
            if name in wanted_vars:
                values = content.translate(array_punctuation).strip().split(",")
                variable_dict.append({name: values})

        if site == '[All]':
            # Fill missing local variables with national equivalents
            for local_var, national_var in national_fallbacks:
                replaceValsWithNationalVals(variable_dict, local_var, national_var)

        # Convert List of Dictionaries to DataFrame.
        # Building from a dict of Series keeps every value: series of
        # different lengths are padded with NaN rather than being cut down to
        # the length of whichever variable happened to come first.
        columns = {}
        for var_idx, entry in enumerate(variable_dict):
            key = next(iter(entry))
            # A repeated variable name gets its position appended to stay unique.
            column_key = key if key not in columns else key + str(var_idx)
            columns[column_key] = pd.Series(entry[key])
        temp_df = pd.DataFrame(columns)

        # Rename Columns
        temp_df = temp_df.rename(columns=column_names)

        # Add SiteNameCode and Healthboard columns
        temp_df["SiteNameCode"] = f"FLS_{site}"
        temp_df["Healthboard"] = siteHBDict[site]
        ## Add Date of Scrape column
        temp_df["Date_of_Scrape"] = date_of_scrape

        # Append to relevant DataFrame
        if chart == 'DXAwithin12weeks':
            DXA_df = pd.concat([DXA_df, temp_df], ignore_index=True)
        elif chart == 'Investigationandtreatment':
            Investigationandtreatment_df = pd.concat([Investigationandtreatment_df, temp_df], ignore_index=True)

        print("---------------------------------------------------")

# Treat cells that are empty or whitespace-only as missing values in both dataframes
blank_cell_pattern = r'^\s*$'
DXA_df = DXA_df.replace(blank_cell_pattern, pd.NA, regex=True)
Investigationandtreatment_df = Investigationandtreatment_df.replace(
    blank_cell_pattern, pd.NA, regex=True
)

# Final column order for the Investigation and treatment export
investigation_column_order = [
    "Date Index Fracture Diagnosed Year & Month",
    "FLS assessment <=90 days %",
    "FLS assessment <=90 days National %",
    "Patients offered/referred for falls risk assessment %",
    "Falls assessment National %",
    "Patients offered Bone Protection medication %",
    "Bone Protection Meds National %",
    "Patients<75 offered/undergone a DXA %",
    "Patients<75 offered/undergone a DXA National %",
    "SiteNameCode",
    "Healthboard",
    "Date_of_Scrape",
]

# Final column order for the DXA export
dxa_column_order = [
    "Date Index Fracture Diagnosed Year & Month",
    "Patients where a DXA was ordered or recommended and was completed within 12 weeks %",
    "Patients where a DXA was ordered or recommended and was completed within 12 weeks National %",
    "SiteNameCode",
    "Healthboard",
    "Date_of_Scrape",
]

# Keep only the listed columns, in the listed order (raises KeyError if one is absent)
Investigationandtreatment_df = Investigationandtreatment_df[investigation_column_order]
DXA_df = DXA_df[dxa_column_order]

# Output directory is named after the scrape timestamp and created if missing
directory_path = 'FLS_Scrape_' + date_of_scrape
os.makedirs(directory_path, exist_ok=True)

# One CSV per chart, named after the chart
dxa_file_path = os.path.join(directory_path, ChartList[0] + '.csv')
IandT_file_path = os.path.join(directory_path, ChartList[1] + '.csv')

# Write each DataFrame to its CSV file (UTF-8, without the index column)
for frame, target_path in (
    (DXA_df, dxa_file_path),
    (Investigationandtreatment_df, IandT_file_path),
):
    frame.to_csv(target_path, encoding='utf-8', index=False)


*******
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=[All]




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=7A2T
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=7A5T




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=GWY
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=LLD




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=MOR
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/DXAwithin12weeks?opendocument&org=NEV




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=[All]
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=7A2T




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=7A5T
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=GWY




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=LLD
---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=MOR




---------------------------------------------------
https://www.fffap.org.uk/FLS/charts.nsf/vwPcharts/Investigationandtreatment?opendocument&org=NEV
---------------------------------------------------
