In [2]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import os
from io import BytesIO

def get_url(url):
    """Collect the company page URLs linked from a listing page.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of absolute URLs, one for every ``<span class="companyName">``
        that wraps an ``<a>`` carrying a non-empty ``href``. The list is empty
        when the response status is not 200.

    Raises:
        requests.RequestException: Not caught here; propagates from
            ``session.get`` (e.g. timeout, or retries exhausted).
    """
    # Retry rate-limit (429) and 5xx responses, backing off between attempts.
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    # The context manager releases the session's pooled connections on exit;
    # the response body is already fully read by then (no streaming).
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))
        response = session.get(url, timeout=10)

    if response.status_code != 200:
        print(f'无法访问网页，状态码: {response.status_code}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    company_urls = []
    for span in soup.find_all('span', {'class': 'companyName'}):
        link = span.find('a')
        # Skip spans without an anchor and anchors without an href instead of
        # raising KeyError and losing the whole listing.
        href = link.get('href') if link else None
        if href:
            company_urls.append('https://www.annualreports.com' + href)
    return company_urls

def extract_pdf_url(html_content):
    """Return the absolute URL behind the page's ``btn_form_10k`` anchor.

    Args:
        html_content: HTML markup of a company page, as accepted by
            ``BeautifulSoup``.

    Returns:
        ``'https://www.annualreports.com'`` concatenated with the ``href`` of
        the first ``<a class="btn_form_10k">`` element, or ``None`` when no
        such anchor exists or it has no ``href`` attribute.
    """
    button = BeautifulSoup(html_content, 'html.parser').find('a', {'class': 'btn_form_10k'})

    # Guard clauses: nothing to return without an anchor that carries an href.
    if not button:
        return None
    if 'href' not in button.attrs:
        return None

    return 'https://www.annualreports.com' + button['href']

def download_and_save_pdf(url):
    """Download the PDF linked from a company page to ``<ticker>/<ticker>.pdf``.

    The ticker is read from the page's ``<span class="ticker_name">`` and is
    used both as the directory name and as the file stem. Nothing is
    downloaded when the target file already exists.

    Args:
        url: Address of the company page to fetch.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        requests.RequestException: Not caught here; propagates from
            ``session.get`` (e.g. timeout, or retries exhausted).
        OSError: If the directory or file cannot be created or written.
    """
    # Retry rate-limit (429) and 5xx responses, backing off between attempts.
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    # One session serves both requests; the context manager closes it on exit.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retries))

        response = session.get(url, timeout=10)
        if response.status_code != 200:
            print(f'无法访问网页，状态码: {response.status_code}')
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        ticker_tag = soup.find('span', class_='ticker_name')
        ticker = ticker_tag.text.strip() if ticker_tag else ''

        # The ticker comes from remote HTML and becomes a path component:
        # neutralise separators so it cannot address nested or parent
        # directories (a ticker containing '/' also made open() fail before).
        ticker = ticker.replace('/', '_').replace('\\', '_')
        if ticker in ('', '.', '..'):
            print(f'未找到有效的股票代码，跳过: {url}')
            return

        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(ticker, exist_ok=True)

        pdf_url = extract_pdf_url(response.content)
        if not pdf_url:
            return

        pdf_filename = ticker + '.pdf'
        pdf_path = os.path.join(ticker, pdf_filename)

        if os.path.exists(pdf_path):
            print(f'文件已存在{pdf_path}，跳过: {pdf_filename}')
            return

        pdf_response = session.get(pdf_url, timeout=10)
        # Never persist an error page as a ".pdf": later runs would treat the
        # bogus file as already downloaded and skip it forever.
        if pdf_response.status_code != 200:
            print(f'无法下载 PDF，状态码: {pdf_response.status_code}')
            return

        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(pdf_response.content)

        print(f'已下载并保存 PDF: {pdf_filename} 在 {pdf_path}')

# Fetch the company page URLs from the exchange listing (exch=9; presumably the
# London Stock Exchange, judging by the variable name - TODO confirm), then
# download each company's PDF.
LSE_list = get_url('https://www.annualreports.com/Companies?exch=9')

for url in LSE_list:
    try:
        download_and_save_pdf(url)
    except (requests.RequestException, OSError) as exc:
        # One unreachable page or unwritable file must not abort the
        # remaining downloads; report it and move on to the next company.
        print(f'处理失败，跳过 {url}: {exc}')


文件已存在SPA/SPA.pdf，跳过: SPA.pdf
文件已存在III/III.pdf，跳过: III.pdf
文件已存在3IN/3IN.pdf，跳过: 3IN.pdf
文件已存在4BB/4BB.pdf，跳过: 4BB.pdf
文件已存在DDDD/DDDD.pdf，跳过: DDDD.pdf
文件已存在FOUR.L/FOUR.L.pdf，跳过: FOUR.L.pdf
文件已存在SIXH/SIXH.pdf，跳过: SIXH.pdf
文件已存在PLS/PLS.pdf，跳过: PLS.pdf
文件已存在888/888.pdf，跳过: 888.pdf
文件已存在BAG/BAG.pdf，跳过: BAG.pdf
文件已存在ABDP.L/ABDP.L.pdf，跳过: ABDP.L.pdf
文件已存在AAIF/AAIF.pdf，跳过: AAIF.pdf
文件已存在ADIG/ADIG.pdf，跳过: ADIG.pdf
文件已存在AEMC/AEMC.pdf，跳过: AEMC.pdf
文件已存在AJIT/AJIT.pdf，跳过: AJIT.pdf
文件已存在ASEI/ASEI.pdf，跳过: ASEI.pdf
文件已存在ASLI/ASLI.pdf，跳过: ASLI.pdf
文件已存在ASL/ASL.pdf，跳过: ASL.pdf
文件已存在ABDN/ABDN.pdf，跳过: ABDN.pdf
文件已存在ACC/ACC.pdf，跳过: ACC.pdf
文件已存在ACSO.L/ACSO.L.pdf，跳过: ACSO.L.pdf
文件已存在ACRL.L/ACRL.L.pdf，跳过: ACRL.L.pdf
文件已存在AXS/AXS.pdf，跳过: AXS.pdf
文件已存在AEG/AEG.pdf，跳过: AEG.pdf
文件已存在ACT/ACT.pdf，跳过: ACT.pdf
文件已存在ADA/ADA.pdf，跳过: ADA.pdf
文件已存在ADT/ADT.pdf，跳过: ADT.pdf
文件已存在ADME/ADME.pdf，跳过: ADME.pdf
文件已存在ABV/ABV.pdf，跳过: ABV.pdf
文件已存在AMS.L/AMS.L.pdf，跳过: AMS.L.pdf
文件已存在AVO.L/AVO.L.pdf，跳过: AVO.L.pdf
文件已存在AEO/AEO.pdf，跳过: AE