# In [18]:
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

def get_url(url):
    """Collect company-page URLs from an annualreports.com listing page.

    The page is fetched through a session that retries HTTPS requests
    answered with 429/500/502/503/504 (up to 5 retries, with backoff).
    Every ``<span class="companyName">`` on the page is then searched for
    its first ``<a>`` element, whose ``href`` is appended to the site root.

    Args:
        url: Address of the listing page to fetch.

    Returns:
        A list of absolute company-page URLs. The list is empty when the
        request fails or the response status is not 200.
    """
    # The context manager releases the session's pooled connections when
    # the request is done instead of leaving one open session per call.
    with requests.Session() as session:
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException as exc:
            # Timeouts, connection errors and an exhausted retry budget all
            # surface here; report them like any other failed fetch.
            print(f'无法访问网页: {url} ({exc})')
            return []

    if response.status_code != 200:
        print(f'无法访问网页，状态码: {response.status_code}')
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    company_links = [span.find('a') for span in soup.find_all('span', {'class': 'companyName'})]
    # Skip spans without an anchor and anchors without an href; indexing
    # link['href'] on the latter would raise KeyError.
    return [
        'https://www.annualreports.com' + link['href']
        for link in company_links
        if link is not None and link.get('href')
    ]

def get_info(url):
    """Scrape one company's profile fields from its annualreports.com page.

    The page is fetched through a session that retries HTTPS requests
    answered with 429/500/502/503/504 (up to 5 retries, with backoff).

    Args:
        url: Absolute URL of a company page.

    Returns:
        A dict with the keys ``Name``, ``Ticker``, ``Market``, ``Industry``
        and ``Sector`` (all strings), or ``None`` when the page cannot be
        fetched, answers with a non-200 status, or lacks one of the
        elements the fields are read from.
    """
    # The context manager releases the session's pooled connections when
    # the request is done instead of leaving one open session per call.
    with requests.Session() as session:
        retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])
        session.mount('https://', HTTPAdapter(max_retries=retries))
        try:
            response = session.get(url, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not abort a long scraping run.
            print(f'无法访问网页: {url} ({exc})')
            return None

    if response.status_code != 200:
        print(f'无法访问网页，状态码: {response.status_code}')
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        # Looked up once; Industry and Sector are read from the text node
        # following the third and fourth 'blue_txt' spans respectively.
        blue_labels = soup.find_all('span', class_='blue_txt')
        info = {
            'Name': soup.find('div', class_='vendor_name').get_text(strip=True),
            'Ticker': soup.find('span', class_='ticker_name').text.strip(),
            # The div's text also contains the literal words 'Exchange'
            # and 'More', which are stripped out of the value.
            'Market': soup.find('div', class_='right').get_text(strip=True).replace('Exchange', '').replace('More', ''),
            'Industry': blue_labels[2].find_next_sibling(string=True).strip(),
            'Sector': blue_labels[3].find_next_sibling(string=True).strip(),
        }
    except (AttributeError, IndexError):
        # A lookup returned None or there were fewer than four labels:
        # the page does not have the expected layout, so skip it.
        print(f'页面解析失败，已跳过: {url}')
        return None
    return info

# Company-page URLs per listing page. The `exch` query value selects the
# exchange; the variable names presumably reflect which one each id maps
# to (verify against the site).
NYS_list = get_url('https://www.annualreports.com/Companies?exch=1')
LSE_list = get_url('https://www.annualreports.com/Companies?exch=9')
NAS_list = get_url('https://www.annualreports.com/Companies?exch=2')
ASX_list = get_url('https://www.annualreports.com/Companies?exch=7')
TSE_list = get_url('https://www.annualreports.com/Companies?exch=5')

all_urls = NYS_list + LSE_list + NAS_list + ASX_list + TSE_list

# Named `records` rather than `list`: rebinding the builtin at module level
# would break every later `list(...)` call in the same interpreter session.
records = []
for url in tqdm(all_urls, desc='Processing companies'):
    info = get_info(url)
    # get_info yields a falsy value for pages it could not scrape.
    if info:
        records.append(info)

# One row per scraped company, written without the index column.
df = pd.DataFrame(records)
df.to_csv('Companies_Industry.csv', index=False)


# Output: Processing companies: 100%|██████████| 8787/8787 [1:01:26<00:00,  2.38it/s]
