In [None]:
import requests
import re
import os
from urllib.parse import urljoin
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd

def get_automotive_industry():
    """Collect the distinct industry page URLs reachable from every sector page.

    Despite its name, this function does not filter for automotive industries:
    it visits every sector listed on the sectors index, gathers the company
    links shown on each sector page, opens every company page and records the
    first industry link found there.

    Returns:
        list[str]: Distinct industry URLs in first-seen order.

    Raises:
        requests.RequestException: If the sectors index or any company page
            cannot be fetched (network error, timeout, or HTTP error status).
    """
    # Step 1: Get the urls for all sectors.
    response = requests.get("https://sustainabilityreports.com/sectors/", timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    sector_href = re.compile(r'https://sustainabilityreports.com/sector/[a-z\\-]+')
    sectors_url = [anchor['href'] for anchor in soup.find_all('a', href=sector_href)]

    # Step 2: Visit every company of every sector and record its industry link.
    industry_href = re.compile(r'https://sustainabilityreports.com/industry/[a-z\\-]+/')
    industry_link = []
    for sector_url in sectors_url:
        for company in _collect_company_links(sector_url):
            response = requests.get(company, timeout=30)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')
            industry = soup.find("a", href=industry_href)
            if industry is None:
                # Company page without an industry link: nothing to record.
                continue
            if industry['href'] not in industry_link:
                industry_link.append(industry['href'])
    return industry_link


def _collect_company_links(page_url, scroll_limit=2000, pause_seconds=4):
    """Scroll a listing page in headless Chrome and return the company links seen.

    The page is scrolled to the bottom repeatedly until its height stops
    growing or ``scroll_limit`` scrolls have been made. Browser errors are
    printed rather than raised, so a partial (possibly empty) set can be
    returned.

    Args:
        page_url: URL of the listing page to open.
        scroll_limit: Maximum number of scrolls; a falsy value disables the cap.
        pause_seconds: Seconds to wait after each scroll before re-measuring.

    Returns:
        set[str]: The ``href`` values of all matching company anchors.
    """
    # NOTE(review): the slug pattern only admits a-z, backslash and '-', so a
    # company slug containing digits would presumably be skipped — confirm
    # against the site before relying on completeness.
    company_href = re.compile(r'https://sustainabilityreports.com/company/[a-z\\-]+/')
    companies = set()
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        # Inside the try so the browser is always quit, even if this call fails.
        driver.maximize_window()
        driver.get(page_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="https://sustainabilityreports.com/company/"]'))
        )
        scroll_count = 0
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            if scroll_limit and scroll_count >= scroll_limit:
                print(f"Reached maximum scroll limit of {scroll_limit}")
                break
            # Harvest what is currently rendered before asking for more.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            for item in soup.find_all("a", href=company_href):
                companies.add(item['href'])
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause_seconds)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height unchanged after scrolling: no more content was loaded.
                break
            last_height = new_height
            scroll_count += 1
    except TimeoutException:
        print("Timeout waiting for elements to load")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()
    return companies

In [1]:
import requests
import re
import os
from urllib.parse import urljoin
import shutil
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException, NoSuchElementException
from bs4 import BeautifulSoup
import time
import pandas as pd

In [2]:
# Fetch the sectors index and keep the href of every anchor that points at a
# sector page. A timeout is set because requests waits indefinitely by default,
# which would hang the kernel if the site stops responding.
response = requests.get("https://sustainabilityreports.com/sectors/", timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
sector_href = re.compile(r'https://sustainabilityreports.com/sector/[a-z\\-]+')
sectors_url = [anchor['href'] for anchor in soup.find_all('a', href=sector_href)]

In [None]:
# For every sector page: scroll it in headless Chrome to load all company
# links, then open each company page and record the first industry link on it.
# `industry_link` accumulates the distinct industry URLs across all sectors.
industry_link = []
for sector_url in sectors_url:
    companies = set()
    print("Checking sector:", sector_url)
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        # Inside the try so the browser is always quit, even if this call fails.
        driver.maximize_window()
        driver.get(sector_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="https://sustainabilityreports.com/company/"]'))
        )
        scroll_count = 0
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_limit = 2000
        while True:
            if scroll_limit and scroll_count >= scroll_limit:
                print(f"Reached maximum scroll limit of {scroll_limit}")
                break
            # Harvest what is currently rendered before asking for more.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            items = soup.find_all("a", href=re.compile(r'https://sustainabilityreports.com/company/[a-z\\-]+/'))
            for item in items:
                companies.add(item['href'])
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height unchanged after scrolling: no more content was loaded.
                break
            last_height = new_height
            scroll_count += 1
    except TimeoutException:
        print("Timeout waiting for elements to load")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()
    companies = list(companies)
    # Restarts at 1 for every sector, so the progress line counts the
    # industries first discovered while processing this sector.
    counter = 1
    for company in companies:
        response = requests.get(company, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        industry = soup.find("a", href=re.compile(r'https://sustainabilityreports.com/industry/[a-z\\-]+/'))
        if industry is None:
            # Company page without an industry link: skip it instead of
            # crashing on industry['href'].
            continue
        if industry['href'] not in industry_link:
            industry_link.append(industry['href'])
            print(f"\rFound {counter} Distinct Industries", end="", flush=True)
            counter += 1
    print("\n")

Checking sector: https://sustainabilityreports.com/sector/transportation-and-warehousing/


Checking sector: https://sustainabilityreports.com/sector/utilities/
Found 2 Distinct Industries

Checking sector: https://sustainabilityreports.com/sector/wholesale-trade/
Found 18 Distinct Industries



In [None]:
# De-duplicate the collected industry URLs (first-seen order is kept) and
# append them, one per line, to the industry list on disk.
industry_link = pd.unique(pd.Series(industry_link)).tolist()
# Open the file once for the whole batch instead of once per link.
# Append mode is kept, so re-running this cell adds the links again.
with open("../sr_data/industry.txt", 'a') as file:
    for link in industry_link:
        file.write(link + "\n")

In [None]:
# Reload the industry URLs saved on disk, one URL per line with the newline removed.
with open("../sr_data/industry.txt", 'r') as industry_file:
    industry_link = [row.strip("\n") for row in industry_file]

# Reduce every URL to its bare slug: the text after "/industry/" without
# surrounding slashes.
industries = []
for url in industry_link:
    slug = url.split("/industry/")[1]
    industries.append(slug.strip("/"))

In [2]:
# Industry slugs treated as "automotive" by this notebook. Each entry is the
# path segment that follows /industry/ in a sustainabilityreports.com URL; the
# cell that builds `automotive_links` prefixes every slug with that base URL.
automotive_industries = [
    "motor-vehicle-parts-manufacturing",
    "motor-vehicle-manufacturing",
    "motor-vehicle-body-and-trailer-manufacturing",
    "automotive-repair-and-maintenance",
    "automotive-equipment-rental-and-leasing",
    "automotive-parts-accessories-and-tire-retailers",
    "motor-vehicle-and-motor-vehicle-parts-and-supplies-merchant-wholesalers"
]

In [3]:
# Turn every automotive industry slug into the URL of its industry listing page.
automotive_links = []
for slug in automotive_industries:
    automotive_links.append("https://sustainabilityreports.com/industry/" + slug)

In [4]:
# Collect every company listed under the automotive industry pages.
# `companies` and `download_dests` are parallel lists: download_dests[i] is the
# directory (../sr_data/<industry>/<company>) where the report of companies[i]
# will be stored. A company that appears under several industries is kept only
# for the first industry in which it is seen.
companies = []
download_dests = []
for automotive_url in automotive_links:
    print("Checking industry:", automotive_url)
    # Loop-invariant: the industry slug is the same for every company on the page.
    industry_slug = automotive_url.split('/industry/')[-1].strip('/')
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Chrome(options=options)
    try:
        # Inside the try so the browser is always quit, even if this call fails.
        driver.maximize_window()
        driver.get(automotive_url)
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="https://sustainabilityreports.com/company/"]'))
        )
        scroll_count = 0
        last_height = driver.execute_script("return document.body.scrollHeight")
        scroll_limit = 2000
        while True:
            if scroll_limit and scroll_count >= scroll_limit:
                print(f"Reached maximum scroll limit of {scroll_limit}")
                break
            # Harvest what is currently rendered before asking for more.
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            items = soup.find_all("a", href=re.compile(r'https://sustainabilityreports.com/company/[a-z\\-]+/'))
            for item in items:
                if item['href'] not in companies:
                    companies.append(item['href'])
                    # The matched href ends with '/', so strip it; otherwise the
                    # later dest + "/" + file_name join yields a '//' in the path.
                    company_slug = item['href'].split('/company/')[-1].strip('/')
                    download_dests.append(f"../sr_data/{industry_slug}/{company_slug}")
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                # Height unchanged after scrolling: no more content was loaded.
                break
            last_height = new_height
            scroll_count += 1
    except TimeoutException:
        print("Timeout waiting for elements to load")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    finally:
        driver.quit()

Checking industry: https://sustainabilityreports.com/industry/motor-vehicle-parts-manufacturing
Checking industry: https://sustainabilityreports.com/industry/motor-vehicle-manufacturing
Checking industry: https://sustainabilityreports.com/industry/motor-vehicle-body-and-trailer-manufacturing
Checking industry: https://sustainabilityreports.com/industry/automotive-repair-and-maintenance
Checking industry: https://sustainabilityreports.com/industry/automotive-equipment-rental-and-leasing
Checking industry: https://sustainabilityreports.com/industry/automotive-parts-accessories-and-tire-retailers
Checking industry: https://sustainabilityreports.com/industry/motor-vehicle-and-motor-vehicle-parts-and-supplies-merchant-wholesalers


In [5]:

def retrieve_url(url, email_address):
    """Resolve the PDF download link and a file name for one company page.

    The report title is read from the static HTML of ``url``; the download
    link is obtained by driving the page's download dialog with Selenium.

    This function uses the module-level ``driver`` (it is not a parameter) and
    quits that driver before returning, so the caller must create a fresh
    ``driver`` before every call.

    Args:
        url: Company page URL.
        email_address: Address typed into the dialog's email field when the
            dialog shows one.

    Returns:
        tuple[str, str]: ``(pdf_url, report_name)``. ``pdf_url`` is ``''`` when
        no download link could be obtained. ``report_name`` ends in ``.pdf``;
        when the report title cannot be read it falls back to the last path
        segment of ``url``.
    """
    pdf_url = ''
    report_name = ''
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        report = soup.find("a", href=re.compile(r'https://sustainabilityreports.com/reports/[a-z\\-]+'))
        if report is not None:
            title = report.text.strip()
            if title:
                report_name = title + '.pdf'
    except Exception as e:
        # Best effort: a missing title must not prevent the download itself.
        print(f"Could not read the report title from {url}: {e}")
    if not report_name:
        # An empty name would make the caller's target path a directory and the
        # save would fail, so derive a name from the URL instead.
        report_name = url.rstrip('/').split('/')[-1] + '.pdf'
    try:
        driver.get(url)
        # Step 1: Wait for the first "Download PDF" button to be clickable and click it.
        download_pdf_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//a[contains(text(), 'Download PDF') and contains(@class, 'wpdm-download-link')]")
            )
        )
        download_pdf_btn.click()
        print("\rClicked the initial Download PDF button.", end="", flush=True)

        # Wait for the iframe containing the popup to load.
        # The iframe is identified by "wpdm" appearing in its src attribute.
        iframe = WebDriverWait(driver, 15).until(
            EC.presence_of_element_located(
                (By.XPATH, "//iframe[contains(@src, 'wpdm')]")
            )
        )
        # Switch into the iframe context
        driver.switch_to.frame(iframe)
        print("\rSwitched to the iframe containing the checkbox.", end="", flush=True)

        # Step 2: Wait for the checkbox to be clickable and then click it.
        checkbox = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//input[contains(@class, 'terms_checkbox') and @type='checkbox']")
            )
        )
        checkbox.click()
        print("\rCheckbox clicked.", end="", flush=True)

        # Step 3: Check if an email input appears. If so, fill it in and click Submit.
        try:
            email_input = WebDriverWait(driver, 10).until(
                EC.visibility_of_element_located((By.XPATH, "//input[@type='email']"))
            )
            email_input.send_keys(email_address)
            print("\rEmail entered.", end="", flush=True)

            submit_btn = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//button[contains(@id, 'wpdm_submit') and contains(text(), 'Submit')]")
                )
            )
            submit_btn.click()
            print("Submit button clicked.")
        except Exception:
            # Any failure in this step is treated as "no email required".
            print("\rEmail input not required, skipping email submission.", end="", flush=True)

        # Step 4: Wait for the final download link and retrieve its href.
        # The frame switch above is never undone, so this lookup still runs in
        # the iframe's context.
        download_link = WebDriverWait(driver, 15).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//a[contains(text(), 'Download PDF')]")
            )
        )
        # get_attribute returns None when the anchor has no href; normalise to
        # '' so callers comparing against '' do not treat None as a valid link.
        pdf_url = download_link.get_attribute("href") or ''
        print(f"\rDownload link found: {pdf_url}", end="", flush=True)
    except Exception as e:
        print("An error occurred:", e)
    finally:
        driver.quit()
    # Returned after (not inside) the finally block: a return inside finally
    # would silently discard in-flight exceptions such as KeyboardInterrupt.
    return (pdf_url, report_name)

In [6]:
def download_pdf(url, output_path):
    """Download ``url`` to ``output_path`` and log the download.

    On success two log files are appended to:
    ``../sr_data/SustainabilityReports.txt`` receives an
    ``Industry: ..., Company: ...`` line built from the third and fourth
    ``/``-separated components of ``output_path``, and
    ``../sr_data/pdf_reports_urls.txt`` receives ``url``.

    Args:
        url: Direct URL of the PDF.
        output_path: Destination file path, using ``/`` separators. Missing
            parent directories are created.

    Returns:
        bool: True when the file was saved and logged, False when the download
        or any file operation failed (the error is printed).
    """
    try:
        path_parts = output_path.split("/")
        if not path_parts[-1]:
            # Path names a directory, not a file: fail before hitting the network.
            print(f"Error saving the PDF: no file name in output path {output_path!r}")
            return False
        output_dir = os.path.dirname(output_path)
        if output_dir:
            # exist_ok avoids the race between checking for and creating the directory.
            os.makedirs(output_dir, exist_ok=True)
        # Without a timeout requests can block forever on a stalled server.
        response = requests.get(url, timeout=60)
        response.raise_for_status()
        with open(output_path, 'wb') as pdf_file:
            pdf_file.write(response.content)
        # Computed outside the f-string: reusing the enclosing quote character
        # inside an f-string expression is a SyntaxError before Python 3.12.
        # Short paths yield empty fields instead of an uncaught IndexError.
        industry = path_parts[2] if len(path_parts) > 2 else ''
        company = path_parts[3] if len(path_parts) > 3 else ''
        with open("../sr_data/SustainabilityReports.txt", 'a') as file:
            file.write(f"Industry: {industry}, Company: {company}\n")
        with open("../sr_data/pdf_reports_urls.txt", 'a') as file:
            file.write(url+'\n')
        return True
    except requests.RequestException as e:
        print(f"Error downloading the PDF: {e}")
        return False
    except IOError as e:
        print(f"Error saving the PDF: {e}")
        return False

In [None]:
# Download the report of every collected company. companies[i] pairs with
# download_dests[i]. retrieve_url works on the global `driver` and quits it, so
# a new headless Chrome instance is started for each company.
email_address = 'example@email.com'
for idx in range(len(companies)):
    company_url = companies[idx]
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument("--headless")
    driver = webdriver.Chrome(service=ChromeService(), options=chrome_options)
    pdf_url, report_name = retrieve_url(company_url, email_address)
    link_found = pdf_url != ''
    if link_found:
        target_path = download_dests[idx] + "/" + report_name
        download_pdf(pdf_url, target_path)
    print(f"\rProcessed {idx+1}/{len(companies)} reports.", end="", flush=True)

: 