# In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
import time




# In [9]:

# Scrape the detail page behind a single submission link
def get_submission_data(driver, link):
    """Open ``link`` in a new browser window and extract the submission form fields.

    The page is loaded in a freshly opened window so that the window the
    driver was focused on when this function was called keeps its state.
    The new window is always closed and focus is always restored to the
    calling window, even when loading the page fails.

    Args:
        driver: Selenium WebDriver instance to drive.
        link: URL of the submission detail page.

    Returns:
        dict mapping each form label to the text of the first
        ``div.slds-form-element__static`` element that follows the label's
        ``span``, or ``None`` when the label or its value element is not
        present in the page.

    Raises:
        RuntimeError: if the browser did not open a new window.
    """
    # Labels to look up on the detail page; each label is also the key under
    # which its value is returned.
    # 'Submission ID' is intentionally not extracted here.
    field_names = (
        'Full Organization Legal Name',
        'Third Party',
        'Third Party Organization Name',
        'Exclusion Product Description',
        'Is this product subject to an antidumping (AD) or countervailing duty (CVD) order issued by the U.S. Department of Commerce?',
        'Does your business meet the size standards for a small business as established by the Small Business Administration?',
        'Please report the number of employees your business employs in the United States.',
        'Do you support reinstating the exclusion?',
        'Please explain your rationale.',
        'Are you a domestic producer of the products covered by this exclusion?',
        'Please explain whether the products covered by the exclusion, or comparable products, are available from sources in the United States?',
        'Please explain whether the products covered by the exclusion, or comparable products, are available from sources in third countries?',
    )

    # Remember the calling window and identify the new one by comparing the
    # handle lists, rather than relying on fixed positions in window_handles.
    original_window = driver.current_window_handle
    handles_before = set(driver.window_handles)
    driver.execute_script("window.open();")
    new_handles = [h for h in driver.window_handles if h not in handles_before]
    if not new_handles:
        raise RuntimeError(f"Could not open a new browser window for {link}")
    driver.switch_to.window(new_handles[0])

    try:
        driver.get(link)

        # Fixed delay to give the page time to render its content
        time.sleep(3)

        page_source = driver.page_source
    finally:
        # Close the detail window and return to the calling window
        driver.close()
        driver.switch_to.window(original_window)

    # Parse the captured page source with Beautiful Soup
    soup = BeautifulSoup(page_source, 'html.parser')

    # Helper that extracts one field value and tolerates missing fields
    def extract_field(field_name):
        label = soup.find('span', string=field_name)
        if label is None:
            return None
        value = label.find_next('div', class_='slds-form-element__static')
        return value.text if value is not None else None

    return {name: extract_field(name) for name in field_names}

# Scrape the first-level (listing) pages and store the data
def scrape_ustr_comments(url, output_path='ustr_comments_data_2021-0019.xlsx'):
    """Scrape every page of the listing at ``url`` and export the rows to Excel.

    For each table row the listing cells are read, the row's link is followed
    with ``get_submission_data`` to collect the detail-page fields, and both
    are merged into a single record. Pagination continues until the "Next"
    button is missing or disabled.

    Args:
        url: URL of the listing page to start from.
        output_path: Path of the Excel file to write. Defaults to the
            original hard-coded file name.

    Returns:
        None. The collected data is written to ``output_path``.
    """
    # Output columns, in the order they appear in the spreadsheet
    columns = [
        'Submission ID',
        'Organization Name',
        'Published Exclusion Number',
        'Exclusion Product Description',
        'Date Posted',
        'Link',
        'Full Organization Legal Name',
        'Third Party',
        'Third Party Organization Name',
        'Is this product subject to an antidumping (AD) or countervailing duty (CVD) order issued by the U.S. Department of Commerce?',
        'Does your business meet the size standards for a small business as established by the Small Business Administration?',
        'Please report the number of employees your business employs in the United States.',
        'Do you support reinstating the exclusion?',
        'Please explain your rationale.',
        'Are you a domestic producer of the products covered by this exclusion?',
        'Please explain whether the products covered by the exclusion, or comparable products, are available from sources in the United States?',
        'Please explain whether the products covered by the exclusion, or comparable products, are available from sources in third countries?',
    ]

    # One dict per listing row. Building whole records (instead of appending
    # to per-column lists) guarantees every column has exactly one value per
    # row, so no length padding is needed afterwards.
    records = []

    driver = webdriver.Chrome()
    try:
        driver.get(url)

        while True:
            # Fixed delay to give the listing page time to render
            time.sleep(4)
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            table = soup.find('table')
            tbody = table.find('tbody') if table is not None else None
            if tbody is not None:
                for row in tbody.find_all('tr'):
                    anchor = row.find('a', href=True)
                    cells = row.find_all('td')
                    # Skip rows that lack the link or the four expected cells
                    if anchor is None or len(cells) < 4:
                        continue

                    link = f"https://comments.ustr.gov{anchor['href']}"
                    record = {
                        'Submission ID': anchor.text,
                        'Organization Name': cells[0].text,
                        'Published Exclusion Number': cells[1].text,
                        'Exclusion Product Description': cells[2].text,
                        'Date Posted': cells[3].text,
                        'Link': link,
                    }

                    # Merge the detail-page fields. 'Exclusion Product
                    # Description' comes from both sources: the detail-page
                    # value wins when present, otherwise the listing value is
                    # kept. Either way the column gets one value per row.
                    for field, value in get_submission_data(driver, link).items():
                        if value is not None or field not in record:
                            record[field] = value

                    records.append(record)

            # Pagination: stop when there is no usable "Next" button
            next_buttons = driver.find_elements(
                "xpath",
                "//button[contains(@class, 'slds-button_neutral') and contains(text(), 'Next')]",
            )
            if not next_buttons:
                break
            next_button = next_buttons[0]

            if ("aria-disabled=\"true\"" in next_button.get_attribute("outerHTML")
                    or not next_button.is_enabled()):
                break

            next_button.click()
    finally:
        # Always release the browser, even if scraping failed part-way
        driver.quit()

    # Build the data frame with a fixed column order and export it
    df = pd.DataFrame(records, columns=columns)
    df.to_excel(output_path, index=False)

if __name__ == "__main__":
    # Script entry point: scrape the listing for docket USTR-2021-0019.
    url = 'https://comments.ustr.gov/s/docket?docketNumber=USTR-2021-0019'
    scrape_ustr_comments(url)
