In [62]:
import undetected_chromedriver as uc
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time

In [63]:
# Set up Chrome options
chrome_options = Options()

# Start undetected_chromedriver
browser = uc.Chrome(options=chrome_options)

# Open target webpage
url = "https://www.creditchina.gov.cn/xinyongxinxixiangqing/xyDetail.html?searchState=1&entityType=1&keyword=%E4%B8%87%E5%8D%8E%E5%8C%96%E5%AD%A6%E9%9B%86%E5%9B%A2%E8%82%A1%E4%BB%BD%E6%9C%89%E9%99%90%E5%85%AC%E5%8F%B8"
browser.get(url)

In [49]:
# Wait for the table to load on the page
try:
    element_present = EC.presence_of_element_located((By.CLASS_NAME, 'result-table'))
    WebDriverWait(browser, 30).until(element_present)
except Exception as e:
    print(f"Timeout or error waiting for page to load: {e}")

# Create an empty DataFrame to store all data
all_data_df = pd.DataFrame(columns=[
    "Company_name", "authority", "start_year", "start_date", "end_year", "end_date", "permit_type", "permit_certificate", "permit_decision_name"
])

# Function to extract table data from the current page
def extract_table_data():
    page_data = []  # Store data from the current page
    tables = browser.find_elements(By.CLASS_NAME, 'result-table')
    for table in tables:
        rows = table.find_elements(By.TAG_NAME, 'tr')
        row_data_dict = {}
        for row in rows:
            cols = row.find_elements(By.TAG_NAME, 'td')
            if len(cols) == 2:  # Ensure data completeness
                key = cols[0].text.strip()
                value = cols[1].text.strip()
                row_data_dict[key] = value
        if len(row_data_dict) > 0:
            company_name = "万华化学集团股份有限公司"
            authority = row_data_dict.get('许可机关', '')
            start_year = row_data_dict.get('有效期自', '')[:4]
            start_date = row_data_dict.get('有效期自', '')
            end_year = row_data_dict.get('有效期至', '')[:4]
            end_date = row_data_dict.get('有效期至', '')
            permit_type = row_data_dict.get('许可类别', '')
            permit_certificate = row_data_dict.get('许可证书名称', '')
            permit_decision_name = row_data_dict.get('行政许可决定文书名称', '')
            
            page_data.append({
                "Company_name": company_name,
                "authority": authority,
                "start_year": start_year,
                "start_date": start_date,
                "end_year": end_year,
                "end_date": end_date,
                "permit_type": permit_type,
                "permit_certificate": permit_certificate,
                "permit_decision_name": permit_decision_name
            })
    print(f"Extracted data: {page_data}")
    return page_data  # Return data from the current page

# Extract data from the first page
page_data = extract_table_data()
all_data_df = all_data_df.append(page_data, ignore_index=True)

# Loop to click "Next Page" button and extract data
while True:
    try:
       # 找到“下一页”按钮并滚动到该元素
        next_button = WebDriverWait(browser, 30).until(
            EC.element_to_be_clickable((By.LINK_TEXT, '下一页 >'))
        )
        
        current_table_data = browser.find_element(By.CLASS_NAME, 'result-table').text

        # 使用JavaScript滚动到“下一页”按钮
        browser.execute_script("arguments[0].scrollIntoView();", next_button)
        
        # 每次点击前等待20秒
        time.sleep(5)
        
        next_button.click()
        
        # 等待表格内容变化
        WebDriverWait(browser, 20).until(
            lambda browser: browser.find_element(By.CLASS_NAME, 'result-table').text != current_table_data
        )

        # 等待新的表格加载
        WebDriverWait(browser, 20).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'result-table'))
        )

        # 提取当前页面的数据
        page_data = extract_table_data()
        all_data_df = all_data_df.append(page_data, ignore_index=True)

    except Exception as e:
        print(f"No more pages or error: {e}")
        break

# Save all extracted data to a CSV file
csv_filename = 'extracted_data.csv'
all_data_df.to_csv(csv_filename, index=False, encoding='utf-8-sig')

# Print all extracted data
for table_index, table_data in all_data_df.iterrows():
    print(f"Table {table_index + 1}:")
    print(table_data.to_dict())
    print("\n")

# Close the browser
# browser.quit()

Extracted data: [{'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-28', 'end_year': '2024', 'end_date': '2024-08-28', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建筑工程施工许可证'}, {'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-24', 'end_year': '2025', 'end_date': '2025-05-31', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建设工程规划许可证核发'}, {'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-24', 'end_year': '2024', 'end_date': '2024-12-31', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建设工程规划许可证核发'}, {'Company_name': '万华化学集团股份有限公司', 'authority': '烟台市住房和城乡建设局', 'start_year': '2024', 'start_date': '2024-05-21', 'end_year': '2099', 'end_date': '2099-12-31', 'permit_type': '普通', 'permit_certificate': '特殊建设工程消防验收意见书', 'permit_deci

  all_data_df = all_data_df.append(page_data, ignore_index=True)


No more pages or error: Message: 

Table 1:
{'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-28', 'end_year': '2024', 'end_date': '2024-08-28', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建筑工程施工许可证'}


Table 2:
{'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-24', 'end_year': '2025', 'end_date': '2025-05-31', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建设工程规划许可证核发'}


Table 3:
{'Company_name': '万华化学集团股份有限公司', 'authority': '烟台经济技术开发区行政审批服务局', 'start_year': '2024', 'start_date': '2024-05-24', 'end_year': '2024', 'end_date': '2024-12-31', 'permit_type': '普通', 'permit_certificate': '— —', 'permit_decision_name': '建设工程规划许可证核发'}


Table 4:
{'Company_name': '万华化学集团股份有限公司', 'authority': '烟台市住房和城乡建设局', 'start_year': '2024', 'start_date': '2024-05-21', 'end_year': '2099', 'end_date': '2099-12-31', 'permit_type': 