# Part1: Scraping to obtain all Supply Chain Finance data
 <span style="color:red;">**Note: Since the work is divided into scraping and subsequent manual processing, the project code is split into five files. This is the first code file.**</span>
## 1. Packages

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
import pandas as pd
import time

## 2. Define functions
### 2.1 Login function
 <span style="color:red;">**Note: This is a trial account, valid for seven days, until July 4, 2024. If the account expires, you can register a new one.**</span>

In [2]:
def login_to_jianwei(username="18379771599", password="Try123456"):
    """Start Chrome, log in to jianweidata.com and return the live driver.

    Args:
        username: Account name typed into the login form. Defaults to the
            trial account that was previously hard-coded here.
        password: Password typed into the login form.

    Returns:
        The ``webdriver.Chrome`` instance, logged in and with the "跳过"
        (skip) overlay dismissed. The caller owns it and must call
        ``driver.quit()`` when done.

    Raises:
        Whatever Selenium raises when a wait times out or an element is
        missing. The browser is closed before the exception propagates.
    """
    # NOTE(review): credentials are stored in source as defaults; prefer
    # passing them in explicitly instead of committing real accounts.
    url = "https://www.jianweidata.com/Index"

    # Initialise the WebDriver
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Click the login entry to open the login dialog
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ".login-info"))).click()

        # Fill in username and password, then submit
        wait.until(EC.visibility_of_element_located((By.ID, "username"))).send_keys(username)
        driver.find_element(By.ID, "password").send_keys(password)
        driver.find_element(By.CSS_SELECTOR, "button.btn.btn-default.pull-left").click()

        # Wait until the logged-in user header is visible
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, ".user-info.usual-header-link")))

        # Dismiss the "跳过" (skip) button; click via JS so an overlapping
        # element cannot intercept the click
        skip_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '跳过')]")))
        driver.execute_script("arguments[0].click();", skip_button)
    except Exception:
        # The driver is not handed to the caller on failure, so close the
        # browser here instead of leaking the Chrome process.
        driver.quit()
        raise

    return driver

### 2.2 Manual page navigation


In [3]:
def manual_navigation_to_page(driver):
    """Block until the operator presses Enter in the console.

    The prompt asks the user to page the browser manually to the desired
    results page first. ``driver`` is accepted but not used by this function.
    """
    prompt = "请在浏览器中手动翻页到需要的页面，然后回到这里按回车继续..."
    input(prompt)

### 2.3 Regex to match illegal characters

In [4]:
import re

# ASCII control characters 0x00-0x08, 0x0B-0x0C and 0x0E-0x1F; tab, newline
# and carriage return are deliberately left alone. Compiled once at import
# time rather than on every call.
_ILLEGAL_CHARS_PATTERN = re.compile(r'[\000-\010]|[\013-\014]|[\016-\037]')


def remove_illegal_chars(text):
    """Return ``text`` with illegal control characters stripped.

    Args:
        text: The cell value to clean.

    Returns:
        The cleaned string. Non-string values (e.g. ``None`` or NaN from an
        empty cell) are returned unchanged instead of raising ``TypeError``,
        so the function is safe to ``apply`` to a whole DataFrame column.
    """
    if not isinstance(text, str):
        return text
    return _ILLEGAL_CHARS_PATTERN.sub('', text)

### 2.4 Scraping
 <span style="color:red;">**Note: Chromedriver is required to drive the browser!**</span>

In [5]:
def _extract_item_fields(item):
    """Read the displayed fields of one ``.info-item`` element into a dict."""
    stock_code = item.find_element(By.CSS_SELECTOR, ".stock").text.strip()
    title = item.find_element(By.CSS_SELECTOR, ".info-title").text.strip()
    date = item.find_element(By.CSS_SELECTOR, ".info-time").text.strip()
    summary_elements = item.find_elements(By.CSS_SELECTOR, ".info-word .para")
    # An item may have several summary paragraphs; join them into one cell
    summaries = " | ".join(summary.text.strip() for summary in summary_elements)

    return {
        "股票代码": stock_code,
        "标题": title,
        "发布时间": date,
        "摘要内容": summaries,
    }


def search_and_extract_data(driver):
    """Search for "供应链金融" and scrape every result on the current page.

    The function opens the search page, dismisses the "跳过" (skip) overlay,
    submits the query, then pauses so the operator can page manually to the
    desired results page. After that it expands each result and reads its
    fields.

    Args:
        driver: A logged-in Selenium WebDriver.

    Returns:
        A list of dicts, one per result, with the keys "股票代码", "标题",
        "发布时间" and "摘要内容".
    """
    driver.get("https://www.jianweidata.com/IndexSearch")

    # Dismiss the "跳过" overlay with a JS click, as the login step does, so
    # an overlapping element cannot intercept the click
    skip_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//span[contains(text(), '跳过')]")))
    driver.execute_script("arguments[0].click();", skip_button)

    search_input = WebDriverWait(driver, 10).until(
        EC.visibility_of_element_located((By.CSS_SELECTOR, ".main-sreach-input")))
    search_input.send_keys("供应链金融")
    driver.find_element(By.CSS_SELECTOR, ".sreach-button").click()

    # Make sure the search results have loaded
    WebDriverWait(driver, 10).until(
        EC.visibility_of_all_elements_located((By.CSS_SELECTOR, ".info-item")))

    # Let the operator page manually to the target page
    manual_navigation_to_page(driver)

    # Re-query the items: the page content changed during manual navigation
    items = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".info-item")))

    collected_data = []
    for index, item in enumerate(items):
        # The first item is not clicked; every other item is expanded first
        if index > 0:
            expand_icon = item.find_element(By.CSS_SELECTOR, ".icon-drop")
            driver.execute_script("arguments[0].click();", expand_icon)
            time.sleep(1)  # wait for the content to expand

        collected_data.append(_extract_item_fields(item))

    print(f"Processed {len(collected_data)} items.")
    return collected_data


## 3. Run functions
<span style="color:red; font-weight:bold;">
Note: 
    
1. Because the website has anti-scraping mechanisms (a refresh reloads all data, and page links do not refresh automatically), some steps must be done by hand: first set “100 items per page” at the bottom of the results, then navigate manually to the page you want to scrape, and finally return to the notebook and press Enter to continue.
    
2. Chromedriver is required to ensure automated Chrome operation.
    
</span>

In [7]:
# Log in and scrape. The browser is closed even if scraping fails, so a
# failed run does not leave a Chrome process behind.
driver = login_to_jianwei()
try:
    data = search_and_extract_data(driver)
finally:
    driver.quit()

# Build the DataFrame for the newly scraped page
new_data = pd.DataFrame(data)

# Strip illegal control characters from the free-text columns
columns_to_clean = ['摘要内容', '标题']
for column in columns_to_clean:
    new_data[column] = new_data[column].apply(remove_illegal_chars)

# Output workbook; each run appends one page of results to it
file_path = "Extracted_Data.xlsx"

# Load previously saved rows if the workbook exists. The stock-code column is
# read as text so numeric-looking codes are not re-inferred as numbers
# (which would drop leading zeros and mix dtypes with the new string rows).
try:
    old_data = pd.read_excel(file_path, dtype={"股票代码": str})
except FileNotFoundError:
    combined_data = new_data
else:
    combined_data = pd.concat([old_data, new_data], ignore_index=True)

# Write everything back with ExcelWriter, using the openpyxl engine
with pd.ExcelWriter(file_path, engine='openpyxl') as writer:
    combined_data.to_excel(writer, index=False, sheet_name='Sheet1')

print("Collected data has been appended to the Excel file.")

请在浏览器中手动翻页到需要的页面，然后回到这里按回车继续...
Processed 100 items.
Collected data has been appended to the Excel file.
