In [3]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import pandas as pd

# Module-level result tables; fetch_faq_and_categories() rebinds all three.
# faq_pd keeps the category column, the other two start as plain Q/A tables.
faq_pd = pd.DataFrame(columns=['Categories', 'Question', 'Answer'])
rm_faq_pd, one_fqa_pd = (
    pd.DataFrame(columns=['Question', 'Answer']) for _ in range(2)
)

def clean_text(text):
    """Tidy spacing artefacts in a piece of FAQ text.

    A fixed list of literal substring replacements is applied one after the
    other (order matters, later steps see the result of earlier ones).

    Args:
        text: The string to clean.

    Returns:
        The string after all replacements have been applied.
    """
    substitutions = (
        ('- ', ''),    # drop every "- " sequence
        ('， ', '，'),  # no space after a full-width comma
        ('  ', ' '),   # collapse a double space into one
        (' ‌  ', ' '),  # odd spacing run, literal kept verbatim (appears to hold a non-printing character)
        ('  ', ' '),   # collapse double spaces left over from the previous steps
        ('約 定 跨 行 轉 帳 限 額', '約定跨行轉帳限額'),  # re-join the spaced-out phrase
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text

def _resolve_category_names(filter_attr, categories):
    """Map a ``data-item-filter`` attribute value to category display names.

    Args:
        filter_attr: Comma-separated category ids (with or without a space
            after each comma).
        categories: Mapping of category id -> label text.

    Returns:
        Display names in first-seen order, without duplicates and without the
        catch-all '不限' entry. Ids missing from ``categories`` are reported
        as 'Unknown Category'.
    """
    names = []
    for raw_id in filter_attr.split(','):
        name = categories.get(raw_id.strip(), 'Unknown Category').strip()
        if name != '不限' and name not in names:
            names.append(name)
    return names


def fetch_faq_and_categories(url='https://www.cathaybk.com.tw/cathaybk/personal/contact/help/faq/', timeout=30):
    """Scrape the FAQ page and rebuild the module-level result tables.

    Rebinds three globals:
      * ``faq_pd``     - columns Categories / Question / Answer, one row per FAQ item.
      * ``rm_faq_pd``  - ``faq_pd`` without Categories, columns renamed to input / output.
      * ``one_fqa_pd`` - the first row of ``rm_faq_pd`` (empty if nothing was scraped).

    Args:
        url: Page to download; defaults to the FAQ page used by this notebook.
        timeout: Seconds to wait for the HTTP request before giving up.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeout.
    """
    global faq_pd, rm_faq_pd, one_fqa_pd

    # Without a timeout a stalled connection would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # fail early on a non-success status

    soup = BeautifulSoup(response.text, 'html.parser')

    # Collect category id -> label text from the filter checkboxes.
    categories = {}
    for checkbox in soup.find_all(class_='cubre-a-checkbox__input'):
        category_id = checkbox.get('id')
        if not category_id:
            continue
        label = soup.find('label', {'for': category_id})
        if label:
            categories[category_id] = label.text.strip()

    # Build plain records first and create the DataFrame once at the end;
    # concatenating a one-row frame per item is quadratic.
    records = []
    for item in tqdm(soup.find_all(class_='cubre-m-collapse__item')):
        title_node = item.find(class_='cubre-m-collapse__title')
        content_node = item.find(class_='cubre-o-textContent')
        if title_node is None or content_node is None:
            # Not a complete question/answer entry; skip instead of crashing.
            continue

        # Use every id listed in the attribute, not only the last group.
        category_names = _resolve_category_names(item.get('data-item-filter', ''), categories)

        # Drop blank lines, then glue the remaining lines together without
        # any separator (the answer ends up on a single line).
        # NOTE(review): clean_text() is defined in this file but is not
        # applied here - confirm whether answers are meant to be cleaned.
        raw_answer = content_node.text.strip()
        answer = ''.join(line for line in raw_answer.split('\n') if line.strip())

        records.append({
            'Categories': ', '.join(category_names),
            'Question': title_node.text.strip(),
            'Answer': answer,
        })

    faq_pd = pd.DataFrame(records, columns=['Categories', 'Question', 'Answer'])

    # Same data without the category column, renamed to input / output.
    rm_faq_pd = faq_pd.drop(columns=['Categories']).rename(
        columns={'Question': 'input', 'Answer': 'output'}
    )

    # Single-example table: just the first question/answer pair.
    one_fqa_pd = rm_faq_pd.iloc[0:1]

def export_to_json(df, file_path, force_ascii=True):
    """Write ``df`` to ``file_path`` as JSON Lines (one record object per line).

    Args:
        df: DataFrame to export.
        file_path: Destination path.
        force_ascii: Forwarded to ``DataFrame.to_json``. With the default
            ``True`` non-ASCII characters are written as ``\\uXXXX`` escapes
            (the previous behaviour); pass ``False`` to keep text such as
            Chinese readable in the output file.
    """
    df.to_json(file_path, orient='records', lines=True, force_ascii=force_ascii)


    
if __name__ == "__main__":
    # Scrape first: the call rebinds the three module-level tables.
    fetch_faq_and_categories()

    # The tuple is built after the fetch, so it sees the freshly bound frames.
    for frame, target in (
        (faq_pd, 'faq_data.json'),
        (rm_faq_pd, 'faq_data_rm.json'),
        (one_fqa_pd, 'faq_data_one.json'),
    ):
        export_to_json(frame, target)


634it [00:00, 3108.42it/s]


In [19]:
# First 20 FAQ entries whose category text contains "貸", without the
# category column. Built as one chain that returns a new frame: dropping with
# inplace=True on a frame derived from query()/head() can trigger pandas'
# SettingWithCopyWarning and makes the cell depend on hidden state.
lona_df = (
    faq_pd
    .query('Categories.str.contains("貸")')
    .head(20)
    .drop(columns=['Categories'])
)
lona_df.to_csv('lona_data.csv', index=False)

In [5]:
# Randomly select up to 30 question/answer pairs and save them as CSV.
# The sample size is capped at the number of available rows (sample() raises
# when n exceeds it) and the seed is fixed so a re-run writes the same file.
rm_faq_pd.sample(n=min(30, len(rm_faq_pd)), random_state=42).to_csv('faq_data_rm.csv', index=False)

Unnamed: 0,Question,Answer
0,子帳戶的申請資格是什麼？,加開子帳戶方式如下：具備臺幣活期儲蓄存款、綜合活期儲蓄存款、國泰世華數位存款帳戶(3-1類、...
1,子帳戶可透過什麼管道申請？,開立子帳戶方式如下：登入網路銀行App，點選【臺幣總額】> 選擇母帳戶右方更多功能 >【新增...
2,為何我無法與他人共用手機門號/電子郵件？,本行會透過您留存於本行的手機門號/Email，通知您與本行往來之所有個人帳務資訊與個人權益。...
3,我已申請非約定轉帳服務，為何還是無法於CUBE App轉帳？,CUBE App「非約定轉帳」功能，需搭配「人臉辨識」或「交易認證碼」(原隨身密碼)方可使...
4,子帳戶是什麼？,子帳戶的介紹如下：為提供客戶更完善的交易服務，於網路銀行App新增「子帳戶」功能，提供客戶分...
...,...,...
629,如何於集保e手掌握APP內取消與銀行的帳戶連結?,我的資產>銀行資產>查看/管理 銀行資訊>銀行帳戶>設定>取消銀行連結進入「查看/管理 銀行...
630,如何於國泰世華CUBE App內終止於集保e手掌握的帳戶連結授權?,需登入本行CUBE App後，至設定>授權驗證管理> 臺灣集中保管結算所> 取消授權，確認後...
631,何時會收到通知信?,每次進行連結帳戶的認證授權或終止授權，都會收到本行發出的即時通知信。每個月2號本行會寄送彙整...
632,集保e手掌握的客服專線或email信箱,(1) 集保e手掌握客服專線(週一至週五08:30〜17:30)：(02)2719-5805...
