In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
import time
import pandas as pd
import os
import random
import jieba
import re
from rank_bm25 import BM25Okapi
import requests
import pandas as pd
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Inches, RGBColor
from PIL import Image
import os
from io import BytesIO

### Parameter Setting

In [11]:
# Query source and output locations
student_id = "M11207408"
query_path = "./queries.txt"          # text file with one search query per line
results_search = "./results_search"   # raw scrape results, one "<query>.csv" per query
results_sorted = "./results_sorted"   # BM25-matched results, one CSV per query

# Search URL prefixes; the query text is appended directly to each
search_url_m = "https://www.momoshop.com.tw/search/searchShop.jsp?keyword="
search_url_p = "https://24h.pchome.com.tw/search/?q="

# Pause lengths (in seconds) passed to time.sleep between browser actions
short_time_sleep = 1
medium_time_sleep = 3
long_time_sleep = 5

### Helper Functions

In [12]:
# Read queries from file
def read_queries(file_path):
    """Load search queries from a UTF-8 text file, one query per line.

    Leading/trailing whitespace is stripped from every line. Blank lines are
    dropped so that a stray empty line never turns into an empty query (an
    empty query would otherwise be used as a search keyword and as a CSV
    file name by the cells that iterate over the result).

    :param file_path: path of the query file
    :return: list of non-empty query strings, in file order
    :raises OSError: if the file cannot be opened
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [line for line in stripped if line]

# Check if the webpage is accessible
def check_access(driver):
    """Report whether the currently loaded page is usable.

    The page counts as blocked only when its top-level ``<h1>`` (looked up at
    ``/html/body/h1``) reads exactly ``'Access Denied'``. If that element
    cannot be located, or the lookup fails for any ordinary reason, the page
    is treated as accessible.

    :param driver: Selenium WebDriver holding the page to inspect
    :return: ``False`` when the 'Access Denied' heading is present, else ``True``
    """
    try:
        heading = driver.find_element(By.XPATH, '/html/body/h1')
        return heading.text != 'Access Denied'
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still stop the notebook.
        return True

# Get all items from pages (momo)
def get_all_items_m(driver, max_items=100, max_page_turns=10):
    """Collect product rows from the momo search result pages.

    Repeatedly extracts the ``goodsUrl`` cards of the current page with
    ``extract_item_info_m`` and clicks the "next page" button until enough
    rows were gathered, no next button is available, a click fails, or more
    than ``max_page_turns`` page turns were made.

    A DataFrame is always returned (possibly empty and without columns), so
    callers can chain DataFrame methods without checking for ``None``.

    :param driver: Selenium WebDriver already showing a momo search result
    :param max_items: number of rows to collect before stopping; also the
        maximum number of rows returned
    :param max_page_turns: paging stops once more than this many page turns
        have been performed
    :return: DataFrame de-duplicated on ``momo_name`` with at most
        ``max_items`` rows
    """
    page_frames = []
    try:
        collected = 0
        page_turns = 0
        while collected < max_items:
            cards = driver.find_elements(By.XPATH, "//div[@class = 'goodsUrl']")
            page_df = extract_item_info_m(cards)
            page_frames.append(page_df)
            collected += len(page_df)  # counted before de-duplication

            if collected >= max_items:  # enough rows gathered, stop early
                break

            # Paging
            try:
                next_buttons = driver.find_elements(By.XPATH, "//div[@class = 'page-btn page-next']")
                # The second match is the one that gets clicked (presumably the
                # page renders two pagers - TODO confirm); with fewer than two
                # matches there is nothing to click.
                if len(next_buttons) > 1:
                    next_buttons[1].click()
                    time.sleep(long_time_sleep)
                    page_turns += 1
                else:
                    print("No more pages to click.")
                    break
            except Exception as e:
                print(f"Click fail: {e}")
                break

            if page_turns > max_page_turns:  # guard against endless paging
                break
    except Exception as e:
        # Keep whatever was collected so far instead of returning None.
        print(f"No items found: {e}")

    # Concatenate once at the end; skip empty pages because they carry no
    # 'momo_name' column to de-duplicate on.
    non_empty = [frame for frame in page_frames if not frame.empty]
    if not non_empty:
        return pd.DataFrame()
    items = pd.concat(non_empty, ignore_index=True)
    items = items.drop_duplicates(subset=['momo_name'])
    return items.head(max_items)


# Extract all item information into a DataFrame
def extract_item_info_m(items):
    """Turn momo product card elements into a DataFrame.

    For each card the name, link, price and picture URL are read through
    XPaths relative to the card. A card for which any of these lookups fails
    is reported on stdout and skipped.

    :param items: iterable of Selenium elements, one per product card
    :return: DataFrame with the columns ``momo_name``, ``momo_price``,
        ``momo_url`` and ``momo_pic`` (no columns at all when nothing could
        be extracted)
    """
    print("Extracting item information...")
    data = []
    for i, item in enumerate(items):
        try:
            item_name = item.find_element(By.XPATH, 'div[2]/h3').text
            item_url = item.find_element(By.XPATH, 'div[1]/div/div/div/a').get_attribute('href')
            item_price = item.find_element(By.XPATH, 'div[3]/div[2]/span/b').text
            item_pic_url = item.find_element(By.XPATH, "div[1]/div/div[1]/div[1]/a/picture/img").get_attribute('src')
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts a long scrape.
            print(f"Error extracting item {i}.")
            continue

        data.append({
            'momo_name': item_name,
            'momo_price': item_price,
            'momo_url': item_url,
            'momo_pic': item_pic_url
        })

    return pd.DataFrame(data)

def scroll_p(driver, wait_seconds=3):
    """Scroll the page to the bottom and report whether it grew.

    The document height is read, the window is scrolled to the bottom, the
    function waits ``wait_seconds`` so that lazily loaded content can appear,
    and the height is read again.

    :param driver: Selenium WebDriver showing the page to scroll
    :param wait_seconds: pause (seconds) between scrolling and re-measuring
    :return: ``True`` if the document height changed, ``False`` otherwise
    """
    height_before = driver.execute_script("return document.body.scrollHeight")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(wait_seconds)
    height_after = driver.execute_script("return document.body.scrollHeight")
    return height_after != height_before

# Get all items from pages (PChome)
def get_all_items_p(driver, max_items=100):
    """Collect product rows from the PChome search result page.

    The result list is re-read after every scroll (``scroll_p``) until
    ``max_items`` distinct product names were gathered or scrolling no longer
    changes the page height. Rows are de-duplicated on the product name.

    A DataFrame is always returned; if an unexpected error interrupts the
    collection, the rows gathered up to that point are returned instead of
    ``None``.

    :param driver: Selenium WebDriver already showing a PChome search result
    :param max_items: maximum number of rows to collect and return
    :return: DataFrame with the columns ``PChome_name``, ``PChome_price``,
        ``PChome_url`` and ``PChome_pic`` (no columns when nothing was found)
    """
    item_xpath = "/html/body/div[1]/div[2]/div[1]/div[2]/div[1]/div[2]/dl"
    data = []
    seen_names = set()

    try:
        while len(data) < max_items:
            # Re-read the list each round: scrolling may have added entries.
            for item in driver.find_elements(By.XPATH, item_xpath):
                try:
                    title_link = item.find_element(By.XPATH, 'dd[2]/h5/a')
                    item_name = title_link.text
                    item_url = title_link.get_attribute('href')
                    item_price = item.find_element(By.XPATH, 'dd[3]/ul[1]/li/span/span').text  # on sale.
                except Exception:
                    print("Error extracting item.")
                    continue

                try:  # pictures cannot be read on R18 (age-restricted) listings
                    item_pic_url = item.find_element(By.XPATH, 'dd[1]/a/img').get_attribute('src')
                except Exception:
                    item_pic_url = "no picture"

                if item_name in seen_names:
                    continue
                seen_names.add(item_name)
                data.append({
                    'PChome_name': item_name,
                    'PChome_price': item_price,
                    'PChome_url': item_url,
                    'PChome_pic': item_pic_url
                })

            # Stop once enough rows were collected
            if len(data) >= max_items:
                break

            # Scroll for more; stop when the page no longer grows
            if not scroll_p(driver):
                break
    except Exception as e:
        print(f"Error: {e}")

    return pd.DataFrame(data).head(max_items)


### Start web scraping.

In [13]:
# Read the queries first so a missing query file fails before a browser is opened
queries = read_queries(query_path)
os.makedirs(results_search, exist_ok=True)  # make sure the output folder exists

# Open the webpage
driver = webdriver.Chrome()
time.sleep(short_time_sleep)

# Main scraping process
try:
    for query in queries:
        # Skip queries whose result file already exists. The check is an exact
        # file-name match: a substring test would wrongly skip e.g. "麵包"
        # whenever "麵包機.csv" is present.
        file_path = os.path.join(results_search, f"{query}.csv")
        if os.path.exists(file_path):
            print(f"Results for {query} have already been scraped. Skipping...\n")
            continue

        # Search for the query (momo)
        driver.get(search_url_m + query)
        time.sleep(medium_time_sleep)
        if check_access(driver):
            print(f"Start scraping {query}...")
        else:
            print(f"Some error occurred while scraping {query}.")
            continue
        # Process the items (momo); fall back to an empty frame if None comes back
        items_df_m = get_all_items_m(driver)
        if items_df_m is None:
            items_df_m = pd.DataFrame()

        # Search for the query (PChome)
        driver.get(search_url_p + query)
        time.sleep(medium_time_sleep)
        if check_access(driver):
            print(f"Start scraping {query}...")
        else:
            print(f"Some error occurred while scraping {query}.")
            continue
        # Process the items (PChome); fall back to an empty frame if None comes back
        items_df_p = get_all_items_p(driver)
        if items_df_p is None:
            items_df_p = pd.DataFrame()

        # Drop rows with missing values, then place both sites side by side
        items_df_m_cleaned = items_df_m.dropna().reset_index(drop=True)
        items_df_p_cleaned = items_df_p.dropna().reset_index(drop=True)
        items_df = pd.concat([items_df_m_cleaned, items_df_p_cleaned], axis=1)

        # Save the results to a CSV file (utf-8-sig so spreadsheet tools detect the encoding)
        items_df.to_csv(file_path, index=False, encoding='utf-8-sig')
        print(f"Results for {query} have been saved to {file_path}")

        # Randomised pause between queries
        print("Sleeping for a while...")
        time.sleep(long_time_sleep + random.random() * 10)
        print("-"*80)
finally:
    # Close the browser even when a query fails or the run is interrupted
    driver.quit()

Results for 運動鞋 have already been scraped. Skipping...

Results for 3_5mm 轉 usb have already been scraped. Skipping...

Results for amd_迷你主機 have already been scraped. Skipping...

Results for inktank_115 have already been scraped. Skipping...

Results for iphone 音源線 have already been scraped. Skipping...

Results for mio_行車紀錄器支架 have already been scraped. Skipping...

Results for papago_導航機 have already been scraped. Skipping...

Results for rtx 顯示卡 have already been scraped. Skipping...

Results for switch_底座 have already been scraped. Skipping...

Results for zenpower_10050 have already been scraped. Skipping...

Results for 大聖誕樹 have already been scraped. Skipping...

Results for 斗篷雨衣 have already been scraped. Skipping...

Results for 交換禮物 have already been scraped. Skipping...

Results for 行動電源 have already been scraped. Skipping...

Results for 兒童毛帽 have already been scraped. Skipping...

Results for 便攜式咖啡機 have already been scraped. Skipping...

Results for 倍潔雅 have already bee

### Functions for BM25

In [14]:
# Extract model-number-like tokens from a product name
def extract_model(name):
    """Return the ASCII runs of ``name`` joined by single spaces.

    A run is a maximal sequence of ASCII letters, digits, hyphens and dots.
    Runs that contain a space or are blank are discarded before joining.

    :param name: product name to scan
    :return: the kept runs separated by one space ('' when there are none)
    """
    candidates = re.findall(r'([A-Za-z0-9\-.]+)', name)
    return ' '.join(
        candidate for candidate in candidates
        if candidate.strip() and ' ' not in candidate
    )

# A simple tokenizer built on jieba
def jieba_tokenizer(text):
    """Split ``text`` with jieba (accurate mode) and keep the useful tokens.

    A token is kept only if it is at least two characters long and is not
    one of the punctuation / whitespace stop words listed below.

    :param text: string to tokenize
    :return: list of kept tokens, in their original order
    """
    stop_words = ['【', '】', '/', '~', '＊', '、', '（', '）', '+', '‧', ' ', '', '-', '&']
    kept = []
    for token in jieba.lcut(text, cut_all=False):
        if len(token) < 2 or token in stop_words:  # require at least two characters
            continue
        kept.append(token)
    return kept

# Read queries from file
# NOTE: a function with this same name is also defined in the helper-function
# cell near the top of the notebook; once this cell runs, this definition is
# the one in effect.
def read_queries(file_path):
    """Load search queries from a UTF-8 text file, one query per line.

    Leading/trailing whitespace is stripped from every line. Blank lines are
    dropped so that a stray empty line never turns into an empty query (an
    empty query would otherwise be used as a search keyword and as a CSV
    file name by the cells that iterate over the result).

    :param file_path: path of the query file
    :return: list of non-empty query strings, in file order
    :raises OSError: if the file cannot be opened
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        stripped = (line.strip() for line in file)
        return [line for line in stripped if line]

### BM25

In [15]:
# Match every PChome product of a query to its best-scoring momo product (BM25)
os.makedirs(results_sorted, exist_ok=True)  # make sure the output folder exists

name_columns = ['momo_name', 'PChome_name']
momo_columns = ['momo_name', 'momo_price', 'momo_url', 'momo_pic']
matched_columns = ['PChome_name', 'PChome_price', 'PChome_url', 'PChome_pic',
                   *momo_columns, 'score']

for query in queries:
    # Load the scraped products of this query
    try:
        df = pd.read_csv(f'{results_search}/{query}.csv')
    except Exception:
        # Missing or empty file: neither site returned results
        print(f"{query}.csv 為空")
        continue

    # A name column that is absent or holds no value at all means one of the
    # two sites returned nothing; there is nothing to match in that case.
    if any(col not in df.columns or df[col].isna().all() for col in name_columns):
        print(f'{query}.csv 有欄位為空')
        continue

    # Normalise names: lower-case, no hyphens, missing names become ''
    for col in name_columns:
        df[col] = (
            df[col].fillna('').astype(str)
            .str.lower()
            .str.replace('-', '', regex=False)
        )

    # Tokenize with jieba
    df['momo_tokens'] = df['momo_name'].apply(jieba_tokenizer)
    df['PChome_tokens'] = df['PChome_name'].apply(jieba_tokenizer)

    # Build the BM25 index over the momo names
    momo_tokens = df['momo_tokens'].tolist()
    bm25 = BM25Okapi(momo_tokens)
    # momo names with more than 18 tokens get their score scaled by 0.8
    penalties = [0.8 if len(tokens) > 18 else 1.0 for tokens in momo_tokens]
    momo_rows = df[momo_columns].to_dict('records')

    matched_products = []
    used_pchome_names = set()  # PChome names that already received a match

    for pc_name, pc_tokens, pc_price, pc_url, pc_pic in zip(
            df['PChome_name'], df['PChome_tokens'], df['PChome_price'],
            df['PChome_url'], df['PChome_pic']):
        if pc_name in used_pchome_names:
            continue

        scores = bm25.get_scores(pc_tokens)

        # Keep the first momo row with the highest penalised score; only
        # scores strictly greater than 1 qualify as a match.
        best_idx = None
        best_score = 1
        for i, raw_score in enumerate(scores):
            score = raw_score * penalties[i]
            if score > best_score:
                best_idx, best_score = i, score

        if best_idx is None:
            continue

        matched_products.append({
            'PChome_name': pc_name,
            'PChome_price': pc_price,
            'PChome_url': pc_url,
            'PChome_pic': pc_pic,
            **momo_rows[best_idx],
            'score': best_score
        })
        used_pchome_names.add(pc_name)

    # Explicit columns keep sort_values working when nothing matched
    result_df = (
        pd.DataFrame(matched_products, columns=matched_columns)
        .sort_values(by='score', ascending=False)
    )

    # Write the result as CSV (utf-8-sig encoding)
    result_df.to_csv(f'{results_sorted}/{query}_matched_products.csv', index=False, encoding='utf-8-sig')
    print(f"Results for {query} have been saved")


Results for 運動鞋 have been saved
Results for 3_5mm 轉 usb have been saved
Results for amd_迷你主機 have been saved
inktank_115.csv 有欄位為空
Results for iphone 音源線 have been saved
Results for mio_行車紀錄器支架 have been saved
Results for papago_導航機 have been saved
Results for rtx 顯示卡 have been saved
Results for switch_底座 have been saved
zenpower_10050.csv 為空
Results for 大聖誕樹 have been saved
Results for 斗篷雨衣 have been saved
Results for 交換禮物 have been saved
Results for 行動電源 have been saved
Results for 兒童毛帽 have been saved
Results for 便攜式咖啡機 have been saved
Results for 倍潔雅 have been saved
Results for 馬玉山紅藜麥 have been saved
Results for 莊園巧克力 have been saved
Results for 發電機 have been saved
Results for 感應垃圾桶 have been saved
Results for 電暖器 have been saved
Results for 樂高_71043 have been saved
Results for 燈串 have been saved
Results for 麵包 have been saved
Results for 冰糖燕窩 have been saved
Results for 國際牌 星光 have been saved
Results for 安撫娃娃 have been saved
Results for 微波爐 have been saved
Results for 教學無線麥克風 have

### Function for csv2word

In [16]:
def download_and_convert_image(url, save_path, timeout=10):
    """Download an image and store it on disk, converting WebP to PNG.

    WebP images are converted to RGB and saved as PNG under ``save_path``
    with its extension replaced by ``.png``. Every other format is written
    unchanged to ``save_path``. Callers should use the returned path, since
    it can differ from ``save_path``.

    :param url: address of the image
    :param save_path: where to store the downloaded file
    :param timeout: seconds to wait for the server before giving up, so that
        a stalled connection cannot block the notebook indefinitely
    :return: path of the file actually written, or ``None`` when the download
        or the conversion failed (the reason is printed)
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            print(f"Failed to download {url}, status code: {response.status_code}")
            return None

        img = Image.open(BytesIO(response.content))
        if img.format == 'WEBP':
            # Swap only the file extension; a plain str.replace would also
            # rewrite a ".jpg" occurring elsewhere in the path.
            save_path = os.path.splitext(save_path)[0] + ".png"
            img.convert("RGB").save(save_path, 'PNG')
        else:
            with open(save_path, 'wb') as out_file:
                out_file.write(response.content)
        return save_path
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        return None
    
def safe_add_image(cell, image_path, width):
    """Insert a picture into a table cell without letting a failure propagate.

    The picture is added to a new run in the cell's first paragraph. If that
    raises, the cell text is set to 'No Image' and the error is printed.

    :param cell: table cell that receives the picture
    :param image_path: path of the image file to insert
    :param width: width passed on to ``add_picture``
    """
    try:
        first_paragraph = cell.paragraphs[0]
        picture_run = first_paragraph.add_run()
        picture_run.add_picture(image_path, width=width)
    except Exception as err:
        cell.text = 'No Image'
        print(f"Failed to add image {image_path}: {err}")

# Add a hyperlink
def add_hyperlink(paragraph, url, text):
    """
    Append a clickable, blue, underlined piece of text to a paragraph.
    :param paragraph: paragraph object that receives the link
    :param url: target address of the hyperlink (string)
    :param text: text shown in the paragraph (string)
    """
    # Register the external target on the paragraph's part to obtain its rId
    relationship_id = paragraph.part.relate_to(
        url, RELATIONSHIP_TYPE.HYPERLINK, is_external=True
    )

    # <w:hyperlink r:id="..."> wrapper
    link_element = OxmlElement('w:hyperlink')
    link_element.set(qn('r:id'), relationship_id)

    # Run properties: blue colour first, then a single underline
    properties = OxmlElement('w:rPr')
    for tag, value in (('w:color', "0000FF"), ('w:u', 'single')):
        prop = OxmlElement(tag)
        prop.set(qn('w:val'), value)
        properties.append(prop)

    # Visible text
    label = OxmlElement('w:t')
    label.text = text

    # <w:r> holding the properties followed by the text
    link_run = OxmlElement('w:r')
    link_run.append(properties)
    link_run.append(label)

    # Nest the run in the hyperlink and attach the hyperlink to the paragraph
    link_element.append(link_run)
    paragraph._element.append(link_element)


### csv2word

In [17]:

image_dir = "./image"
word_dir = "./word"
os.makedirs(image_dir, exist_ok=True)  # make sure the image folder exists
os.makedirs(word_dir, exist_ok=True)   # make sure the output folder exists

# One entry per shop: (CSV column prefix, image file prefix, image file extension).
# Each shop fills three consecutive table cells: name, price, image.
sites = [
    ('PChome', 'pchome', '.jpg'),
    ('momo', 'momo', '.png'),
]

for query in queries:
    word_output_path = f'{word_dir}/{query}.docx'

    # Skip queries whose Word file already exists
    if os.path.exists(word_output_path):
        print(f"{word_output_path} already exists. Skipping...")
        continue

    # Load the matched products
    try:
        df = pd.read_csv(f'{results_sorted}/{query}_matched_products.csv')
    except FileNotFoundError:
        print(f"No {query} file found, skipping...")
        continue

    doc = Document()
    doc.add_heading('Product Comparison', 0)

    # Table with a header row
    table = doc.add_table(rows=1, cols=6)
    hdr_cells = table.rows[0].cells
    hdr_cells[0].text = 'PChome Name'
    hdr_cells[1].text = 'PChome Price'
    hdr_cells[2].text = 'PChome Image'
    hdr_cells[3].text = 'Momo Name'
    hdr_cells[4].text = 'Momo Price'
    hdr_cells[5].text = 'Momo Image'

    # One table row per matched product pair
    for idx, row in df.iterrows():
        row_cells = table.add_row().cells

        for first_cell, (col_prefix, file_prefix, extension) in zip((0, 3), sites):
            name_cell = row_cells[first_cell]
            price_cell = row_cells[first_cell + 1]
            image_cell = row_cells[first_cell + 2]

            # Product name, linked to the product page when a URL is available
            name = str(row[f'{col_prefix}_name'])
            url = row[f'{col_prefix}_url']
            if isinstance(url, str) and url:
                add_hyperlink(name_cell.paragraphs[0], url, name)
            else:
                name_cell.text = name
            price_cell.text = str(row[f'{col_prefix}_price'])

            # Download the picture and insert the file that was actually
            # written: the download helper may store it under a different
            # extension than requested (WebP is converted to PNG).
            requested_path = os.path.join(image_dir, f"{file_prefix}_image_{idx}{extension}")
            saved_path = download_and_convert_image(row[f'{col_prefix}_pic'], requested_path)
            if saved_path:
                safe_add_image(image_cell, saved_path, width=Inches(1.0))
            else:
                image_cell.text = 'No Image'

    # Save the Word document
    doc.save(word_output_path)
    print(f"Item {query} converted and saved to {word_output_path}")

./word/運動鞋.docx already exists. Skipping...
./word/3_5mm 轉 usb.docx already exists. Skipping...
./word/amd_迷你主機.docx already exists. Skipping...
No inktank_115 file found, skipping...
./word/iphone 音源線.docx already exists. Skipping...
./word/mio_行車紀錄器支架.docx already exists. Skipping...
./word/papago_導航機.docx already exists. Skipping...
./word/rtx 顯示卡.docx already exists. Skipping...
./word/switch_底座.docx already exists. Skipping...
No zenpower_10050 file found, skipping...
./word/大聖誕樹.docx already exists. Skipping...
./word/斗篷雨衣.docx already exists. Skipping...
./word/交換禮物.docx already exists. Skipping...
./word/行動電源.docx already exists. Skipping...
./word/兒童毛帽.docx already exists. Skipping...
./word/便攜式咖啡機.docx already exists. Skipping...
./word/倍潔雅.docx already exists. Skipping...
Item 馬玉山紅藜麥 converted and saved to ./word/馬玉山紅藜麥.docx
Item 莊園巧克力 converted and saved to ./word/莊園巧克力.docx
Item 發電機 converted and saved to ./word/發電機.docx
./word/感應垃圾桶.docx already exists. Skipping...
./word