In [13]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import time
import re
import pickle
# Set up Selenium WebDriver: build the Chrome options object by feeding it
# every command-line switch from a single table.
chrome_options = Options()
for _chrome_flag in (
    "--headless",               # run in headless mode
    "--no-sandbox",             # bypass OS security model
    "--disable-dev-shm-usage",  # overcome limited resource problems
    "--disable-gpu",            # applicable to windows os only
    "start-maximized",          # open Browser in maximized mode
    "disable-infobars",         # disabling infobars
    "--disable-extensions",     # disabling extensions
):
    chrome_options.add_argument(_chrome_flag)


def scraperLoop(key, text_list, driver, syn, numberOfObjects, special_word = '', time_of_waiting = 4):
    """Ask the chat page for synonyms of ``syn`` and store the parsed answer.

    The function types a prompt into the chat textarea, submits it, waits,
    reads the reply element, cleans and tokenises its text, writes the
    resulting list to ``text_list[key]`` and finally clicks the "new chat"
    link so the next call starts from a fresh conversation.

    Args:
        key: Key under which the parsed synonyms are stored in ``text_list``.
        text_list: Mutable mapping that receives the result (despite the
            name, it is indexed by ``key``, not by position).
        driver: Selenium WebDriver already showing the chat page.
        syn: Seed phrase whose synonyms are requested.
        numberOfObjects: Number of synonyms requested in the prompt.
        special_word: Optional extra text inserted into the prompt.
        time_of_waiting: Seconds to sleep after submitting the prompt before
            the reply is read for the first time.

    Returns:
        None. ``text_list[key]`` is updated in place; if the reply element
        cannot be read after 20 attempts the entry is left unchanged and a
        ``RuntimeWarning`` is issued.
    """
    import warnings  # local import: only needed on the failure path below

    # Absolute XPaths into the page layout; they break if the site markup changes.
    chat_xpath = '/html/body/div/div[1]/div/div[2]/div/form/div/div/textarea'
    fetch_xpath = '/html/body/div/div[1]/div/div[1]/div/div[2]/div[1]'
    new_chat_xpath = '/html/body/div/div[1]/nav[3]/div[1]/a[2]'

    time.sleep(1)
    chat_input = driver.find_element(By.XPATH, chat_xpath)
    chat_input.send_keys(f'{numberOfObjects}個{syn}的同義詞？ {special_word} 用空格分開')
    chat_input.send_keys(Keys.RETURN)
    time.sleep(time_of_waiting)

    # Poll for the reply element: up to 20 attempts, one second apart.
    for _ in range(20):
        try:
            reply_text = driver.find_element(By.XPATH, fetch_xpath).text
        except Exception:
            # Best-effort: the reply may simply not be rendered yet.
            time.sleep(1)
            continue
        # Drop whole full-width parenthesised groups first (the original
        # pattern put `.*?` inside a character class, so only the bracket
        # characters were removed and the enclosed text leaked through), then
        # strip stray brackets, '.', '*', '?' and ASCII letters/digits.
        cleaned = re.sub(r"（.*?）|[（）.*?0-9a-zA-Z]", '', reply_text)
        # Leading/trailing separators make re.split emit '' — discard those.
        text_list[key] = [tok for tok in re.split(r"[.\s,，。、]+", cleaned) if tok]
        break
    else:
        warnings.warn(
            f"scraperLoop: no reply could be read for key {key!r}; entry left unchanged",
            RuntimeWarning,
        )

    # Start a fresh conversation for the next prompt.
    new_chat_entry = driver.find_element(By.XPATH, new_chat_xpath)
    new_chat_entry.click()

In [14]:

# Seed phrase sent to the chat page for every canonical value, per category.
text_list = {
    'price_synonyms': {'低':'便宜','中': '中等','高':'不便宜'},
    'size_synonyms': {'大':'大容量', '中':'剛剛好','小': '小容量'},
    'color_synonyms': {'白色':'白色', '紅色':'紅色'}
}
# Result container: category -> canonical value -> list of scraped synonyms.
# scraperLoop replaces each (initially empty) list in place.
text_map = {
    'price_synonyms':{
        '低': [],
        '中': [],
        '高': []
    },
    'size_synonyms':{
        '大': [],
        '中': [],
        '小': []
    },
    'color_synonyms':{
        '白色': [],
        '紅色': []
    }
}

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
try:
    time.sleep(1)
    driver.get("https://twllm.com/")

    # Click the button at b1_xpath before using the chat
    # (presumably dismisses an entry dialog — TODO confirm).
    b1_xpath = '/html/body/div[2]/div/div/div/div/button'
    b1_input = driver.find_element(By.XPATH, b1_xpath)
    b1_input.click()

    # One chat round-trip per canonical value; only existing keys are
    # reassigned, so mutating text_map while iterating over it is safe.
    for syn_key, syn_map in text_map.items():
        for key, val in syn_map.items():
            scraperLoop(key, text_map[syn_key], syn=text_list[syn_key][key],driver=driver, numberOfObjects=10)
finally:
    # Always shut the browser down, even when navigation or scraping raises;
    # otherwise the Chrome process is leaked.
    driver.quit()

In [15]:
# Invert text_map: for every category, map each scraped synonym back to the
# canonical value it was requested for. When the same synonym appears under
# several canonical values, the one iterated last wins.
actual_map = {}
for category, canonical_to_synonyms in text_map.items():
    actual_map[category] = {
        synonym: canonical
        for canonical, synonyms in canonical_to_synonyms.items()
        for synonym in synonyms
    }

In [16]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher

# Load the Chinese model
nlp = spacy.load('zh_core_web_md')

# Initialize the matchers. `matcher` receives no patterns in this cell; it is
# kept because extract_form_data calls it.
matcher = Matcher(nlp.vocab)
phrase_matcher = PhraseMatcher(nlp.vocab)

# Synonym mappings: surface phrase -> canonical form value.
# Price, size and colour come from the scraped/inverted actual_map; the
# remaining tables are hand-written.
price_synonyms = actual_map['price_synonyms']

gift_synonyms = {
    '烤肉': True,
    '沒有贈品': False,
    '附贈品': True
}

size_synonyms = actual_map['size_synonyms']

color_synonyms = actual_map['color_synonyms']

temperature_synonyms = {
    '低': '低',
    '中': '中',
    '高': '高'
}

capacity_synonyms = {
    '6人份': 6,
    '4人份': 4,
    '5人份': 5
}

# Define patterns using PhraseMatcher for multi-word synonyms
price_patterns = list(price_synonyms.keys())
gift_patterns = list(gift_synonyms.keys())
size_patterns = list(size_synonyms.keys())
color_patterns = list(color_synonyms.keys())
temperature_patterns = list(temperature_synonyms.keys())
capacity_patterns = list(capacity_synonyms.keys())

# Turn every phrase into a Doc. The PhraseMatcher is built with its default
# attribute (token text), so tokenisation alone is enough: nlp.make_doc skips
# the tagger/parser/NER that a full nlp(text) call would run per pattern.
price_phrases = [nlp.make_doc(text) for text in price_patterns]
gift_phrases = [nlp.make_doc(text) for text in gift_patterns]
size_phrases = [nlp.make_doc(text) for text in size_patterns]
color_phrases = [nlp.make_doc(text) for text in color_patterns]
temperature_phrases = [nlp.make_doc(text) for text in temperature_patterns]
capacity_phrases = [nlp.make_doc(text) for text in capacity_patterns]

# Register the patterns with the current add(key, docs) signature: a list of
# Doc patterns as the second argument instead of the legacy
# add(key, None, *docs) form.
phrase_matcher.add("PRICE", price_phrases)
phrase_matcher.add("GIFT", gift_phrases)
phrase_matcher.add("SIZE", size_phrases)
phrase_matcher.add("COLOR", color_phrases)
phrase_matcher.add("TEMPERATURE", temperature_phrases)
phrase_matcher.add("CAPACITY", capacity_phrases)

def extract_form_data(input_text):
    """Fill a fixed six-column form from free text via the phrase matcher.

    The text is run through ``nlp``; every ``phrase_matcher`` hit is routed by
    its label to one form column, and the matched text is translated to a
    canonical value through that label's synonym table. When a label matches
    more than once, the last match overwrites the earlier ones.

    Args:
        input_text: Raw request text to analyse.

    Returns:
        A ``(columns, form_data)`` tuple: the list of column names and a list
        of the same length holding the extracted value for each column, or
        ``None`` where nothing was matched.
    """
    columns = ['價格', '份數', "溫度(低)", '大小', '顏色', '贈品']
    form_data = [None] * len(columns)

    # Match label -> (slot in form_data, synonym table used to canonicalise).
    label_routes = {
        "PRICE": (columns.index('價格'), price_synonyms),
        "GIFT": (columns.index('贈品'), gift_synonyms),
        "SIZE": (columns.index('大小'), size_synonyms),
        "COLOR": (columns.index('顏色'), color_synonyms),
        "TEMPERATURE": (columns.index('溫度(低)'), temperature_synonyms),
        "CAPACITY": (columns.index('份數'), capacity_synonyms),
    }

    # Process the input text with spaCy
    doc = nlp(input_text)

    # The token-level matcher is still invoked, but its result is not used
    # when filling the form.
    matches = matcher(doc)

    for match_id, start, end in phrase_matcher(doc):
        label = nlp.vocab.strings[match_id]
        route = label_routes.get(label)
        if route is None:
            # Labels without a route leave the form untouched.
            continue
        slot, synonyms = route
        form_data[slot] = synonyms.get(doc[start:end].text, None)

    return columns, form_data

# Smoke test: run the extractor on one sample request and show the result.
input_text = "我要一個大型不便宜白色氣炸鍋子來烤肉，6人份"
columns, filled_form = extract_form_data(input_text)

for _caption, _payload in (("Columns:", columns), ("Filled Form:", filled_form)):
    print(_caption, _payload)
