In [37]:
import time
from sklearn.preprocessing import normalize
# from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch
import os
import settings
import json
from ollama import Client

In [2]:
# Convert text into an embedding vector
_EMBEDDING_MODEL_NAME = "infgrad/stella-mrl-large-zh-v3.5-1792d"
# Holds the loaded model so repeated calls do not rebuild it each time.
_embedding_model_cache = {}


def get_embeddings(query_str: str, n_dims: int = 1792) -> list:
    """Encode ``query_str`` and return its normalised embedding as a flat list.

    The SentenceTransformer model is created on the first call and reused
    afterwards; constructing it on every call repeats the model load for no
    benefit.

    NOTE(review): ``SentenceTransformer`` must be in scope when this runs; its
    import in the notebook's import cell is currently commented out.

    Args:
        query_str: Text to embed.
        n_dims: Number of leading embedding dimensions to keep before
            normalising. Defaults to 1792.

    Returns:
        The truncated, normalised embedding flattened into a Python list.
    """
    embedding_model = _embedding_model_cache.get(_EMBEDDING_MODEL_NAME)
    if embedding_model is None:
        embedding_model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
        _embedding_model_cache[_EMBEDDING_MODEL_NAME] = embedding_model

    # Normalisation is applied after truncation, so the encoder's own
    # normalisation is switched off.
    vectors = embedding_model.encode(
        [query_str], normalize_embeddings=False, show_progress_bar=False
    )
    cut_vecs = normalize(vectors[:, :n_dims])
    return cut_vecs.flatten().tolist()

# Embed a sample query once; later cells reuse `embedding_word` as the kNN
# query vector and as the rescore script parameter.
embedding_word = get_embeddings("四月名古屋 櫻花 滑雪場推薦")
# Bare expression so the notebook displays the resulting vector.
embedding_word

[-0.023494446650147438,
 0.0010405024513602257,
 -0.015481261536478996,
 -0.02097328007221222,
 0.030281592160463333,
 -0.036122143268585205,
 0.02688002772629261,
 -0.035839904099702835,
 0.01109291147440672,
 -0.020366491749882698,
 -0.002242638962343335,
 0.009477959014475346,
 -0.0021596902515739202,
 0.010264988988637924,
 0.00433137733489275,
 -0.01714974269270897,
 0.0060660094022750854,
 0.00848556961864233,
 0.005118826404213905,
 0.014211163856089115,
 0.035775624215602875,
 -0.009671625681221485,
 -0.011691215448081493,
 -0.05402087792754173,
 0.04883722588419914,
 -0.04671153798699379,
 -0.035745128989219666,
 -0.04210784286260605,
 0.010436510667204857,
 0.06444819271564484,
 -0.00034486062941141427,
 0.015078272670507431,
 0.012575013563036919,
 -0.024487776681780815,
 -0.024281824007630348,
 0.038412176072597504,
 -0.05735505372285843,
 0.011175350286066532,
 -0.02598433382809162,
 -0.0009749362943693995,
 0.002680971520021558,
 -0.01627868413925171,
 0.03450807929039001

In [None]:
# 移除elasticsearch query後的重複值
def _remove_duplicate(result_items: list) -> list:
    seen = set()
    unique_data = []
    for item in result_items:
        if item["_source"]["TBM0_CODE"] not in seen:
            seen.add(item["_source"]["TBM0_CODE"])
            item["_source"].pop("TBM0_CODE")
            unique_data.append(item)

    return unique_data

In [None]:
## Vector search: single-field vector search over product information

# NOTE(review): `keyword` is not used below while the embedding call is
# commented out; the search reuses `embedding_word` from the earlier cell.
keyword = "四月名古屋 櫻花 滑雪場推薦"
# query_vector = get_embeddings(keyword)

start = time.time()
new_knn = [
    {
        "field": "embedding",  # vector field in the ES index (typically a dense_vector field)
        "query_vector": embedding_word,  # query vector
        "num_candidates": 150,  # first gather 150 potentially matching candidates from the index
        "k": 80,  # keep the 80 most similar results
        "similarity": 0.6,  # only keep results with similarity >= 0.6
        "filter": {
            "nested": {
                "path": "GROUP_INFO",
                "query": {"range": {"GROUP_INFO.B2C_LOW_PRICE": {"gt": 0.0}}},  # keep products with price > 0
            }
        },
    }
]

# NOTE(review): the kNN clause searches the "embedding" field while the rescore
# script reads 'FromToVector' — confirm that the two fields are intended.
rescore = {
    "window_size": 80,  # only re-score the top 80 results
    "query": {
        # rescore_query: re-evaluate similarity with cosineSimilarity to refine the ordering
        "rescore_query": {
            # script_score: custom script scoring query_vector against the FromToVector field to decide the final ranking
            "script_score": {
                "query": {"match_all": {}}, 
                "script": {
                    "source": "cosineSimilarity(params.query_vector, 'FromToVector')", 
                    "params": {"query_vector": embedding_word},
                },
            }
        }
    },
}

index_1 = "group-vector-20250220"
# Source fields returned for each hit; TBM0_CODE is fetched for de-duplication.
include_fields = ["PROD_NO", "PROD_NAME", "DEPARTURE_LIST.CITY_NAME", "DESTINATION_LIST.CITY_NAME", "AREA_NAME", "AREAD_NAME",
                  "T_IMAGE_URL", "PROD_DAYS", "WEB", "B2C_LOW_PRICE", "TBM0_CODE",]

# NOTE(review): `es_conn` is used below but its construction is commented out,
# so this cell relies on a connection created elsewhere.
# es_conn = Elasticsearch([f"http://{USER_NAME}:{PASSWORD}@{HOST_1}:9200",])

# Match similar vectors with kNN, then fine-tune the ordering with rescore; return at most 100 hits starting from offset 0 (usable for pagination)
result = es_conn.search(index=index_1, knn=new_knn, rescore=rescore, size=100, from_=0, source=include_fields,)
uq_result = _remove_duplicate(result["hits"]["hits"])

end = time.time()
print(f"processing time: {end - start}",)
print({"data": uq_result, "llm_keyword": {}})

In [31]:
# Call the Ollama model to split the user's input into filter conditions
def _empty_keywords() -> dict:
    """Return a fresh fallback result for when the LLM reply cannot be used."""
    return {
        "depart": [],
        "arrive": [],
        "days": 0,
        "depart_date": "",
        "budget": 0,
        "airlines": [],
        "is_grouped": None,
    }


def _parse_llm_reply(content: str):
    """Decode the LLM reply text into a Python object.

    Tries, in order: the body of a markdown code fence (or the whole text)
    as JSON, the same text as a Python literal (single-quoted dicts, ``None``),
    and finally the legacy character clean-up so replies that used to parse
    still do. Raises the final ``json`` error when nothing works.
    """
    import ast
    import re

    text = content.strip()
    fenced = re.search(r"```(?:json)?\s*(.*?)\s*```", text, flags=re.DOTALL | re.IGNORECASE)
    if fenced:
        text = fenced.group(1)

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    try:
        return ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        pass

    # Legacy clean-up, kept as a last resort only: it strips "json" and swaps
    # quotes everywhere, which corrupts values containing those characters.
    return json.loads(content.replace("'", '"').replace("`", "").replace("json", ""))


def llm_extract_keyword(
    keyword_str: str,
    host: str = "https://iard.liontravel.com/ollama",
    model: str = "gemma2:27b",
) -> dict:
    """Ask the LLM to extract structured filter conditions from a search phrase.

    Args:
        keyword_str: The user's raw search text, sent as the last user message.
        host: Ollama endpoint passed to ``Client``.
        model: Name of the model passed to ``client.chat``.

    Returns:
        The dict decoded from the LLM reply. When the reply cannot be decoded,
        or decodes to something other than a dict, a fallback dict with empty
        values is returned instead.

    Errors raised by the chat request itself are not caught here.
    """
    start = time.time()
    client = Client(host=host)
    response = client.chat(
        model=model,
        messages=[
            {"role": "system", "content": settings.system_prompt},
            {"role": "user", "content": settings.user_prompt},
            {"role": "assistant", "content": settings.assistant_prompt},
            {"role": "user", "content": keyword_str},
        ],
    )
    end = time.time()

    try:
        result = _parse_llm_reply(response.message.content)
    except Exception as e:
        # Best effort: an unusable reply degrades to "no filters", not a crash.
        print(e)
        return _empty_keywords()

    print(f"gpt processing time: {end - start}")
    if isinstance(result, dict):
        return result
    return _empty_keywords()

In [42]:
# 依照llm切割出的過濾條件, 整理成filter語句
def _get_knn_filter(extracted_kw: str) -> dict:
    print(extracted_kw)
    knn_filter = {"bool": {"must": [], "should": [], "minimum_should_match": 1}}
    nested_group_info = {
        "nested": {
            "path": "GROUP_INFO",
            "ignore_unmapped": True,
            "query": {"bool": {"must": []}},
        }
    }

    nested_tag_list = {
        "nested": {
            "path": "TAG_LIST",
            "ignore_unmapped": True,
            "query": {"bool": {"must": []}},
        }
    }

    if extracted_kw.get("depart"):
        tmp = []
        for kw in extracted_kw.get("depart"):
            if kw == "台灣":
                tmp.append({"term": {"DEPARTURE_LIST.CITY_NAME": "台北"}})
                tmp.append({"term": {"DEPARTURE_LIST.CITY_NAME": "台中"}})
                tmp.append({"term": {"DEPARTURE_LIST.CITY_NAME": "高雄"}})
            else:
                tmp.append({"term": {"DEPARTURE_LIST.CITY_NAME": kw}})
        knn_filter["bool"]["must"].append(
            {
                "nested": {
                    "path": "DEPARTURE_LIST",
                    "query": {
                        "bool": {
                            "should": tmp,
                            "minimum_should_match": 1,
                        },
                    },
                }
            }
        )

    if extracted_kw.get("arrive"):
        tmp = []
        for kw in extracted_kw.get("arrive"):
            tmp.append({"term": {"DESTINATION_LIST.CITY_NAME": kw}})
            knn_filter["bool"]["should"].append({"term": {"AREA_NAME": kw}})
            knn_filter["bool"]["should"].append({"term": {"AREAD_NAME": kw}})

        knn_filter["bool"]["should"].append(
            {"terms": {"AREA_SYNONYM_LIST": extracted_kw.get("arrive")}}
        )
        knn_filter["bool"]["should"].append(
            {
                "nested": {
                    "path": "DESTINATION_LIST",
                    "query": {
                        "bool": {
                            "should": tmp,
                            "minimum_should_match": 1,
                        },
                    },
                }
            }
        )

    if extracted_kw.get("days"):
        # in case LLM return string value for days field
        try:
            days = int(extracted_kw.get("days"))
            knn_filter["bool"]["must"].append({"term": {"PROD_DAYS": days}})
        except ValueError:
            pass

    if extracted_kw.get("depart_date"):
        nested_group_info["nested"]["query"]["bool"]["must"].append(
            {
                "range": {
                    "GROUP_INFO.PROD_DATE_BEGIN": {
                        "gte": extracted_kw.get("depart_date")
                    }
                }
            }
        )

    if extracted_kw.get("budget"):
        nested_group_info["nested"]["query"]["bool"]["must"].append(
            {
                "range": {
                    "GROUP_INFO.B2C_LOW_PRICE": {
                        "lte": extracted_kw.get("budget"),
                        "gt": 0.0,
                    }
                }
            }
        )
    else:
        nested_group_info["nested"]["query"]["bool"]["must"].append(
            {
                "range": {
                    "GROUP_INFO.B2C_LOW_PRICE": {
                        "gt": 0.0,
                    }
                }
            }
        )

    if extracted_kw.get("airlines"):
        nested_group_flight_info = {
            "nested": {
                "path": "GROUP_INFO.FLIGHT_INFO",
                "ignore_unmapped": True,
                "query": {"bool": {"should": [], "minimum_should_match": 1}},
            }
        }

        for airline in extracted_kw.get("airlines"):
            nested_group_flight_info["nested"]["query"]["bool"]["should"].append(
                {
                    "match_phrase": {
                        "GROUP_INFO.FLIGHT_INFO.FLIGHT_CARR": {
                            "query": airline,
                            "slop": 1,
                        }
                    }
                }
            )
        nested_group_info["nested"]["query"]["bool"]["must"].append(
            nested_group_flight_info
        )

    if extracted_kw.get("is_grouped") is True:
        nested_group_info["nested"]["query"]["bool"]["must"].append(
            {"term": {"GROUP_INFO.PROD_GO": 1}}
        )

    if extracted_kw.get("excludes"):
        knn_filter["bool"]["must_not"] = []
        for kw in extracted_kw.get("excludes"):
            nested_tag_list["nested"]["query"]["bool"]["must"].append(
                {
                    "match_phrase": {
                        "TAG_LIST.TAG_NAME": {
                            "query": kw,
                            "slop": 1,
                        }
                    }
                }
            )
            knn_filter["bool"]["must_not"].append(
                {
                    "match_phrase": {
                        "KEYWORD_LIST": {
                            "query": kw,
                            "slop": 1,
                        }
                    }
                }
            )
            knn_filter["bool"]["must_not"].append({"match": {"PROD_NAME": kw}})
            knn_filter["bool"]["must_not"].append({"match": {"PROD_NAME_1": kw}})

        knn_filter["bool"]["must_not"].append(
            {"terms": {"KEYWORDS": extracted_kw.get("excludes")}}
        )
        knn_filter["bool"]["must_not"].append(nested_tag_list)

    knn_filter["bool"]["must"].append(nested_group_info)

    return knn_filter

In [43]:
# Try the filter builder on a sample extraction in which only the destination
# keywords are filled in (note that 'is_grouped' is the string 'None' here).
_get_knn_filter({'depart': [], 'arrive': ['日本', '愛知', '名古屋'], 'days': 0, 'depart_date': '', 'budget': 0, 'airlines': [], 'is_grouped': 'None', 'excludes': []})

{'depart': [], 'arrive': ['日本', '愛知', '名古屋'], 'days': 0, 'depart_date': '', 'budget': 0, 'airlines': [], 'is_grouped': 'None', 'excludes': []}


{'bool': {'must': [{'nested': {'path': 'GROUP_INFO',
     'ignore_unmapped': True,
     'query': {'bool': {'must': [{'range': {'GROUP_INFO.B2C_LOW_PRICE': {'gt': 0.0}}}]}}}}],
  'should': [{'term': {'AREA_NAME': '日本'}},
   {'term': {'AREAD_NAME': '日本'}},
   {'term': {'AREA_NAME': '愛知'}},
   {'term': {'AREAD_NAME': '愛知'}},
   {'term': {'AREA_NAME': '名古屋'}},
   {'term': {'AREAD_NAME': '名古屋'}},
   {'terms': {'AREA_SYNONYM_LIST': ['日本', '愛知', '名古屋']}},
   {'nested': {'path': 'DESTINATION_LIST',
     'query': {'bool': {'should': [{'term': {'DESTINATION_LIST.CITY_NAME': '日本'}},
        {'term': {'DESTINATION_LIST.CITY_NAME': '愛知'}},
        {'term': {'DESTINATION_LIST.CITY_NAME': '名古屋'}}],
       'minimum_should_match': 1}}}}],
  'minimum_should_match': 1}}

In [None]:
## Single-field vector search combined with LLM-extracted filters

keyword = "四月名古屋 櫻花 滑雪場推薦"
start = time.time()
query_vector = get_embeddings(keyword)
# Extract the filter conditions inside this cell so it does not depend on an
# `extracted_kw` variable left in the kernel by an earlier run.
extracted_kw = llm_extract_keyword(keyword)
print(extracted_kw)

new_knn = {
    "field": "embedding",
    "query_vector": query_vector,
    "num_candidates": 150,
    "k": 80,
    # Minimum similarity for a hit to be kept.
    # NOTE(review): an earlier note here said "lower similarity from 0.7 to
    # 0.65" but the value is 0.6 — confirm the intended threshold.
    "similarity": 0.6,
}

new_knn["filter"] = _get_knn_filter(extracted_kw)

# `es_conn`, `index_1` and `include_fields` come from the earlier search cell.
result = es_conn.search(index=index_1, knn=new_knn, size=100, from_=0, source=include_fields,)
uq_result = _remove_duplicate(result.body["hits"]["hits"])

end = time.time()
print(f"processing time: {end - start}",)
print({"llm_keyword": extracted_kw, "data": uq_result})

gpt processing time: 6.352467060089111
{'depart': [], 'arrive': ['日本', '愛知', '名古屋'], 'days': 0, 'depart_date': '', 'budget': 0, 'airlines': [], 'is_grouped': 'None', 'excludes': []}
