In [None]:
import json
from process_data import get_car_type, get_price_range
from crawler import VnexpressCrawler
# Crawl the VnExpress V-Car section and collect the raw car information.
vnexpress_crawler = VnexpressCrawler(
    url="https://vnexpress.net/oto-xe-may/v-car/",
    base_url="https://vnexpress.net",
)
car_info = vnexpress_crawler.get_data()

# Enrich the crawl result in place. Order matters: the price-range helper is
# called after the "car_type" entry has been attached.
car_info["car_type"] = get_car_type(car_info)
car_info["listed_price"].extend(get_price_range(car_info))

# Persist as indented UTF-8 JSON; ensure_ascii=False keeps non-ASCII text
# unescaped so the file stays human-readable.
with open("car_info.json", mode="w", encoding="utf-8") as f:
    json.dump(car_info, f, ensure_ascii=False, indent=4)

In [None]:
import json
# Reload the previously saved crawl result so later cells can run without
# repeating the crawl (read mode is the default for open()).
with open("car_info.json", encoding="utf-8") as f:
    car_info = json.load(f)

In [None]:
from llama_index.core import Document, SummaryIndex

def _to_documents(records):
    """Wrap crawled records as LlamaIndex ``Document`` objects.

    Args:
        records: Iterable of mappings, each with a ``"content"`` entry and a
            ``"metadata"`` entry.

    Returns:
        A list with one ``Document`` per record. The document text is the
        record's ``"content"`` serialized as JSON (non-ASCII characters are
        kept unescaped); the record's ``"metadata"`` is passed through as-is.
    """
    return [
        Document(
            text=json.dumps(record["content"], ensure_ascii=False),
            metadata=record["metadata"],
        )
        for record in records
    ]


# One document list per section of the crawl result.
listed_documents = _to_documents(car_info["listed_price"])
# NOTE: the "detail_infomation" spelling (sic) is the key actually read here;
# it must keep matching whatever produced car_info.
detail_info_documents = _to_documents(car_info["detail_infomation"])
technical_detail_documents = _to_documents(car_info["technical_detail"])
car_inventory_documents = _to_documents(car_info["car_inventory_by_brand"])
car_type_documents = _to_documents(car_info["car_type"])




In [None]:
# Spot-check the first listed-price document (displayed as the cell output).
listed_documents[0]

# Store vector index

In [None]:
import openai
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.vector_stores.chroma import ChromaVectorStore
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
import chromadb
import os

# Configure the OpenAI SDK from the environment instead of hardcoding a key.
# Falls back to an empty string when OPENAI_API_KEY is unset, so a missing key
# is not reported here (presumably it surfaces on the first API call — confirm).
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def save_vector_index(documents, path: str="./chroma_db", collection_name: str="vnexpress_car_info", embedding_model: str="text-embedding-3-small"):
    """Embed ``documents`` and persist them in a Chroma collection.

    Args:
        documents: LlamaIndex documents to embed and store.
        path: Directory of the persistent Chroma database.
        collection_name: Chroma collection to write to; it is created when it
            does not exist yet and reused otherwise.
        embedding_model: Name of the OpenAI embedding model used to embed the
            documents.

    Returns:
        None. The index is built only for its side effect of writing the
        embedded documents into the collection.
    """
    # Open (or create) the on-disk Chroma database and the target collection.
    db = chromadb.PersistentClient(path=path)
    chroma_collection = db.get_or_create_collection(collection_name)

    # Route LlamaIndex storage to that Chroma collection.
    vector_store = ChromaVectorStore(chroma_collection=chroma_collection)
    storage_context = StorageContext.from_defaults(vector_store=vector_store)

    # The model is selected with `model=` and handed to the index with
    # `embed_model=`. The previous `name=` / `embedding=` keywords are not the
    # parameters these APIs read, so the requested model was never applied.
    # NOTE(review): whatever queries these collections must embed with the same
    # model, and collections persisted before this change were presumably built
    # with the library default — confirm / rebuild them.
    embed_model = OpenAIEmbedding(model=embedding_model)
    VectorStoreIndex.from_documents(
        documents,
        storage_context=storage_context,
        embed_model=embed_model,
    )

# Each call embeds one document list into its own collection under ./chroma_db.
# Only the car-type collection is built by this cell; the other calls are
# disabled — presumably those collections were persisted by an earlier run
# (confirm before relying on the tools that query them).
# NOTE(review): the collection is opened with get_or_create, so re-running this
# cell likely adds the same documents again — verify before re-executing.
# save_vector_index(listed_documents, path="./chroma_db", collection_name="vnexpress_car_listed_price")
# save_vector_index(technical_detail_documents, path="./chroma_db", collection_name="vnexpress_car_technical_detail")
# save_vector_index(detail_info_documents, path="./chroma_db", collection_name="vnexpress_car_info_detail")
# save_vector_index(car_inventory_documents, path="./chroma_db", collection_name="vnexpress_car_inventory")
save_vector_index(car_type_documents, path="./chroma_db", collection_name="vnexpress_car_type")

In [None]:
import re
import time
from typing import Tuple, Dict
from urllib.parse import urljoin

import numpy as np
import openai
from llama_index.core.tools import FunctionTool
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

from query_tool import VnExpressTool
# from search_tool import BonBanhCarPrice
from utils import get_page_soup

class GoogleSearch():
    """Minimal Selenium-driven Google search helper.

    Constructing an instance starts a headless Chrome session and opens
    google.com. The caller owns ``self.driver`` and is responsible for calling
    ``self.driver.quit()`` once finished.
    """

    def __init__(self) -> None:
        options = Options()
        # `options.headless = True` is the deprecated Selenium 4 API and has
        # been removed in later 4.x releases, where the assignment no longer
        # configures Chrome. Passing the browser flag works regardless.
        # NOTE: "--headless=new" needs a reasonably recent Chrome.
        options.add_argument("--headless=new")
        self.driver = webdriver.Chrome(options=options)
        try:
            self.driver.get("https://www.google.com")
        except Exception:
            # Do not leave a browser process behind when the first page load
            # fails; the caller never receives the instance to clean it up.
            self.driver.quit()
            raise

    def search(self, search_query: str):
        """Submit ``search_query`` on the open Google page.

        Args:
            search_query: Text typed into the search box.

        Returns:
            The Selenium elements matching the ``div.g`` CSS selector on the
            page after the query was submitted (may be empty).
        """
        search_box = self.driver.find_element(By.NAME, "q")
        search_box.send_keys(search_query)
        search_box.send_keys(Keys.RETURN)
        # Fixed pause to give the result page time to load before scraping.
        time.sleep(2)
        return self.driver.find_elements(By.CSS_SELECTOR, "div.g")
        
class BonBanhCarPrice():
    """Looks up used-car asking prices on bonbanh.com and exposes it as a tool."""

    def __init__(self) -> None:
        # LlamaIndex tool wrapper around `get_old_price`, handed to the agent.
        self.tool = FunctionTool.from_defaults(fn=self.get_old_price)
        # Base used to turn listing hrefs into absolute URLs.
        self.base_url = "https://bonbanh.com/"

    @staticmethod
    def _parse_price_vnd(text: str):
        """Convert a price label such as "1 tỷ 50 triệu" to an integer amount.

        The "tỷ" part counts billions and the "triệu" part counts millions, so
        "1 tỷ 50 triệu" is 1_050_000_000 (not 1.5 billion). Either part may be
        missing; decimal numbers with "," or "." are accepted.

        Args:
            text: Raw price label; matching is case-insensitive.

        Returns:
            The amount as an int, or None when the label contains neither unit.
        """
        text = text.lower()
        number = r"(\d+(?:[.,]\d+)?)"
        billions = re.search(number + r"\s*tỷ", text)
        millions = re.search(number + r"\s*triệu", text)
        if billions is None and millions is None:
            return None
        total = 0.0
        if billions is not None:
            total += float(billions.group(1).replace(",", ".")) * 1e9
        if millions is not None:
            total += float(millions.group(1).replace(",", ".")) * 1e6
        return int(round(total))

    def _get_old_price_from_bonbanh(self, url: str) -> Dict:
        """Scrape one bonbanh.com page for its price spread and listing links.

        Args:
            url: Page to fetch via ``get_page_soup``.

        Returns:
            A dict with ``"price_range"`` — the 25th and 75th percentile of the
            parsed prices as ints, or ``[None, None]`` when no price could be
            parsed — and ``"metadata_link"``, the absolute listing URLs found.
        """
        soup = get_page_soup(url)
        price_soup = soup.find_all("b", itemprop='price')
        car_items = soup.find_all(class_=re.compile(r'\bcar-item\b'))

        metadata_link = []
        for car_item in car_items:
            anchor = car_item.find("a", href=True)
            if anchor is None:
                # Listing block without a link: nothing to record.
                continue
            # urljoin avoids a doubled slash for root-relative hrefs and leaves
            # absolute hrefs intact.
            metadata_link.append(urljoin(self.base_url, anchor.get("href").lower()))

        car_price = []
        for price in price_soup:
            amount = self._parse_price_vnd(price.get_text())
            if amount is not None:
                car_price.append(amount)

        if not car_price:
            # np.quantile cannot summarize an empty sample.
            return {"price_range": [None, None], "metadata_link": metadata_link}

        q1 = int(np.quantile(car_price, 0.25))
        q3 = int(np.quantile(car_price, 0.75))
        return {"price_range": [q1, q3], "metadata_link": metadata_link}

    # Searches Google for "<query> bonbanh", takes the first result whose link
    # contains "bonbanh" and scrapes that page. Returns the dict produced by
    # `_get_old_price_from_bonbanh`, or the same shape with
    # "price_range": [None, None] and an empty "metadata_link" when no such
    # result is found. The browser is always closed, even on errors.
    # NOTE(review): the docstring is kept verbatim because FunctionTool
    # presumably uses it as the tool description shown to the LLM — confirm.
    def get_old_price(self, query: str) -> Dict:
        "get old car price from bonbanh.com"
        search_tool = GoogleSearch()
        try:
            search_results = search_tool.search(query + " bonbanh")
            for result in search_results:
                anchors = result.find_elements(By.TAG_NAME, "a")
                if not anchors:
                    continue
                link = anchors[0].get_attribute("href")
                print(link)
                if not link or "bonbanh" not in link:
                    continue
                return self._get_old_price_from_bonbanh(link)
        finally:
            search_tool.driver.quit()
        return {"price_range": [None, None], "metadata_link": []}


# One VnExpressTool per Chroma collection; the collection names match the ones
# used when the vector indexes were saved. The descriptions are runtime text
# passed to the tool and are kept in Vietnamese.
listed_price_tool = VnExpressTool(
    tool_name="listed_price_car",
    description="Cung cấp thông tin giá niêm yết của các mẫu xe hơi hiện tại ở Việt Nam",
    collection_name="vnexpress_car_listed_price",
)
info_detail_tool = VnExpressTool(
    tool_name="car_info_detail",
    description="Cung cấp mô tả và đánh giá chi tiết của các mẫu xe hơi hiện tại ở Việt Nam",
    collection_name="vnexpress_car_info_detail",
)
technical_detail_tool = VnExpressTool(
    tool_name="car_technical_detail",
    description="Cung cấp thông số kỹ thuật của các mẫu xe hơi hiện tại ở Việt Nam",
    collection_name="vnexpress_car_technical_detail",
)
car_inventory_tool = VnExpressTool(
    tool_name="car_inventory",
    description="Cung cấp danh sách xe của một thương hiệu xe hơi hiện tại ở Việt Nam",
    collection_name="vnexpress_car_inventory",
)
car_type_tool = VnExpressTool(
    tool_name="car_type",
    description="Cung cấp danh sách các loại xe hơi hiện cho một kiểu xe hơi cụ thể (SUV, Sedan, Hatchback, ...) ở Việt Nam",
    collection_name="vnexpress_car_type",
)

# Used-car price lookup backed by bonbanh.com.
old_car_price = BonBanhCarPrice()


In [None]:
from llama_index.agent.openai import OpenAIAgent, advanced_tool_call_parser
from llama_index.core.agent import ReActAgent, AgentChatResponse
from llama_index.llms.openai import OpenAI

# Chat model used by the agent.
llm = OpenAI(model="gpt-4o-mini")

# Every tool defined above is registered. `car_inventory_tool` was previously
# constructed but left out of this list, so the agent could never call it.
# NOTE(review): confirm the "vnexpress_car_inventory" collection has been
# indexed; otherwise this tool has nothing to retrieve.
agent_tools = [
    listed_price_tool.tool,
    info_detail_tool.tool,
    technical_detail_tool.tool,
    old_car_price.tool,
    car_type_tool.tool,
    car_inventory_tool.tool,
]
agent = OpenAIAgent.from_tools(
    agent_tools,
    llm=llm,
    verbose=True,
    tool_call_parser=advanced_tool_call_parser,
)


In [None]:
# Example query (Vietnamese for "used Toyota Vios price"); the agent's reply is
# displayed as the cell output.
agent.chat("giá xe toyota vios cũ")