In [327]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

---

## 1. Price parsing

- `original_price`: Giá gốc (giá chưa giảm, đơn vị: VND)
- `sale_price`: Giá khuyến mãi (giá đã giảm, đơn vị: VND)
- `percent_discount`: Phần trăm số tiền đã giảm (đơn vị: %)

<pre>
- Có giá khuyến mãi: 
    &lt;fare-sale&gt; chứa giá khuyến mãi
        &lt;fareSmall&gt; chứa giá gốc và phần trăm ưu đãi
- Không có giá khuyến mãi:
    &lt;fare&gt; chứa giá gốc
        &lt;fareSmall&gt; không chứa dữ liệu
</pre>

In [328]:
def has_no_discount_price(block) -> bool:
    '''
    Tell whether a trip block is listed without a promotional price.

    Returns True when the block contains a <div class="fare"> element
    (regular price only) and False when it does not (the trip is then
    treated as having a promotional price).
    '''
    # A regular-price trip is recognised purely by the presence of the `fare` div.
    return bool(block.find('div', class_='fare'))

def parse_fare(block):
    '''
    Extract the price of a trip that has no promotional price.

    Reads the text of <div class="fare">, strips the currency sign and the
    "from" prefix, and returns a dict with keys `original_price`,
    `sale_price` (always None) and `percent_discount` (always None).
    '''
    price_text = block.find('div', class_='fare').get_text(strip=True)

    # Remove the currency sign first, then the leading "from" prefix.
    for token in ("ƒë", 'T·ª´ '):
        price_text = price_text.replace(token, "")

    return {
        "original_price": price_text.strip(),
        "sale_price": None,
        "percent_discount": None,
    }

def parse_fare_small(block):
    '''
    Read the pre-discount price and the discount percentage of a discounted trip.

    Both values are looked up inside <div class="fareSmall">: the original
    price in a nested <div class="small"> and the percentage in a nested
    <div class="percent">.

    Returns a tuple (original_price, percent_discount). Each element is None
    when its element (or the whole `fareSmall` container) is missing.
    '''
    fare_small = block.find('div', class_='fareSmall')
    if not fare_small:
        return None, None

    # Each child is optional; look them up independently so that a missing
    # `small` div no longer raises and no longer hides the percentage.
    price_tag = fare_small.find('div', class_='small')
    original_price = (
        price_tag.get_text(strip=True).replace("ƒë", "").strip() if price_tag else None
    )

    percent_tag = fare_small.find('div', class_='percent')
    percent_discount = percent_tag.get_text(strip=True) if percent_tag else None

    return original_price, percent_discount

def parse_sale_price_info(block):
    '''
    Extract the prices of a trip that has a promotional price.

    The original price and discount percentage come from `parse_fare_small`;
    the promotional price is the cleaned text of <div class="fare-sale">.
    Returns a dict with keys `original_price`, `sale_price` and
    `percent_discount`; `sale_price` is None when the element is missing or empty.
    '''
    sale_tag = block.find('div', class_='fare-sale')
    original_price, percent_discount = parse_fare_small(block)

    sale_text = sale_tag.get_text(strip=True) if sale_tag else ""
    if sale_text:
        # Strip the currency sign and the leading "from" prefix.
        sale_price = sale_text.replace("ƒë", "").replace('T·ª´ ', '').strip()
    else:
        sale_price = None

    return {
        "original_price": original_price,
        "sale_price": sale_price,
        "percent_discount": percent_discount,
    }

In [329]:
def parse_price(block):
    '''
    Extract the price information of one trip block.

    Dispatches to `parse_fare` when the trip has no promotional price and to
    `parse_sale_price_info` otherwise. Returns the dict produced by the chosen
    parser (original price, sale price, discount percentage).
    '''
    price_parser = parse_fare if has_no_discount_price(block) else parse_sale_price_info
    return price_parser(block)

---

## 2. Bus info parsing

In [330]:
def parser_trip_bus_info(container):
    '''
    Extract the bus operator details from one trip container.

    Returns a dict with keys `bus_name`, `bus_rating` and `seat_type`.
    Each value is the stripped text of the matching element, or None when
    that element is not present.
    '''

    # bus name / company name
    bus_element = container.find('div', class_='bus-name')
    bus_name = bus_element.get_text(strip=True) if bus_element else None

    # bus rating: the text sits in a <span> inside <div class="bus-rating">.
    # The wrapper itself may be missing, so check it before descending.
    rating_wrapper = container.find('div', class_='bus-rating')
    rating_element = rating_wrapper.find('span') if rating_wrapper else None
    bus_rating = rating_element.get_text(strip=True) if rating_element else None

    # seat type
    seat_element = container.find('div', class_='seat-type')
    seat_type = seat_element.get_text(strip=True) if seat_element else None

    return {
        'bus_name': bus_name,
        'bus_rating': bus_rating,
        'seat_type': seat_type
    }

---

## 3. Route parsing

In [331]:
# D·ªØ li·ªáu n√†y n·∫±m ·ªü √¥ filter chuy·∫øn ƒëi

def parse_route_info(block):
    '''
    Extract the route selected in the page's search filter.

    Returns a dict with keys `departure_date` (text of
    <p class="date-input-value">), `start_point` (the `value` attribute of the
    element with id "from_input") and `destination` (the `value` attribute of
    the element with id "to_input"). A field is None when its element is
    missing; all fields are None when `block` itself is None.
    '''
    route = {'departure_date': None, 'start_point': None, 'destination': None}
    if block is None:
        return route

    # Look each field up on its own so that one missing element does not
    # discard the other two.
    date_tag = block.find('p', class_='date-input-value')
    if date_tag is not None:
        route['departure_date'] = date_tag.get_text(strip=True)

    from_input = block.find(id="from_input")
    if from_input is not None:
        route['start_point'] = from_input.get('value')

    to_input = block.find(id="to_input")
    if to_input is not None:
        route['destination'] = to_input.get('value')

    return route

---

## 4. Details trip info

### 4.1 Departure

In [332]:
# D·ªØ li·ªáu n√†y n·∫±m trong container > 'from_content'

def parse_departure_trip_info(from_content):
    """
    Extract the departure details from a 'from_content' container.

    Returns a dict with keys `departure_time` and `pick_up_point`. Values are
    None when the matching element, or the container itself, is missing.
    The same dict shape is returned in every case so callers can merge it
    with other dicts.
    """
    departure_info = {'departure_time': None, 'pick_up_point': None}

    # Missing container: keep every field as None.
    if not from_content:
        return departure_info

    # departure time
    departure_time_element = from_content.find('div', class_='hour')
    if departure_time_element:
        departure_info['departure_time'] = departure_time_element.get_text(strip=True)

    # pick-up place
    from_place_tag = from_content.find('div', class_='place')
    if from_place_tag:
        departure_info['pick_up_point'] = from_place_tag.get_text(strip=True)

    return departure_info

### 4.2 Arrival

In [333]:
def parse_arrival_trip_info(to_content):
    """
    Extract the arrival details from a 'to_content' container.

    Returns a dict with keys `arrival_date`, `arrival_time` and
    `drop_of_point` (key spelling kept as-is because it becomes a column
    name downstream). Values are None when the matching element, or the
    container itself, is missing.
    """
    arrival_info = {'arrival_date': None, 'arrival_time': None, 'drop_of_point': None}

    # Missing container: keep every field as None.
    if not to_content:
        return arrival_info

    # arrival date
    date_arrival_tag = to_content.find('span', class_="text-date-arrival-time")
    if date_arrival_tag:
        arrival_info['arrival_date'] = date_arrival_tag.get_text(strip=True)

    # arrival time and drop-off place both live inside `content-to-info`;
    # when that wrapper is absent the two fields simply stay None.
    content_to_info = to_content.find('div', class_='content-to-info')
    if content_to_info:
        to_hour_tag = content_to_info.find('div', class_='hour')
        if to_hour_tag:
            arrival_info['arrival_time'] = to_hour_tag.get_text(strip=True)

        dropoff_place_element = content_to_info.find('div', class_='place')
        if dropoff_place_element:
            arrival_info['drop_of_point'] = dropoff_place_element.get_text(strip=True)

    return arrival_info


---

In [334]:
def parse_trip_timing(container):
    """
    Extract the schedule of one trip: departure, arrival and travel duration.

    Returns a dict with keys `departure_time`, `pick_up_point`,
    `arrival_date`, `arrival_time`, `drop_of_point` and `duration`.
    The same keys are returned whether or not the `from-to-content` block is
    found, so every trip yields the same columns; missing values are None.
    """
    trip_data = {
        'departure_time': None,
        'pick_up_point': None,
        'arrival_date': None,
        'arrival_time': None,
        'drop_of_point': None,
        'duration': None,
    }

    # Block holding both the departure and the arrival details.
    from_to_content = container.find('div', class_="from-to-content")
    if from_to_content is None:
        return trip_data

    # Departure side (only parsed when present, so the merge always gets a dict).
    from_content = from_to_content.find('div', class_='content from')
    if from_content is not None:
        trip_data |= parse_departure_trip_info(from_content)

    # Arrival side.
    to_content = from_to_content.find('div', class_='content to')
    if to_content is not None:
        trip_data |= parse_arrival_trip_info(to_content)

    # Travel duration.
    duration_tag = from_to_content.find('div', class_="duration")
    if duration_tag is not None:
        trip_data['duration'] = duration_tag.get_text(strip=True)

    return trip_data


In [335]:
def compile_trip_info(block):
    '''
    Gather everything known about one trip from a single data block.

    Combines, in this order, the operator details (`parser_trip_bus_info`),
    the schedule (`parse_trip_timing`) and the prices (`parse_price`) into
    one flat dict; on a key clash the later source wins.
    '''
    bus_part = parser_trip_bus_info(block)       # operator name, rating, seat type
    timing_part = parse_trip_timing(block)       # departure / arrival / duration
    price_part = parse_price(block)              # original and promotional price

    return {**bus_part, **timing_part, **price_part}


---

## Parsing rating

In [336]:
def parse_ratings_from_container(container):
    '''
    Collect the rating entries shown inside one container.

    Every <div class="rate-title"> holding at least two <p> elements yields a
    pair (rate_title, rate_point) built from the first two paragraphs.
    Returns the list of pairs, or [(None, None)] when nothing was found or
    any error occurred while parsing.
    '''
    placeholder = [(None, None)]
    try:
        pairs = []
        for title_block in container.find_all('div', class_='rate-title'):
            paragraphs = title_block.find_all('p')
            if len(paragraphs) < 2:
                continue  # incomplete entry: needs both a title and a score
            title_tag, point_tag = paragraphs[0], paragraphs[1]
            pairs.append((title_tag.get_text(strip=True), point_tag.get_text(strip=True)))
        return pairs if pairs else placeholder
    except Exception:
        # Best-effort parser: any failure falls back to the placeholder.
        return placeholder


total

In [337]:
# G·ªôp d·ªØ li·ªáu chuy·∫øn xe v√† c√°c ƒë√°nh gi√° l·∫°i v·ªõi nhau
def extract_all_trips(soup):
    '''
    Extract every trip on the page and return them as one DataFrame.

    Each <div class="container"> is parsed with `compile_trip_info` and merged
    with the page-level route information from `parse_route_info` (route
    values win on a key clash). One row is produced per container.

    Raises
    ------
    ValueError
        When the page contains no trip container at all.
    '''

    dict_route = parse_route_info(soup)   # route selected in the search filter
    containers = soup.find_all("div", class_="container")   # one per trip

    if not containers:
        # Explicit error instead of pandas' generic "No objects to concatenate".
        raise ValueError("No trip containers (div.container) found in the page")

    # Build plain records first and create the DataFrame once, rather than
    # creating one single-row DataFrame per trip and concatenating them.
    records = [compile_trip_info(container) | dict_route for container in containers]

    return pd.DataFrame(records)

---

In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException
import time, random
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

## 5. Handle Button logic

### 5.1. Button `Xem thêm chuyến`

In [None]:
def click_load_more_until_end(driver, max_wait: int = 10):
    """
    Keep clicking the "load more trips" button until it stops appearing.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    max_wait : int, optional
        Seconds to wait for the button to become clickable, and for new
        trips to appear after each click (default 10).

    The loop ends when the button does not become clickable in time, when a
    click adds no new "trip-item" element in time, or when the click is
    intercepted / the element is gone.
    """

    wait = WebDriverWait(driver, max_wait)

    while True:
        try:
            # Wait for the "load more" button to be ready for a click.
            load_more_button = wait.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "load-more"))
            )

            # Take the baseline BEFORE clicking; a baseline of 0 would make the
            # wait below succeed immediately on the first round.
            count_before = len(driver.find_elements(By.CLASS_NAME, "trip-item"))

            driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
            load_more_button.click()

            # Wait until the click has actually added trips to the page.
            wait.until(
                lambda d: len(d.find_elements(By.CLASS_NAME, "trip-item")) > count_before
            )

        except TimeoutException:
            # No button left, or the click loaded nothing new.
            break
        except (NoSuchElementException, ElementClickInterceptedException):
            break

### 5.2. Button `Xem các đánh giá`

In [None]:
def expand_all_ratings(driver, max_clicks=30, delay_range=(0.5, 1.2)):
    """
    Open the rating panel of every trip currently on the page.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    max_clicks : int, optional
        Upper bound on successful clicks, to avoid hanging on long pages.
    delay_range : tuple, optional
        (min, max) seconds of random pause before and after each click.

    Elements that fail to scroll or click are skipped with a message.
    NOTE(review): a later cell defines another `expand_all_ratings` with a
    different signature; once that cell runs, this definition is shadowed.
    """
    scroll_js = "arguments[0].scrollIntoView({block: 'center'});"
    click_js = "arguments[0].click();"   # JS click instead of ActionChains

    waiter = WebDriverWait(driver, 10)

    try:
        # All rating icons currently present in the DOM.
        rating_icons = waiter.until(
            EC.presence_of_all_elements_located(
                (By.CLASS_NAME, "bus-rating-button")
            )
        )
        print(f"‚≠ê T√¨m th·∫•y {len(rating_icons)} icon rating.")

        done = 0
        for icon in rating_icons:
            if done >= max_clicks:
                print(f"‚èπ D·ª´ng l·∫°i sau {max_clicks} l·∫ßn click (tr√°nh treo).")
                break

            try:
                driver.execute_script(scroll_js, icon)
                time.sleep(random.uniform(*delay_range))

                driver.execute_script(click_js, icon)
                done += 1

                print(f"‚úÖ Click rating {done}/{len(rating_icons)}")
                time.sleep(random.uniform(*delay_range))
            except Exception as e:
                # Skip this icon and carry on with the next one.
                print("‚ö†Ô∏è B·ªè qua 1 icon do l·ªói:", e)

        print("üéØ Ho√†n t·∫•t m·ªü rating!")

    except Exception as e:
        print("‚ùå Kh√¥ng th·ªÉ l·∫•y danh s√°ch rating:", e)

### 5.3. Button `Tìm kiếm`

In [341]:
def click_search(driver):  
    """
    Click the search button on the page.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.

    Returns
    -------
    bool
        True when the element with class "button-search" was found and
        clicked, False when anything went wrong.
    """
    try:
        driver.find_element(By.CLASS_NAME, "button-search").click()
    except Exception:
        return False
    return True

## 6. Automate the process of filtering website data

In [342]:
def get_target_date_components(days=0):
    """
    Return the day and the month-year of the date `days` days from today.

    Parameters
    ----------
    days : int, optional
        Number of days added to the current date (default 0).

    Returns
    -------
    dict
        {'day': '15', 'month_year': '10-2025'} — the day has no zero padding,
        the month is zero-padded to two digits.
    """
    target = datetime.today() + timedelta(days=days)
    return {
        'day': str(target.day),
        'month_year': "{:02d}-{}".format(target.month, target.year),
    }

In [343]:
def set_search_filters(driver,start_city:str, destination_city:str, days=0):
    """
    Fill in the departure city, destination city and departure date.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    start_city : str
        Name of the departure city.
    destination_city : str
        Name of the destination city.
    days : int, optional
        Offset in days from today for the departure date (default 0).

    Returns
    -------
    bool
        True when the target day was found and clicked; False when the day
        is not listed in the month section or the click failed.
    """

    # Locate the departure and destination inputs.
    departure_input = driver.find_element(By.ID, 'from_input')
    destination_input = driver.find_element(By.ID, 'to_input')

    # Replace, rather than append to, whatever the inputs already contain
    # (pre-filled by the URL or left over from a previous search).
    for field, city in ((departure_input, start_city), (destination_input, destination_city)):
        field.clear()
        field.send_keys(city)

    # Open the date picker.
    driver.find_element(By.CLASS_NAME, "departure-date-select").click()
    time.sleep(1)

    # Day and month-year to pick, read by key instead of by dict order.
    target = get_target_date_components(days)
    target_day, target_month = target['day'], target['month_year']

    # The month section's element id is the "MM-YYYY" string.
    month_section = driver.find_element(By.ID, target_month)
    day_elements = month_section.find_elements(By.CSS_SELECTOR, "p.day")

    for day in day_elements:
        if day.text == target_day:
            try:
                day.click()
            except Exception:
                return False
            return True

    # No day element matched: the date was NOT selected.
    return False

# FLOW OFFICIAL

---

### check data trong database

In [None]:
# expand_all_ratings_realtime.py
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import TimeoutException
import time


def check_and_update_bus_in_db(db_connection, bus_name, overall_rating, reviewer_count):
    """
    Decide what to do with a bus company based on what the DB already holds.

    Looks the company up by name in `bus_companies` and returns:
    - "new"    when no row exists,
    - "update" when the stored reviewer_count differs from `reviewer_count`,
    - "skip"   when the reviewer counts are equal.

    `overall_rating` is accepted but not used in the comparison.
    """
    query = """
        SELECT overall_rating, reviewer_count
        FROM bus_companies
        WHERE bus_company_name = %s
        LIMIT 1;
    """
    rows = db_connection.fetch(query, (bus_name,))
    if not rows:
        return "new"

    # Only the stored reviewer count takes part in the decision.
    _, stored_count = rows[0]
    return "skip" if reviewer_count == stored_count else "update"


def click_rating_if_needed(driver, container, action_type):
    """
    Click a container's rating button when the company needs (re)crawling.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver, used for scrolling and for the click action.
    container : WebElement
        Trip container in which the rating button is searched.
    action_type : str
        Only "new" and "update" trigger a click.

    Returns
    -------
    bool
        True when the button was clicked; False when no click was needed or
        the click failed for any reason.
    """
    if action_type not in ("new", "update"):
        return False

    try:
        # By.CLASS_NAME expects the bare class name, without a leading dot.
        star_elem = WebDriverWait(container, 15).until(
            EC.element_to_be_clickable((By.CLASS_NAME, "bus-rating-button"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", star_elem)
        ActionChains(driver).move_to_element(star_elem).click().perform()
        return True
    except Exception:
        # Best-effort: report the failure to the caller instead of raising.
        return False


def expand_all_ratings(driver, max_wait: int = 15, db_connection=None):
    """
    Open rating panels only for bus companies whose data is new or changed.

    For every element with class "container" the bus name and the rating
    text ("<score> (<count>)") are read, then:
    - company not in the DB      -> click, then INSERT it;
    - reviewer count has changed -> click, then UPDATE it;
    - otherwise                  -> skip.
    Without a `db_connection` every company is treated as new and nothing is
    written to a database.

    Returns
    -------
    bool
        True when at least one rating button was clicked, False otherwise
        (including when an unexpected error occurred).
    """
    try:
        containers = WebDriverWait(driver, max_wait).until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "container"))
        )
        if not containers:
            print("‚ö†Ô∏è Kh√¥ng t√¨m th·∫•y chuy·∫øn xe n√†o.")
            return False

        clicked_any = False

        for container in containers:
            try:
                bus_name = container.find_element(By.CLASS_NAME, "bus-name").text.strip()
                rating_elem = container.find_element(By.CSS_SELECTOR, ".bus-rating span")
                rating_text = rating_elem.text.strip()
                parts = rating_text.split(" ")

                if len(parts) < 2:
                    print(f"‚ö†Ô∏è Rating kh√¥ng h·ª£p l·ªá: {rating_text}")
                    continue

                overall_rating = float(parts[0])
                reviewer_count = int(parts[1].strip("()"))

                action_type = "new"
                if db_connection:
                    action_type = check_and_update_bus_in_db(
                        db_connection, bus_name, overall_rating, reviewer_count
                    )

                if action_type in ("new", "update"):
                    # The helper takes exactly (driver, container, action_type).
                    clicked = bool(click_rating_if_needed(driver, container, action_type))

                    # Persist only when a DB connection was actually supplied.
                    if clicked and db_connection and action_type == "update":
                        update_query = """
                            UPDATE bus_companies
                            SET overall_rating = %s, reviewer_count = %s, updated_at = NOW()
                            WHERE bus_company_name = %s;
                        """
                        db_connection.execute(update_query, (overall_rating, reviewer_count, bus_name))
                    elif clicked and db_connection and action_type == "new":
                        insert_query = """
                            INSERT INTO bus_companies (bus_company_name, overall_rating, reviewer_count)
                            VALUES (%s, %s, %s);
                        """
                        db_connection.execute(insert_query, (bus_name, overall_rating, reviewer_count))

                    # Accumulate: a later failed click must not erase an earlier success.
                    clicked_any = clicked_any or clicked
                else:
                    print(f"‚è≠Ô∏è  B·ªè qua {bus_name} ‚Äî d·ªØ li·ªáu kh√¥ng thay ƒë·ªïi.")

            except Exception as e:
                print(f"‚ùå L·ªói x·ª≠ l√Ω container: {e}")
                continue

        print("üéØ Ho√†n t·∫•t ki·ªÉm tra & m·ªü ƒë√°nh gi√°.")
        return clicked_any

    except Exception as e:
        print("‚ö†Ô∏è L·ªói chung trong expand_all_ratings():", e)
        return False


## Crawl rating data

# -- Main --

In [345]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
import time

In [346]:
# Destination cities to crawl for each departure hub.
# Each city must be its own list element: two adjacent string literals are
# silently concatenated by Python into a single entry.
arrivals_HaNoi = ['H·∫£i Ph√≤ng','Ngh·ªá An','S∆°n La','H√† Giang','Qu·∫£ng Ninh','Thanh H√≥a','SaPa','Ninh B√¨nh']
arrivals_SaiGon = ['Gia Lai','B√¨nh Thu·∫≠n','Ninh Thu·∫≠n','ƒê·∫Øk L·∫Øk','Ph√∫ Y√™n','Nha Trang','B√† R·ªãa - V≈©ng T√†u']

In [347]:
# URL = 'https://vexere.com/'
# Results page opened at start-up; the query string pins date=17-10-2025.
URL = "https://vexere.com/vi-VN/ve-xe-khach-tu-sai-gon-di-nha-trang-khanh-hoa-129t23591.html?date=17-10-2025&v=8&nation=84"
# Start a Chrome session; the driver path comes from ChromeDriverManager().install().
# NOTE(review): this launches a real browser as a side effect of running the cell.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get(URL)

In [None]:
import sys
import os
import pathlib
import json
# 1. Locate the notebook's working directory (e.g. src/crawling)
notebook_path = pathlib.Path.cwd() 

# 2. Go up one level to reach the project root (src)
project_root = str(notebook_path.parent) # src/

# 3. Make the project root importable
if project_root not in sys.path:
    sys.path.append(project_root)

# 4. Show which directory was added to sys.path
print(f"Th∆∞ m·ª•c g·ªëc ƒë∆∞·ª£c th√™m v√†o sys.path: {project_root}")
print("---")

# 5. Project-local import (needs the sys.path entry above)
from database.db_connection import DatabaseManager
# --------------------------------------------------

# ===== Read the DB connection settings from JSON =====
# Two levels above the working directory, then src/database/config.json.
json_path = pathlib.Path().resolve().parent.parent /'src' / 'database' / 'config.json'

with open(json_path, "r", encoding="utf-8") as f:
    config = json.load(f)

DATABASE = config["DB_CONNECTION"]
host = DATABASE["HOST"]
port = DATABASE["PORT"]
database = DATABASE["DATABASE"]
user = DATABASE["USER"]
password = DATABASE["PASSWORD"]

# ====== Create the database manager ======
db_manager = DatabaseManager(
    host=host, port=port, user=user, database=database, password=password
)

departure_place = 'S√†i G√≤n' # departure city
days_offset = 1

day, month_year = get_target_date_components(days_offset).values()
# File names use "MM_YYYY"; convert the month-year, not the day.
month_year = month_year.replace('-', '_')

for arrival_city in arrivals_SaiGon:

    filter_success = set_search_filters(driver, start_city=departure_place, destination_city=arrival_city, days=days_offset)

    # Always defined, even when the filters could not be applied.
    search_suess = False
    if filter_success:
        search_suess = click_search(driver)
    
    # click_load_more_until_end(driver)

    # if search_suess:
    #     expand_all_ratings(driver, 15,db_manager)

    time.sleep(1.5)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    try:
        df_trips_info = extract_all_trips(soup)
    except Exception as exc:
        # Keep the raw page for debugging and report why extraction failed.
        print(f"Trip extraction failed: {exc!r}")

        with open(f'../../data/site/{departure_place}_{arrival_city}_{day}_{month_year}.html', 'w', encoding='utf-8') as f:
            f.write(soup.prettify())
        
        print("ƒê√£ l∆∞u th√†nh .html")
    else:
        # Write the CSV only when extraction succeeded, so a failure can
        # neither raise NameError nor save a stale DataFrame.
        df_trips_info.to_csv(f"../../data/raw/{departure_place}_{arrival_city}_{day}_{month_year}.csv", index=False)
        print("L·∫•y d·ªØ li·ªáu .csv th√†nh c√¥ng")

    # Only the first destination is processed: the browser is closed and the
    # loop stops after one iteration.
    driver.quit()
    break


Th∆∞ m·ª•c g·ªëc ƒë∆∞·ª£c th√™m v√†o sys.path: f:\Document\T·ªïng h·ª£p c√°c m√¥n h·ªçc\ƒê·ªì √°n DS\src
---
L·∫•y d·ªØ li·ªáu .csv th√†nh c√¥ng


In [349]:
df_trips_info

Unnamed: 0,bus_name,bus_rating,seat_type,departure_time,pick_up_point,arrival_date,arrival_time,drop_of_point,duration,original_price,sale_price,percent_discount,departure_date,start_point,destination
0,ƒê√† L·∫°t ∆°i,4.8 (3452),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,(18/10),05:30,‚Ä¢ Tr·∫°m Nha Trang,5h45m,450.0,279.0,-38%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
1,Tr·ªçng Th·ªßy Limousine,4.7 (1353),Limousine 24 ph√≤ng ƒê√¥i,21:00,‚Ä¢ Ng√£ 4 An S∆∞∆°ng,(18/10),04:15,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,7h15m,470.0,430.0,-9%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
2,Khanh Phong,4.7 (16973),Limousine 32 gi∆∞·ªùng n·∫±m (WC),22:45,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o - Qu·∫≠n 1.,(18/10),04:55,‚Ä¢ VƒÉn Ph√≤ng Nha Trang (KS M∆∞·ªùng Thanh),6h10m,320.0,300.0,-6%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
3,B√¨nh Minh T·∫£i,4.7 (3701),Limousine 32 Ph√≤ng (WC),12:15,‚Ä¢ VƒÉn Ph√≤ng Qu·∫≠n 1,,19:20,‚Ä¢ VƒÉn ph√≤ng Nha Trang,7h5m,320.0,280.0,-12%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
4,Nh·∫≠t D∆∞∆°ng - B√¨nh Minh Bus,4.9 (5872),Limousine 22 Ph√≤ng ƒê√¥i Luxury (WC),23:00,‚Ä¢ VƒÉn ph√≤ng Nguy·ªÖn C∆∞ Trinh Qu·∫≠n 1,(18/10),05:36,‚Ä¢ Vp Th√≠ch Qu·∫£ng ƒê·ª©c Nha Trang,6h36m,349.0,300.0,-14%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
5,An Anh Limousine,4.8 (8389),Limousine 34 Ph√≤ng ƒê∆°n,23:30,‚Ä¢ VƒÉn Ph√≤ng Qu·∫≠n 5,(18/10),06:00,‚Ä¢ VƒÉn ph√≤ng Nha Trang,6h30m,299.0,250.0,-16%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
6,Hu·ª≥nh Gia,4.7 (8578),Gi∆∞∆°ÃÄng nƒÉÃÄm 34 ch√¥ÃÉ (WC),22:30,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o,(18/10),05:00,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,6h30m,280.0,,,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
7,Li√™n H∆∞ng,4 (12979),Limousine 21 ph√≤ng ƒë∆°n (WC),21:45,‚Ä¢ B·∫øn xe Mi·ªÅn T√¢y (Qu·∫ßy 24),(18/10),06:30,‚Ä¢ VƒÉn ph√≤ng Nha Trang,8h45m,500.0,300.0,-40%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
8,Nam H·∫£i Limousine,4.7 (3509),Limousine 34 gi∆∞·ªùng,22:20,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o,(18/10),06:20,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,8h,300.0,270.0,-10%,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
9,Ph∆∞∆°ng Nam,4.7 (2442),Gi∆∞·ªùng n·∫±m 40 ch·ªó c√≥ toilet,22:00,‚Ä¢ VƒÉn ph√≤ng Qu·∫≠n 1,(18/10),06:52,‚Ä¢ VƒÉn Ph√≤ng Chung c∆∞ CT2,8h52m,240.0,,,"T6, 17/10/2025",S√†i G√≤n,Nha Trang - Kh√°nh H√≤a
