In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta

---

## 1. Price parsing

- `price_original`: Giá gốc (giá chưa giảm, đơn vị: VND)
- `price_discounted`: Giá khuyến mãi (giá đã giảm, đơn vị: VND)
- `percent_discount`: Phần trăm số tiền đã giảm (đơn vị: %)

<pre>
- Có giá khuyến mãi: 
    &lt;fare-sale&gt; chứa giá khuyến mãi
        &lt;fareSmall&gt; chứa giá gốc và phần trăm ưu đãi
- Không có giá khuyến mãi:
    &lt;fare&gt; chứa giá gốc
        &lt;fareSmall&gt; không chứa dữ liệu
</pre>

In [None]:
def is_regular_fare(block) -> bool:
    """
    Tell whether a trip block is priced at the regular (non-promotional) fare.

    Args:
        block: Parsed HTML node for one trip; it must expose a
            BeautifulSoup-style ``find(name, class_=...)`` method.

    Returns:
        True  -> a ``<div class="fare">`` was found (no promotional price).
        False -> no such div was found (the trip is expected to use ``fare-sale``).
    """
    return True if block.find("div", class_="fare") else False


def parse_regular_fare(block):
    """
    Extract the ticket price of a trip that has no promotion.

    Args:
        block: Parsed HTML node for one trip (BeautifulSoup-style ``find``).

    Returns:
        dict: {
            "price_original": str | None,  # stripped text of <div class="fare">
            "price_discounted": None,
            "percent_discount": None
        }
    """
    prices = {
        "price_original": None,
        "price_discounted": None,
        "percent_discount": None,
    }

    fare_tag = block.find("div", class_="fare")
    if fare_tag:
        prices["price_original"] = fare_tag.get_text(strip=True)

    return prices


def parse_discount_metadata(block):
    """
    Extract the original price and the discount percentage from the
    ``fareSmall`` div of a promotional trip.

    Each value is looked up independently, so a missing ``small`` or
    ``percent`` child yields ``None`` for that value only instead of raising.

    Args:
        block: Parsed HTML node for one trip (BeautifulSoup-style ``find``).

    Returns:
        tuple: (price_original, percent_discount); each item is the stripped
        text of the matching div, or None when the div is absent.
    """
    fare_small = block.find("div", class_="fareSmall")
    if not fare_small:
        return None, None

    def _child_text(css_class):
        # find() returns None when nothing matches, so guard before get_text().
        tag = fare_small.find("div", class_=css_class)
        return tag.get_text(strip=True) if tag else None

    return _child_text("small"), _child_text("percent")


def parse_discounted_fare(block):
    """
    Extract the ticket prices of a trip that has a promotion.

    The original price and discount percentage come from
    ``parse_discount_metadata``; the discounted price is the text of
    ``<div class="fare-sale">``.

    Args:
        block: Parsed HTML node for one trip (BeautifulSoup-style ``find``).

    Returns:
        dict: {
            "price_original": str | None,    # price before the discount
            "price_discounted": str | None,  # price after the discount
            "percent_discount": str | None   # discount percentage
        }
    """
    price_original, percent_discount = parse_discount_metadata(block)

    sale_tag = block.find("div", class_="fare-sale")
    sale_text = sale_tag.get_text(strip=True) if sale_tag else None
    # An empty fare-sale div counts as "no discounted price".
    price_discounted = sale_text.strip() if sale_text else None

    return dict(
        price_original=price_original,
        price_discounted=price_discounted,
        percent_discount=percent_discount,
    )


In [None]:
def parse_price(block):
    '''
    Extract the price data of one trip.

    Dispatches to ``parse_regular_fare`` when the block has a regular fare,
    otherwise to ``parse_discounted_fare``.

    Returns: dict with the original price, the discounted price (if any)
    and the discount percentage (if any).
    '''
    price_parser = parse_regular_fare if is_regular_fare(block) else parse_discounted_fare
    return price_parser(block)

---

## 2. Bus info parsing

In [4]:
def parser_trip_bus_info(container):
    '''
    Extract the bus-company information from one trip container.

    Every lookup is independent: a missing element produces ``None`` for that
    field only. In particular a container without a ``bus-rating`` div no
    longer raises AttributeError.

    Args:
        container: Parsed HTML node for one trip (BeautifulSoup-style ``find``).

    Returns:
        dict: {'company_name', 'bus_rating', 'seat_type'}, each value being
        stripped text or None.
    '''

    def _text_or_none(tag):
        return tag.get_text(strip=True) if tag else None

    # The rating value sits in a <span> nested inside the bus-rating div;
    # guard the outer lookup before descending.
    rating_box = container.find('div', class_='bus-rating')
    rating_tag = rating_box.find('span') if rating_box else None

    return {
        'company_name': _text_or_none(container.find('div', class_='bus-name')),
        'bus_rating': _text_or_none(rating_tag),
        'seat_type': _text_or_none(container.find('div', class_='seat-type')),
    }

---

## 3. Route parsing

In [5]:
# This data lives in the trip search filter box of the page

def parse_route_info(block):
    '''
    Extract the route information from the search filter of the page.

    The three values are looked up independently, so one missing element no
    longer discards the others.

    Args:
        block: Parsed page (BeautifulSoup-style ``find``); ``None`` is
            tolerated and yields all-None values.

    Returns:
        dict: {
            'departure_date': text of <p class="date-input-value"> or None,
            'start_point':    ``value`` attribute of #from_input or None,
            'destination':    ``value`` attribute of #to_input or None,
        }
    '''
    if block is None:
        return {'departure_date': None, 'start_point': None, 'destination': None}

    date_tag = block.find('p', class_='date-input-value')
    from_input = block.find(id="from_input")
    to_input = block.find(id="to_input")

    return {
        'departure_date': date_tag.get_text(strip=True) if date_tag else None,
        'start_point': from_input.get('value') if from_input else None,
        'destination': to_input.get('value') if to_input else None,
    }

---

## 4. Details trip info

### 4.1 Departure

In [6]:
# This data lives inside the trip container, in the 'from_content' node

def parse_departure_trip_info(from_content):
    """
    Extract the departure information from a 'from_content' node.

    Always returns a dict, including when ``from_content`` is missing, so
    the result can be merged with other dicts using ``|``.

    Args:
        from_content: Parsed HTML node (BeautifulSoup-style ``find``) or None.

    Returns:
        dict: {'departure_time', 'pickup_point'}, each value being stripped
        text or None.
    """
    departure = {'departure_time': None, 'pickup_point': None}

    # Missing node: keep the same dict shape with every value set to None.
    if not from_content:
        return departure

    hour_tag = from_content.find('div', class_='hour')
    if hour_tag:
        departure['departure_time'] = hour_tag.get_text(strip=True)

    place_tag = from_content.find('div', class_='place')
    if place_tag:
        departure['pickup_point'] = place_tag.get_text(strip=True)

    return departure

### 4.2 Arrival

In [7]:
def parse_arrival_trip_info(to_content):
    """
    Extract the arrival information from a 'to_content' node.

    Always returns a dict with the same three keys. A missing ``to_content``
    or a missing inner ``content-to-info`` div leaves the affected values as
    None instead of returning a tuple or raising UnboundLocalError.

    Args:
        to_content: Parsed HTML node (BeautifulSoup-style ``find``) or None.

    Returns:
        dict: {'arrival_date', 'arrival_time', 'dropoff_point'}, each value
        being stripped text or None.
    """
    arrival = {'arrival_date': None, 'arrival_time': None, 'dropoff_point': None}

    if not to_content:
        return arrival

    # Arrival date
    date_tag = to_content.find('span', class_="text-date-arrival-time")
    if date_tag:
        arrival['arrival_date'] = date_tag.get_text(strip=True)

    # Arrival time and drop-off point both live under content-to-info
    info_box = to_content.find('div', class_='content-to-info')
    if info_box:
        hour_tag = info_box.find('div', class_='hour')
        if hour_tag:
            arrival['arrival_time'] = hour_tag.get_text(strip=True)

        place_tag = info_box.find('div', class_='place')
        if place_tag:
            arrival['dropoff_point'] = place_tag.get_text(strip=True)

    return arrival


---

In [8]:
def parse_trip_timing(container):
    """
    Extract the schedule details of one trip: departure, arrival and duration.

    Args:
        container: Parsed HTML node for one trip (BeautifulSoup-style ``find``).

    Returns:
        dict: the departure fields, the arrival fields and 'duration'. When
        the ``from-to-content`` div is missing, a dict with every expected key
        (plus 'departure_date') set to None is returned instead.
    """
    schedule_box = container.find('div', class_="from-to-content")

    # No schedule block at all: return the placeholder structure.
    if not schedule_box:
        return dict.fromkeys(
            (
                "duration",
                "departure_time",
                "pickup_point",
                "departure_date",
                "arrival_date",
                "arrival_time",
                "dropoff_point",
            )
        )

    # Departure side
    departure_part = parse_departure_trip_info(
        schedule_box.find('div', class_='content from')
    )

    # Arrival side
    arrival_part = parse_arrival_trip_info(
        schedule_box.find('div', class_='content to')
    )

    # Travel time
    duration_node = schedule_box.find('div', class_="duration")
    if duration_node:
        travel_time = duration_node.get_text(strip=True)
    else:
        travel_time = None

    # Merge everything into one record.
    return departure_part | arrival_part | {'duration': travel_time}


In [9]:
def compile_trip_info(block):
    '''
    Gather every piece of information about one trip from its HTML block.

    Runs, in order, the bus-company parser, the schedule parser and the price
    parser, then merges their dicts (later parsers win on duplicate keys).

    Returns: dict with the bus, schedule and price fields.
    '''
    bus_part, schedule_part, price_part = (
        extract(block)
        for extract in (parser_trip_bus_info, parse_trip_timing, parse_price)
    )
    return bus_part | schedule_part | price_part


---

## Parsing rating

In [10]:
def parse_ratings_from_container(container):
    '''
    Extract the rating entries shown for one bus company in a trip container.

    Each ``<div class="rate-title">`` is expected to hold at least two ``<p>``
    tags: the first is the rating title, the second the rating score. Divs
    with fewer than two ``<p>`` tags are skipped.

    Returns: list of (rate_title, rate_point) pairs, or [(None, None)] when
    nothing was found or any error occurred.
    '''
    placeholder = [(None, None)]

    try:
        found = [
            (cells[0].get_text(strip=True), cells[1].get_text(strip=True))
            for cells in (
                box.find_all('p')
                for box in container.find_all('div', class_='rate-title')
            )
            if len(cells) >= 2
        ]
    except Exception:
        # Best effort: any parsing problem falls back to the placeholder.
        return placeholder

    return found if found else placeholder


total

In [11]:
def extract_all_trips(soup):
    '''
    Extract trip, route and bus-company rating information from a results
    page and combine it into a single DataFrame.

    One row is produced per ``<div class="container">``. Rating entries become
    extra columns named after the rating title itself (no prefix is added).
    When the page has no containers an empty DataFrame is returned instead of
    raising.

    Args:
        soup: Parsed results page (BeautifulSoup-style ``find``/``find_all``).

    Returns:
        pandas.DataFrame: one row per trip.
    '''
    route_fields = parse_route_info(soup)  # route info (from / to / date) shared by every row

    records = []
    for container in soup.find_all("div", class_="container"):
        # Trip fields first; route fields override duplicate keys
        # (e.g. 'departure_date').
        record = compile_trip_info(container) | route_fields

        # Ratings come back as [(title, point), ...]; the (None, None)
        # placeholder is filtered out here.
        record |= {
            title: point
            for title, point in parse_ratings_from_container(container)
            if title is not None and point is not None
        }
        records.append(record)

    # Build the frame once from all records: cheaper than concatenating
    # one-row frames, and well defined for an empty list.
    return pd.DataFrame(records)


---

In [12]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, TimeoutException, StaleElementReferenceException
import time, random
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## 5. Handle Button logic

### 5.1. Button `Xem thêm chuyến`

In [13]:
def click_load_more(driver, max_click=7, max_wait=15):
    """
    Click the 'load more trips' button (class ``load-more``) up to
    ``max_click`` times.

    - Clicks through JavaScript so an overlapping element cannot intercept it.
    - Stops early when the button is missing, hidden or disabled, or when
      no new ``container`` elements appear several times in a row.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    max_click : int, optional
        Maximum number of loop iterations (click attempts).
    max_wait : int, optional
        Timeout in seconds used when waiting for the button to be present.

    Returns
    -------
    int
        Number of JS clicks issued (counted even if no new container showed up).
    """
    wait = WebDriverWait(driver, max_wait)
    click_count = 0
    fail_count = 0      # consecutive failures; shared by "no new container" and generic errors
    MAX_FAIL = 3

    for _ in range(max_click):
        try:
            # Snapshot the container count so growth can be detected after the click.
            containers_before = len(driver.find_elements(By.CLASS_NAME, "container"))
            btn = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "load-more")))

            if not btn.is_displayed() or not btn.is_enabled():
                print("N√∫t 'Xem th√™m chuy·∫øn' kh√¥ng kh·∫£ d·ª•ng.")
                break

            driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
            time.sleep(random.uniform(0.8, 1.5))
            driver.execute_script("arguments[0].click();", btn)
            click_count += 1

            time.sleep(random.uniform(2.0, 3.0))

            try:
                # Wait (up to 10 s) until more containers exist than before the click.
                WebDriverWait(driver, 10).until(
                    lambda d: len(d.find_elements(By.CLASS_NAME, "container")) > containers_before
                )
                fail_count = 0
            except TimeoutException:
                fail_count += 1
                if fail_count >= MAX_FAIL:
                    print("Kh√¥ng th·∫•y container m·ªõi sau nhi·ªÅu l·∫ßn ‚Üí d·ª´ng.")
                    break

        except (TimeoutException, NoSuchElementException):
            # The button never became present within max_wait: nothing more to load.
            print("Kh√¥ng c√≤n n√∫t 'Xem th√™m chuy·∫øn' ‚Üí d·ª´ng.")
            break
        except StaleElementReferenceException:
            # The element reference went stale; retry on the next iteration.
            time.sleep(1)
            continue
        except Exception:
            fail_count += 1
            if fail_count >= MAX_FAIL:
                break
            time.sleep(2)
            continue

    print(f"Ho√†n t·∫•t ‚Äî ƒë√£ click {click_count} l·∫ßn.")
    return click_count

### 5.2. Button `Xem các đánh giá`

In [14]:
def expand_ratings(driver, click_prob=0.65, delay_range=(0.5, 1.2)):
    """
    Randomly open the rating sections (class ``bus-rating-button``) on the page.

    - There is no cap on the number of clicks; every button is a candidate.
    - Each button is skipped with probability ``1 - click_prob`` (the
      original author's intent: imitate a human visitor).

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    click_prob : float, optional
        Probability of clicking each button.
    delay_range : tuple, optional
        (min, max) seconds for the random pause before and after each click.

    Returns
    -------
    int
        Number of buttons clicked; 0 when none were found or on any error.
    """
    try:
        time.sleep(2)
        stars = driver.find_elements(By.CLASS_NAME, "bus-rating-button")
        if not stars:
            print("Kh√¥ng t√¨m th·∫•y n√∫t ƒë√°nh gi√°.")
            return 0

        total = len(stars)
        clicks = 0

        for i in range(total):
            # Random skip, decided before touching the DOM.
            if random.random() > click_prob:
                continue

            try:
                # Re-query on every iteration so the element reference is fresh.
                stars = driver.find_elements(By.CLASS_NAME, "bus-rating-button")
                if i >= len(stars):
                    break
                star = stars[i]

                if not star.is_displayed():
                    continue

                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", star)
                time.sleep(random.uniform(*delay_range))
                driver.execute_script("arguments[0].click();", star)
                clicks += 1
                time.sleep(random.uniform(*delay_range))

            except StaleElementReferenceException:
                continue
            except Exception:
                # Best effort: a failure on one button must not stop the rest.
                continue

        print(f"ƒê√£ click {clicks}/{total} n√∫t ƒë√°nh gi√°.")
        return clicks

    except Exception:
        print("Kh√¥ng th·ªÉ m·ªü ph·∫ßn ƒë√°nh gi√°.")
        return 0


### 5.3. Button `Tìm kiếm`

In [15]:
def click_search(driver, retries=3):
    """
    Click the search button (class ``button-search``) on the page.

    Waits up to 10 s for the button to be clickable, scrolls it into view and
    clicks it through JavaScript. Any exception triggers a retry after a
    1-second pause, for at most ``retries`` attempts.

    Returns:
        bool: True once a click was issued, False if every attempt failed.
    """
    waiter = WebDriverWait(driver, 10)

    for _attempt in range(retries):
        try:
            search_button = waiter.until(
                EC.element_to_be_clickable((By.CLASS_NAME, "button-search"))
            )
            driver.execute_script(
                "arguments[0].scrollIntoView({block: 'center'});", search_button
            )
            time.sleep(0.5)
            driver.execute_script("arguments[0].click();", search_button)
        except Exception:
            print("Click t√¨m ki·∫øm l·ªói, th·ª≠ l·∫°i")
            time.sleep(1)
        else:
            print("Click v√†o n√∫t t√¨m ki·∫øm th√†nh c√¥ng")
            return True
    else:
        # Loop finished without a successful click.
        print("Kh√¥ng th·ªÉ click v√†o n√∫t t√¨m ki·∫øm")
        return False

## 6. Automate the process of filtering website data

In [16]:
def get_target_date_components(days=0):
    """
    Return the day and the month-year of the date ``days`` days from today.

    Parameters
    ----------
    days : int, optional
        Number of days added to the current date (default = 0).

    Returns
    -------
    dict
        {'day': '15', 'month_year': '10-2025'}; the day is not zero-padded,
        the month is zero-padded to two digits.
    """
    target = datetime.today() + timedelta(days=days)

    return dict(
        day=str(target.day),
        month_year="{:02d}-{}".format(target.month, target.year),
    )

In [17]:
def set_search_filters(driver, start_city: str, destination_city: str, days=0):
    """
    Fill in the departure city, destination city and departure date on the
    search form.

    Parameters
    ----------
    driver : webdriver
        Selenium WebDriver controlling the browser.
    start_city : str
        Name of the departure city.
    destination_city : str
        Name of the destination city.
    days : int, optional
        Number of days from today used to pick the departure date (default = 0).

    Returns
    -------
    bool
        True if the date was selected, False if the day was not found or any
        error occurred.
    """

    wait = WebDriverWait(driver, 10)

    try:
        # Enter the departure and destination cities
        departure_input = wait.until(EC.presence_of_element_located((By.ID, 'from_input')))
        destination_input = wait.until(EC.presence_of_element_located((By.ID, 'to_input')))

        departure_input.clear()
        destination_input.clear()

        departure_input.send_keys(start_city)
        destination_input.send_keys(destination_city)
        time.sleep(0.5)

        # Open the departure-date picker
        date_btn = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "departure-date-select")))
        date_btn.click()
        time.sleep(1)

        # Day and month-year to select
        target = get_target_date_components(days)
        target_day = target['day']
        target_month = target['month_year']

        # The month block id may use '-' or '_' as separator. Only a missing
        # element triggers the fallback; a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit.
        try:
            month_section = driver.find_element(By.ID, target_month)
        except NoSuchElementException:
            month_section = driver.find_element(By.ID, target_month.replace('-', '_'))

        # All day cells of that month
        day_elements = month_section.find_elements(By.CSS_SELECTOR, "p.day")

        for day in day_elements:
            if day.text.strip() == target_day:
                driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", day)
                time.sleep(0.3)
                driver.execute_script("arguments[0].click();", day)
                return True

        print("Kh√¥ng t√¨m th·∫•y ng√†y c·∫ßn ch·ªçn.")
        return False

    except Exception as e:
        # Best effort: report the problem and let the caller skip this route.
        print(f"L·ªói khi ch·ªçn b·ªô l·ªçc t√¨m ki·∫øm: {e}")
        return False

# FLOW OFFICIAL

---

## Crawl rating data

# -- Main --

In [18]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service


In [19]:
# URL = 'https://vexere.com/'

# # Danh s√°ch tuy·∫øn
# arrivals_HaNoi = ['H√† Giang','Qu·∫£ng Ninh','Thanh H√≥a','SaPa','Ninh B√¨nh']#['H·∫£i Ph√≤ng','Ngh·ªá An','S∆°n La']
# arrivals_SaiGon = ['Gia Lai','B√¨nh Thu·∫≠n','Ninh Thu·∫≠n','ƒê·∫Øk L·∫Øk','Ph√∫ Y√™n','Nha Trang','B√† R·ªãa-V≈©ng T√†u']


# departure_city = 'H√† N·ªôi'
# days_offset = 1 #int(input(""))

# # L·∫•y ng√†y / th√°ng c·∫ßn crawl
# target_date = get_target_date_components(days_offset)
# day = target_date['day']
# month_year = target_date['month_year']
# month_years = month_year.replace('-', '_')

# # Crawl t·ª´ng tuy·∫øn
# for arrival_city in arrivals_HaNoi:

#     driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
#     driver.get(URL)

#     print(f"ƒêang crawl: {departure_city} ‚Üí {arrival_city}")

#     filter_success = set_search_filters(
#         driver,
#         start_city=departure_city,
#         destination_city=arrival_city,
#         days=days_offset
#     )

#     if not filter_success:
#         print(f"Kh√¥ng th·ªÉ ch·ªçn ng√†y cho {arrival_city}")
#         continue

#     time.sleep(2)
#     click_search(driver)    # Click t√¨m ki·∫øm sau khi l·ªçc d·ªØ li·ªáu
#     time.sleep(2.5)
#     click_load_more(driver,5) # Click xem th√™m ƒë·ªÉ th·∫•y ƒëc nhi·ªÅu chuy·∫øn xe
    
#     expand_ratings(driver, 0.6)  # M·ªü c√°c th√¥ng tin ƒë√°nh gi√° t·ª´ng chuy·∫øn
#     time.sleep(1.5)

#     # Parse d·ªØ li·ªáu HTML
#     soup = BeautifulSoup(driver.page_source, "html.parser")

#     try:
#         df_trips_info = extract_all_trips(soup)

#         save_path = f"../../data/processed/{departure_city}_{arrival_city}_{day}_{month_years}.csv"
#         df_trips_info.to_csv(save_path, index=False, encoding='utf-8')

#         print(f"L·∫•y d·ªØ li·ªáu {arrival_city} th√†nh c√¥ng! ‚Üí {save_path}")

#     except Exception as e:
#         html_path = f"../../data/site/{departure_city}_{arrival_city}_{day}_{month_years}.html"
#         with open(html_path, 'w', encoding='utf-8') as f:
#             f.write(soup.prettify()) 
#         print(f"L∆∞u HTML ƒë·ªÉ debug: {html_path}\n{e}")

# # ƒê√≥ng tr√¨nh duy·ªát sau khi ho√†n t·∫•t
#     driver.quit()

In [None]:
import time, random, os
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException, TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
import undetected_chromedriver as uc

# ====================== INITIAL CONFIGURATION ======================
URL = 'https://vexere.com/'

# Destination lists
arrivals_HaNoi = ['H√† Giang', 'Qu·∫£ng Ninh', 'Thanh H√≥a', 'SaPa', 'Ninh B√¨nh']
# arrivals_SaiGon = ['Gia Lai','B√¨nh Thu·∫≠n','Ninh Thu·∫≠n','ƒê·∫Øk L·∫Øk','Ph√∫ Y√™n','Nha Trang','B√† R·ªãa-V≈©ng T√†u']

departure_city = 'H√† N·ªôi'
days_offset = 1  # days from today to crawl; could be read with int(input(...))

# Day / month to crawl (also used in the output file names)
target_date = get_target_date_components(days_offset)
day = target_date['day']
month_year = target_date['month_year']
month_years = month_year.replace('-', '_')

print(f"\nüö¶ B·∫Øt ƒë·∫ßu crawl t·ª´ {departure_city} cho ng√†y: {day}/{month_year}")

# Resolve the chromedriver binary once for the whole run instead of on every
# route.
# NOTE(review): the recorded output of this cell shows "session not created:
# This version of ChromeDriver only supports Chrome version 142 / Current
# browser version is 141" - the downloaded driver did not match the installed
# browser. Pin the driver version or update Chrome before re-running.
driver_path = ChromeDriverManager().install()

# ====================== MAIN LOOP ======================
for arrival_city in arrivals_HaNoi:

    # --- Chrome options (rebuilt per route because the debugging port is random) ---
    options = Options()
    # options.add_argument("--headless=new")                   # run without a visible window
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-software-rasterizer")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-infobars")
    options.add_argument("--disable-popup-blocking")
    options.add_argument("--disable-notifications")
    options.add_argument("--disable-blink-features=AutomationControlled")  # original intent: reduce bot detection
    options.add_argument("--window-size=1920,1080")          # fixed viewport
    options.add_argument("--start-maximized")
    options.add_argument("--log-level=3")                    # less console logging
    options.add_argument("--disable-logging")
    options.add_argument(f"--remote-debugging-port={random.randint(9000,9999)}")  # original intent: avoid port clashes

    # --- Content settings: skip images, popups and plugins to save resources ---
    prefs = {
        "profile.managed_default_content_settings.images": 2,
        "profile.default_content_setting_values.notifications": 2,
        "profile.managed_default_content_settings.stylesheets": 1,
        "profile.managed_default_content_settings.cookies": 1,
        "profile.managed_default_content_settings.plugins": 2,
        "profile.managed_default_content_settings.popups": 2
    }
    options.add_experimental_option("prefs", prefs)
    options.page_load_strategy = "eager"  # original note: wait for the DOM only, not the full page load

    # A fresh Service per browser session, reusing the already-resolved binary.
    service = Service(driver_path)

    print(f"\nüöç ƒêang crawl: {departure_city} ‚Üí {arrival_city}")

    driver = None
    try:
        driver = webdriver.Chrome(service=service, options=options)
        driver.get(URL)

        # 1. Fill in the search filters
        filter_success = set_search_filters(
            driver,
            start_city=departure_city,
            destination_city=arrival_city,
            days=days_offset
        )

        if not filter_success:
            print(f"‚ö†Ô∏è Kh√¥ng th·ªÉ ch·ªçn ng√†y cho {arrival_city}")
            continue

        # 2. Click search; if every retry failed there is no results page to
        #    wait for (click_search already printed the failure), so skip the route.
        time.sleep(1)
        if not click_search(driver):
            continue

        # 3. Wait for the results page
        try:
            WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".bus-item"))
            )
        except TimeoutException:
            print(f"‚è∞ Kh√¥ng t√¨m th·∫•y chuy·∫øn xe n√†o cho tuy·∫øn {arrival_city}")
            continue

        # 4. Load more trips and open the rating panels
        click_load_more(driver, max_click=5)
        expand_ratings(driver, click_prob=0.75)
        time.sleep(1.5)

        # 5. Parse the HTML and save a CSV file
        soup = BeautifulSoup(driver.page_source, "html.parser")
        try:
            df_trips_info = extract_all_trips(soup)

            os.makedirs("../../data/processed", exist_ok=True)
            save_path = f"../../data/processed/{departure_city}_{arrival_city}_{day}_{month_years}.csv"
            df_trips_info.to_csv(save_path, index=False, encoding='utf-8')

            print(f"‚úÖ L·∫•y d·ªØ li·ªáu {arrival_city} th√†nh c√¥ng! ‚Üí {save_path}")

        except Exception as e_parse:
            # Keep the raw page so the parsing failure can be reproduced offline.
            os.makedirs("../../data/site", exist_ok=True)
            html_path = f"../../data/site/{departure_city}_{arrival_city}_{day}_{month_years}_error.html"
            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(soup.prettify())
            print(f"‚ùå L·ªói x·ª≠ l√Ω d·ªØ li·ªáu: {e_parse}\nƒê√£ l∆∞u HTML ƒë·ªÉ debug: {html_path}")

        finally:
            del soup

    except WebDriverException as e_driver:
        print(f"üí• L·ªói driver khi x·ª≠ l√Ω {arrival_city}: {e_driver}")

    finally:
        # Runs on success, on error and on `continue`: always release the browser.
        if driver:
            driver.quit()
        print(f"üßπ ƒê√£ ƒë√≥ng driver cho tuy·∫øn {arrival_city}")
        time.sleep(random.uniform(5, 10))  # random pause between routes (original intent: avoid IP blocking)

print("\nüéØ --- HO√ÄN T·∫§T TO√ÄN B·ªò QU√Å TR√åNH CRAWL ---")



üö¶ B·∫Øt ƒë·∫ßu crawl t·ª´ H√† N·ªôi cho ng√†y: 1/11-2025

üöç ƒêang crawl: H√† N·ªôi ‚Üí H√† Giang
üí• L·ªói driver khi x·ª≠ l√Ω H√† Giang: Message: session not created: cannot connect to chrome at 127.0.0.1:64886
from session not created: This version of ChromeDriver only supports Chrome version 142
Current browser version is 141.0.7390.123; For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#sessionnotcreatedexception
Stacktrace:
Symbols not available. Dumping unresolved backtrace:
	0x4b58c3
	0x4b5904
	0x2ce76d
	0x30b20d
	0x30a159
	0x30008f
	0x2ffeb6
	0x348993
	0x34830a
	0x33c766
	0x30dac0
	0x30ede4
	0x737974
	0x732bea
	0x4de5b4
	0x4cdd28
	0x4d4d8d
	0x4bded8
	0x4be09c
	0x4a7d1a
	0x767e5d49
	0x77c0d6db
	0x77c0d661

üßπ ƒê√£ ƒë√≥ng driver cho tuy·∫øn H√† Giang

üöç ƒêang crawl: H√† N·ªôi ‚Üí Qu·∫£ng Ninh
üí• L·ªói driver khi x·ª≠ l√Ω Qu·∫£ng Ninh: Message: session not created: cannot connect to chrome at 127.0.

KeyboardInterrupt: 