In [24]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [25]:
def fetch_html(url):
    response = requests.get(url)
    if response.status_code == 200:
        print('Fetch OK')
        return response.text
    else:
        print('Fetch ERROR')
        return None

---

### Handle price

In [26]:
def parse_price(block):
    fare_sale = block.find('div', class_='fare-sale')
    fare = block.find('div', class_='fare')
    small = block.find('div', class_='small')
    percent = block.find('div', class_='percent')

    # Discounted price
    if fare_sale:
        price_discount = fare_sale.get_text(strip=True).replace("From", "").replace("ƒë", "").strip()
    elif fare:
        price_discount = fare.get_text(strip=True).replace("ƒë", "").strip()
    else:
        price_discount = None

    # Original price
    if small and small.get_text(strip=True):
        price_original = small.get_text(strip=True).replace("ƒë", "").strip()
    else:
        price_original = price_discount

    # Discount percent
    discount_percent = percent.get_text(strip=True) if percent else None

    return [price_original, price_discount, discount_percent]

---

### Parser bus infor

In [27]:
def parse_bus_info(container):
    bus_name = container.find('div', class_='bus-name').text.strip() if container.find('div', class_='bus-name') else None
    bus_rating = container.find('div', class_='bus-rating').find('span').text.strip() if container.find('div', class_='bus-rating') and container.find('div', class_='bus-rating').find('span') else None
    seat_type = container.find('div', class_='seat-type').get_text(strip=True) if container.find('div', class_='seat-type') else None

    from_to_content = container.find('div', class_="from-to-content")
    if from_to_content:
        to_content = from_to_content.find('div', class_='content to')
        from_content = from_to_content.find('div', class_='content from')
        duration = from_to_content.find('div', class_="duration").get_text(strip=True) if from_to_content.find('div', class_="duration") else None

        # Arrival info
        date_arrival = None
        to_hour = None
        to_place = None
        if to_content:
            span = to_content.find('span', class_="text-date-arrival-time")
            date_arrival = span.get_text(strip=True) if span else None
            content_to_info = to_content.find('div', class_='content-to-info')
            if content_to_info:
                to_hour = content_to_info.find('div',class_='hour' ).get_text(strip=True) if content_to_info.find('div',class_='hour' ) else None
                to_place = content_to_info.find('div',class_='place' ).get_text(strip=True) if content_to_info.find('div',class_='place' ) else None

        # Departure info
        from_hour = from_content.find('div',class_='hour' ).get_text(strip=True) if from_content and from_content.find('div',class_='hour' ) else None
        from_place = from_content.find('div',class_='place' ).get_text(strip=True) if from_content and from_content.find('div',class_='place' ) else None
    else:
        duration = None
        date_arrival = None
        to_hour = None
        to_place = None
        from_hour = None
        from_place = None

    price_original, price_discount, discount_percent = parse_price(container)

    notification = container.find('div', class_='link')
    notification = notification.get_text(strip=True) if notification else None

    return [
        bus_name, bus_rating, seat_type,
        from_hour, from_place, duration,
        to_hour, to_place, date_arrival,
        price_original, price_discount, discount_percent, notification
    ]

#### Convert from List to DataFrame

In [28]:
def extract_all_bus_info(soup) -> list:
    containers = soup.find_all("div", class_="container")
    bus_info_list = [parse_bus_info(container) for container in containers]
    return bus_info_list

# FLOW OFFICIAL

In [29]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
import time, random

def click_load_more(driver):
    while True:
        try:
            load_more_span = driver.find_element(By.XPATH, "//span[text()='Xem th√™m chuy·∫øn']")
            load_more_button = load_more_span.find_element(By.XPATH, "./ancestor::button")
            driver.execute_script("arguments[0].scrollIntoView();", load_more_button)
            load_more_button.click()
            time.sleep(random.uniform(2, 3))
        except NoSuchElementException:
            break
        except ElementClickInterceptedException:
            time.sleep(2)

# driver = webdriver.Chrome()
# driver.get("https://vexere.com/vi-VN/ve-xe-khach-tu-sai-gon-di-nha-trang-khanh-hoa-129t23591.html?date=11-10-2025&v=6")
# click_load_more(driver)

# # After all trips are loaded, get page source
# html = driver.page_source
# soup = BeautifulSoup(html, 'html.parser')

# bus_info_list = []
# bus_seen = set()
# containers = soup.find_all("div", class_="container")
# for container in containers:
#     bus_name = container.find('div', class_='bus-name').text.strip() if container.find('div', class_='bus-name') else None
#     bus_rating = container.find('div', class_='bus-rating').find('span').text.strip() if container.find('div', class_='bus-rating') and container.find('div', class_='bus-rating').find('span') else None
#     key = (bus_name, bus_rating)
#     if key in bus_seen:
#         # Already have rating for this bus, skip opening rating
#         pass
#     else:
#         # If rating not visible, open rating (not implemented, placeholder)
#         # TODO: open rating if not visible
#         bus_seen.add(key)
#     bus_info_list.append(parse_bus_info(container))

# df_bus_info = pd.DataFrame(bus_info_list, columns=[
#     'bus_name', 'bus_rating', 'seat_type',
#     'from_hour', 'from_place', 'duration',
#     'to_hour', 'to_place', 'date_arrival',
#     'price_original', 'price_discount', 'discount_percent', 'notification'
# ])
# df_bus_info.to_csv("bus_info.csv", index=False)
# driver.quit()
# df_bus_info

---

## Crawl rating data

In [30]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.common.action_chains import ActionChains
# from bs4 import BeautifulSoup
# import time

# driver = webdriver.Chrome()
# driver.get("https://vexere.com/vi-VN/ve-xe-khach-tu-sai-gon-di-nha-trang-khanh-hoa-129t23591.html?date=11-10-2025&v=6")

# try:
#     buttons = WebDriverWait(driver, 15).until(
#         EC.presence_of_all_elements_located((By.CLASS_NAME, "bus-rating-button"))
#     )
#     for btn in buttons[:2]:
#         # Cu·ªôn ƒë·∫øn n√∫t ƒë·ªÉ ƒë·∫£m b·∫£o n√≥ trong t·∫ßm nh√¨n
#         driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
#         time.sleep(1)
#         # Click an to√†n
#         ActionChains(driver).move_to_element(btn).click().perform()
#         time.sleep(2)
#     print("‚úÖ ƒê√£ click 2 n√∫t ƒë·∫ßu ti√™n th√†nh c√¥ng.")
# except Exception as e:
#     print("‚ö†Ô∏è L·ªói khi click:", e)

# time.sleep(3)
# soup = BeautifulSoup(driver.page_source, "html.parser")

# with open("data_rating_site.html", "w", encoding="utf-8") as f:
#     f.write(soup.prettify())

# print("üíæ ƒê√£ l∆∞u file data_rating_site.html")
# driver.quit()


In [30]:
def extract_rating_from_container(container):
    try:
        ratings = []
        rate_divs = container.find_all('div', class_='rate-title')
        for rate_div in rate_divs:
            rate_ps = rate_div.find_all('p')
            if len(rate_ps) >= 2:
                rate_title = rate_ps[0].get_text(strip=True)
                rate_point = rate_ps[1].get_text(strip=True)
                ratings.append((rate_title, rate_point))
        if ratings:
            return ratings
        else:
            return [(None, None)]
    except Exception:
        return [(None, None)]

In [31]:
import pandas as pd
from bs4 import BeautifulSoup
with open("data_rating_site.html", 'r', encoding='utf-8') as f:
    html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')

# rating_list = []

# rating = soup.find_all('div', class_='rate-title')
# for r in rating:
#     rate = r.find_all('p')
#     rate_title = rate[0].get_text(strip=True)
#     rate_point = rate[1].get_text(strip=True)

#     rating_list.append((rate_title, rate_point))
# df_rating = pd.DataFrame(rating_list, columns=['rate_title', 'rate_point'])
# df_rating.T
# df_rating.pivot_table(index='rate_title',values='rate_point' ,aggfunc=len, fill_value=0)

In [35]:
bus_info_list = []
containers = soup.find_all("div", class_="container")
for container in containers:
    bus_info = parse_bus_info(container)
    ratings = extract_rating_from_container(container)  # list of (rate_title, rate_point)
    bus_name = bus_info[0]
    df_ratings = pd.DataFrame(bus_name+ratings, columns=['bus_name','rate_title', 'rate_point']).T

columns = [
    'bus_name', 'bus_rating', 'seat_type',
    'from_hour', 'from_place', 'duration',
    'to_hour', 'to_place', 'date_arrival',
    'price_original', 'price_discount', 'discount_percent', 'notification',
]
df_bus_info = pd.DataFrame(bus_info_list, columns=columns)
df_bus_info.to_csv("bus_info_with_rating.csv", index=False)

TypeError: can only concatenate str (not "list") to str

In [32]:
# # Main workflow
# URL = 'https://vexere.com/vi-VN/ve-xe-khach-tu-sai-gon-di-nha-trang-khanh-hoa-129t23591.html?date=27-09-2025&nation=84&ts=1758796742310'
# html = fetch_html(URL)
# if html:
#     save_html(html, "data_site.html")

# html_content = load_html("data_site.html")
# soup = BeautifulSoup(html_content, 'html.parser')
# bus_data = extract_all_bus_info(soup)
# df_bus_info = pd.DataFrame(bus_data, columns=[
#     'bus_name', 'bus_rating', 'seat_type',
#     'from_hour', 'from_place', 'duration',
#     'to_hour', 'to_place', 'date_arrival',
#     'price_original', 'price_discount', 'discount_percent', 'notification'
# ])

In [34]:
df = pd.read_csv("bus_info_with_rating.csv")
df

Unnamed: 0,bus_name,bus_rating,seat_type,from_hour,from_place,duration,to_hour,to_place,date_arrival,price_original,price_discount,discount_percent,notification,Ch·∫•t l∆∞·ª£ng,ƒêi·ªÉm
0,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.0,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,,
1,Hu·ª≥nh Gia,4.7 (8518),Gi∆∞∆°ÃÄng nƒÉÃÄm 38 ch√¥ÃÉ (WC),22:30,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o,6h30m,05:00,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,(12/10),280.0,250.000,-11%,L·ªô tr√¨nh: Cao t·ªëc (Long Th√†nh - D·∫ßu Gi√¢y - Pha...,,
2,B√¨nh Minh T·∫£i,4.7 (3633),Limousine 22 Ph√≤ng ƒê∆°n,22:30,‚Ä¢ VƒÉn Ph√≤ng Qu·∫≠n 1,7h5m,05:35,‚Ä¢ VƒÉn ph√≤ng Nha Trang,(12/10),350.0,T·ª´ 300.000,-14%,H∆∞·ªõng ƒëi: Cao t·ªëc (Long Th√†nh - D·∫ßu Gi√¢y - Pha...,,
3,Khanh Phong,4.7 (16812),Limousine 20 gi∆∞·ªùng ph√≤ng (WC),22:10,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o - Qu·∫≠n 1.,6h10m,04:20,‚Ä¢ VƒÉn Ph√≤ng Nha Trang (KS M∆∞·ªùng Thanh),(12/10),480.0,T·ª´ 450.000,-6%,Qu√Ω kh√°ch l∆∞u √Ω,,
4,An Anh Limousine,4.8 (8302),Limousine 34 Ph√≤ng ƒê∆°n,23:30,‚Ä¢ VƒÉn Ph√≤ng Qu·∫≠n 5,6h30m,06:00,‚Ä¢ VƒÉn ph√≤ng Nha Trang,(12/10),299.0,199.000,-33%,Ti·ªán √≠ch mi·ªÖn ph√≠,,
5,Nh·∫≠t D∆∞∆°ng - B√¨nh Minh Bus,4.9 (5820),Limousine 22 Ph√≤ng ƒê√¥i Luxury (WC),23:30,‚Ä¢ VƒÉn ph√≤ng Nguy·ªÖn C∆∞ Trinh Qu·∫≠n 1,6h36m,06:06,‚Ä¢ Vp Th√≠ch Qu·∫£ng ƒê·ª©c Nha Trang,(12/10),349.0,T·ª´ 304.000,-13%,NX xu·∫•t Hƒê VAT - H∆∞·ªõng ƒëi: Cao t·ªëc,,
6,Bus365,4.7 (164),Limousine 24 ph√≤ng ƒë√¥i,22:00,‚Ä¢ B·∫øn Xe Mi·ªÅn ƒê√¥ng M·ªõi,6h,04:00,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,(12/10),400.0,T·ª´ 289.000,-28%,V√© Metro mi·ªÖn ph√≠,,
7,Nam H·∫£i Limousine,4.7 (3456),Limousine 34 gi∆∞·ªùng,22:20,‚Ä¢ VƒÉn Ph√≤ng Ph·∫°m Ng≈© L√£o,8h,06:20,‚Ä¢ VƒÉn Ph√≤ng Nha Trang,(12/10),300.0,300.000,,L∆∞u √Ω ƒê√≥n/Tr·∫£ t·∫°i TP.HCM,,
8,Li√™n H∆∞ng,4 (12910),Limousine 21 ph√≤ng ƒë∆°n (WC),22:15,‚Ä¢ B·∫øn xe Mi·ªÅn T√¢y (Qu·∫ßy 24),8h45m,07:00,‚Ä¢ VƒÉn ph√≤ng Nha Trang,(12/10),500.0,T·ª´ 300.000,-40%,,,
9,Tr√† Lan Vi√™n,4.4 (4193),Limousine 30 Ph√≤ng ƒê∆°n (WC),22:15,‚Ä¢ Vp. Qu·∫≠n 1,7h20m,05:35,‚Ä¢ VP H√† Quang 2,(12/10),280.0,260.000,-7%,Qu√Ω kh√°ch l∆∞u √Ω gi∆∞·ªùng cu·ªëi,,


In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from bs4 import BeautifulSoup
import time, random


driver = webdriver.Chrome()
driver.get("https://vexere.com/vi-VN/ve-xe-khach-tu-sai-gon-di-nha-trang-khanh-hoa-129t23591.html?date=11-10-2025&v=6")

# click_load_more(driver)

# Open all ratings after all trips are loaded
try:
    buttons = WebDriverWait(driver, 15).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "bus-rating-button"))
    )
    for btn in buttons:
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", btn)
        time.sleep(1)
        ActionChains(driver).move_to_element(btn).click().perform()
        time.sleep(2)
        exit()
except Exception as e:
    print("‚ö†Ô∏è Error clicking rating buttons:", e)

time.sleep(3)
soup = BeautifulSoup(driver.page_source, "html.parser")

bus_info_list = []
containers = soup.find_all("div", class_="container")
for container in containers:
    bus_info = parse_bus_info(container)
    ratings = extract_rating_from_container(container)  # list of (rate_title, rate_point)
    for rate_title, rate_point in ratings:
        row = bus_info + [rate_title, rate_point]
        bus_info_list.append(row)

columns = [
    'bus_name', 'bus_rating', 'seat_type',
    'from_hour', 'from_place', 'duration',
    'to_hour', 'to_place', 'date_arrival',
    'price_original', 'price_discount', 'discount_percent', 'notification',
    'rate_title', 'rate_point'
]
df_bus_info = pd.DataFrame(bus_info_list, columns=columns)
df_bus_info.to_csv("bus_info_with_rating.csv", index=False)
driver.quit()
df_bus_info

Unnamed: 0,bus_name,bus_rating,seat_type,from_hour,from_place,duration,to_hour,to_place,date_arrival,price_original,price_discount,discount_percent,notification,rate_title,rate_point
0,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.000,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,An to√†n,4.8
1,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.000,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,Th√¥ng tin ch√≠nh x√°c,4.8
2,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.000,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,Th√¥ng tin ƒë·∫ßy ƒë·ªß,4.8
3,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.000,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,Th√°i ƒë·ªô nh√¢n vi√™n,4.8
4,ƒê√† L·∫°t ∆°i,4.8 (3416),Limousine 24 Ph√≤ng ƒê√îI,23:45,‚Ä¢ Tr·∫°m Qu·∫≠n 1,5h45m,05:30,‚Ä¢ Tr·∫°m Nha Trang,(12/10),450.000,T·ª´ 320.000,-29%,L·ªô tr√¨nh: H√†ng Xanh - Qu·∫≠n 1 - Cao t·ªëc ƒë·∫øn Cam...,Ti·ªán nghi & tho·∫£i m√°i,4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117,Tr√† Lan Vi√™n,4.4 (4193),Limousine 21 Ph√≤ng ƒê∆°n (WC),23:00,‚Ä¢ Vp. Qu·∫≠n 1,7h20m,06:20,‚Ä¢ VP H√† Quang 2,(12/10),450.000,400.000,-11%,Qu√Ω kh√°ch l∆∞u √Ω gi∆∞·ªùng cu·ªëi,Th√¥ng tin ƒë·∫ßy ƒë·ªß,4.5
118,Tr√† Lan Vi√™n,4.4 (4193),Limousine 21 Ph√≤ng ƒê∆°n (WC),23:00,‚Ä¢ Vp. Qu·∫≠n 1,7h20m,06:20,‚Ä¢ VP H√† Quang 2,(12/10),450.000,400.000,-11%,Qu√Ω kh√°ch l∆∞u √Ω gi∆∞·ªùng cu·ªëi,Th√°i ƒë·ªô nh√¢n vi√™n,4.4
119,Tr√† Lan Vi√™n,4.4 (4193),Limousine 21 Ph√≤ng ƒê∆°n (WC),23:00,‚Ä¢ Vp. Qu·∫≠n 1,7h20m,06:20,‚Ä¢ VP H√† Quang 2,(12/10),450.000,400.000,-11%,Qu√Ω kh√°ch l∆∞u √Ω gi∆∞·ªùng cu·ªëi,Ti·ªán nghi & tho·∫£i m√°i,4.3
120,Tr√† Lan Vi√™n,4.4 (4193),Limousine 21 Ph√≤ng ƒê∆°n (WC),23:00,‚Ä¢ Vp. Qu·∫≠n 1,7h20m,06:20,‚Ä¢ VP H√† Quang 2,(12/10),450.000,400.000,-11%,Qu√Ω kh√°ch l∆∞u √Ω gi∆∞·ªùng cu·ªëi,Ch·∫•t l∆∞·ª£ng d·ªãch v·ª•,4.3


: 