In [1]:
import requests
import pandas as pd
import unicodedata
import re
import os

from collections import Counter
from datetime import datetime
from bs4 import BeautifulSoup


def slugify(text):
    """
    Turn arbitrary text into a URL-friendly slug.

    Steps, in order:
    - map the Vietnamese letters 'Đ'/'đ' to 'D'/'d' (handled explicitly
      because the NFKD + ASCII step below would otherwise drop them),
    - strip the remaining diacritics via NFKD normalization followed by an
      ASCII encode that ignores anything non-ASCII,
    - lowercase the result,
    - collapse every run of characters outside [a-z0-9] into a single hyphen
      and trim hyphens from both ends.

    Example: 'Đà Nẵng' -> 'da-nang'.
    """
    d_stroke_table = str.maketrans({'Đ': 'D', 'đ': 'd'})
    decomposed = unicodedata.normalize('NFKD', text.translate(d_stroke_table))
    ascii_only = decomposed.encode('ASCII', 'ignore').decode('utf-8')
    hyphenated = re.sub(r'[^a-z0-9]+', '-', ascii_only.lower())
    return hyphenated.strip('-')


def format_api_date(date_str):
    """
    Re-express a 'DD-MM-YYYY' date string in the form the API expects.

    The output is midnight of that day with a fixed '+07:00' offset suffix,
    i.e. 'YYYY-MM-DDT00:00:00+07:00'.

    Raises:
        ValueError: if the input cannot be parsed as 'DD-MM-YYYY'.
    """
    api_pattern = "%Y-%m-%dT00:00:00+07:00"
    try:
        # Parsing and formatting both stay inside the try so any ValueError
        # from either step is reported with the same user-facing message.
        return datetime.strptime(date_str, "%d-%m-%Y").strftime(api_pattern)
    except ValueError:
        raise ValueError("Invalid date format. Use 'DD-MM-YYYY'.")


def get_bus_trip_count(url, timeout=30):
    """
    Scrape the trip count displayed on a route listing page.

    The page is fetched with an HTTP GET and parsed with BeautifulSoup; the
    count is taken from the first element whose class is
    "text-result-number", using the first whitespace-separated token of its
    text.

    :param url: Address of the listing page.
    :param timeout: Seconds to wait for the server before giving up, so a
        stalled connection cannot hang the notebook indefinitely.
    :return:
        - None if the request fails (transport error or non-200 status);
          a message is printed in that case.
        - 0 if the page has no "text-result-number" element, or the element
          contains no text.
        - otherwise the first token of the element's text, as a *string*
          (kept as-is for backward compatibility; callers that need a number
          must convert it themselves).
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat transport failures like a bad status code instead of crashing.
        print(f"Failed to retrieve the page: {exc}")
        return None

    if response.status_code != 200:
        print("Failed to retrieve the page.")
        return None

    soup = BeautifulSoup(response.text, "html.parser")

    # Find the element with class "text-result-number"
    result_element = soup.find(class_="text-result-number")
    if result_element is None:
        return 0

    tokens = result_element.get_text(strip=True).split()
    if not tokens:
        # Element exists but is blank: indexing tokens[0] would raise IndexError.
        return 0
    return tokens[0]


def get_total(from_destination, to_destination, date="27-02-2025", mapping_csv="mapping.csv"):
    """
    Look up the scraped trip count for one route on one date.

    Both city names are slugified and matched against the 'slug' column of
    the mapping CSV (columns used here: 'id' and 'slug'). The matched ids
    are combined into a route code of the form '1<from_id>t1<to_id>1', which
    is embedded in the listing-page URL together with the two slugs and the
    date, and that URL is handed to get_bus_trip_count.

    :param from_destination: Departure city name.
    :param to_destination: Destination city name.
    :param date: Date string appended verbatim as the 'date' query parameter.
    :param mapping_csv: Path to the city mapping CSV file.
    :return: Whatever get_bus_trip_count returns for the built URL, or None
        (after printing an error) when either city has no matching slug.
    """
    mapping = pd.read_csv(mapping_csv)

    def rows_for(city_name):
        # All mapping rows whose slug equals the slugified city name.
        return mapping[mapping['slug'] == slugify(city_name)]

    origin_rows = rows_for(from_destination)
    target_rows = rows_for(to_destination)

    if origin_rows.empty or target_rows.empty:
        print("Error: One or both destinations not found in mapping CSV.")
        return None

    # Only the first match of each lookup is used.
    origin = origin_rows.iloc[0]
    target = target_rows.iloc[0]
    from_id, from_slug = origin['id'], origin['slug']
    to_id, to_slug = target['id'], target['slug']

    route_code = f"1{from_id}t1{to_id}1"
    url = (
        f"https://vexere.com/vi-VN/ve-xe-khach-tu-{from_slug}-di-{to_slug}-"
        f"{route_code}.html?date={date}"
    )

    return get_bus_trip_count(url)



def get_city_id(city_df, city_name):
    """
    Retrieve a city's ID from the mapping DataFrame based on the city name.

    The name is slugified and compared with the 'slug' column; the first
    matching row wins. The ID is read from the 'id' column by name (the same
    column get_total uses), so a CSV that carries an extra leading column
    (e.g. a saved index) still yields the real ID. If the frame has no 'id'
    column, the first column is used as before.

    :param city_df: Mapping DataFrame; must contain a 'slug' column.
    :param city_name: Human-readable city name.
    :return: The ID value of the first matching row.
    :raises ValueError: If no row's slug matches the city name.
    """
    matches = city_df[city_df['slug'] == slugify(city_name)]
    if matches.empty:
        raise ValueError(f"City '{city_name}' not found in the mapping file.")

    if 'id' in matches.columns:
        return matches['id'].iloc[0]
    # Legacy layout: no 'id' header, ID assumed to be in the first column.
    return matches.iloc[0, 0]



def fetch_bus_data(token, from_id, to_id, date, pagesize=100000, timeout=30):
    """
    Fetch bus data from the API for a given route and date.

    Pages are requested one after another (page = 1, 2, ...) until a page
    comes back with no records. If a request fails (transport error or
    non-200 status) a message is printed and whatever was collected so far
    is returned.

    :param token: Bearer token sent in the Authorization header.
    :param from_id: ID of the departure location ('filter[from]').
    :param to_id: ID of the destination location ('filter[to]').
    :param date: Travel date in 'DD-MM-YYYY' format.
    :param pagesize: Number of records requested per page.
    :param timeout: Seconds to wait for each response, so a stalled
        connection cannot hang the crawl indefinitely.
    :return: List of the records found under the 'data' key of every page.
    :raises ValueError: If `date` is not in 'DD-MM-YYYY' format.
    """
    api_url = "https://internal-vroute-cmc.vexere.com/v2/route"
    headers = {"Authorization": f"Bearer {token}"}

    # Everything except the page number is identical for every request, so
    # build it (and validate/convert the date) once, outside the loop.
    base_params = {
        "filter[from]": from_id,
        "filter[to]": to_id,
        "filter[date]": format_api_date(date),
        "filter[online_ticket]": 0,
        "filter[is_promotion]": 0,
        "filter[covid_utility]": 0,
        "filter[speaking_english_utility]": 0,
        "filter[enabled_gps]": 0,
        "filter[has_cop]": 0,
        "filter[online_reserved]": 0,
        "filter[suggestion]": "DEFAULT",
        "filter[fare][min]": 0,
        "filter[fare][max]": 2000000,
        "filter[available_seat][min]": 1,
        "filter[available_seat][max]": 50,
        "filter[rating][min]": 0,
        "filter[rating][max]": 5,
        "filter[limousine]": 0,
        "filter[has_unfixed_point]": 0,
    }

    all_data = []
    page = 1
    while True:
        query_params = {**base_params, "page": page, "pagesize": pagesize}

        try:
            response = requests.get(
                api_url, headers=headers, params=query_params, timeout=timeout
            )
        except requests.RequestException as exc:
            # Same best-effort policy as a bad status code: report and stop.
            print(f"Failed to retrieve data for {date} on page {page}. Error: {exc}")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve data for {date} on page {page}. Status Code: {response.status_code}")
            break

        data = response.json().get('data', [])
        if not data:
            # No more records to fetch
            break

        all_data.extend(data)
        page += 1

    return all_data



def process_bus_data(data):
    """
    Count how many route records belong to each bus company.

    Each record is expected to be a dict whose 'company' entry is itself a
    dict with a 'name' key. Records that do not fit that shape (no 'company'
    key, 'company' is None / not a dict, or no 'name' inside it) are skipped
    instead of raising, so a single malformed record cannot abort the run.

    :param data: Iterable of route records; None is treated as empty.
    :return: collections.Counter mapping company name -> number of records.
    """
    names = []
    for route in data or []:
        company = route.get('company') if isinstance(route, dict) else None
        if isinstance(company, dict) and 'name' in company:
            names.append(company['name'])
    return Counter(names)


def group_and_sum(dataframe):
    """
    Collapse duplicate companies into one row each, adding up their counts.

    Parameters:
        dataframe (pd.DataFrame): Frame with at least the columns
            'Company Name' and 'Count'.

    Returns:
        pd.DataFrame: One row per distinct 'Company Name' (kept as a regular
        column, not the index) with the summed 'Count'.
    """
    per_company = dataframe.groupby('Company Name', as_index=False)
    count_totals = per_company['Count'].sum()
    return count_totals


def get_bus_count(token, from_destination, to_destination, dates=None, mapping_csv="mapping.csv"):
    """
    Get the number of bus trips from a source to a destination for multiple dates.

    For every date the route records are fetched and counted per company.
    The per-date tables are stacked, a "Total" row is appended, and the
    result is written to "result/<from>_<to>_all_dates.csv". Nothing is
    written when no date yields any data.

    :param token: API authentication token
    :param from_destination: Departure city
    :param to_destination: Destination city
    :param dates: List of 'DD-MM-YYYY' dates (default: ["27-02-2025"]).
        A single date string is accepted and treated as a one-item list.
    :param mapping_csv: Path to the city mapping CSV file
    :return: None if the mapping file is missing or a city is not found in
        it (an error is printed). Otherwise a 4-tuple
        (from_destination, to_destination, total_count, group_and_sum_df)
        where total_count is the sum of all counts over all dates and
        group_and_sum_df holds one row per company with its summed count
        (an empty frame with the same columns when there was no data).
    """
    if dates is None:
        dates = ["27-02-2025"]
    elif isinstance(dates, str):
        # A bare string would otherwise be iterated character by character.
        dates = [dates]

    # Load city mapping file
    try:
        city_df = pd.read_csv(mapping_csv)
        from_id = get_city_id(city_df, from_destination)
        to_id = get_city_id(city_df, to_destination)
    except (FileNotFoundError, ValueError) as e:
        print(f"Error: {e}")
        return None

    # Ensure result directory exists
    result_dir = "result"
    os.makedirs(result_dir, exist_ok=True)

    columns = ["Company Name", "Count"]
    per_date_frames = []
    total_count = 0

    for date in dates:
        company_counts = process_bus_data(fetch_bus_data(token, from_id, to_id, date))
        if not company_counts:
            print(f"No data available for {date}.")
            continue

        # most_common() orders by count, highest first.
        df_counts = pd.DataFrame(company_counts.most_common(), columns=columns)
        total_count += df_counts["Count"].sum()
        per_date_frames.append(df_counts)

    if per_date_frames:
        # Merge all dates into one DataFrame
        final_df = pd.concat(per_date_frames, ignore_index=True)
        group_and_sum_df = group_and_sum(final_df)

        total_row = pd.DataFrame([["Total", total_count]], columns=columns)
        final_file = os.path.join(
            result_dir, f"{from_destination}_{to_destination}_all_dates.csv"
        )
        pd.concat([final_df, total_row], ignore_index=True).to_csv(final_file, index=False)
    else:
        print("No data to save.")
        # Previously this name was never bound on this path, so the return
        # below raised UnboundLocalError for routes without any trips.
        group_and_sum_df = pd.DataFrame({
            "Company Name": pd.Series(dtype="object"),
            "Count": pd.Series(dtype="int64"),
        })

    return from_destination, to_destination, total_count, group_and_sum_df


def get_best_20(df, top_n=20):
    """
    Pick the busiest direction of every route and return the top routes.

    A route is direction-independent: (A, B) and (B, A) are the same route.
    For each route only the row with the highest 'total count' is kept, and
    the kept rows are returned sorted by 'total count', highest first.

    Side effect (kept for backward compatibility): a 'route' column holding
    the sorted (city, city) tuple is added to the *input* frame in place.

    :param df: DataFrame with columns 'from destination', 'to destination'
        and 'total count'.
    :param top_n: Maximum number of routes to return (default 20).
    :return: DataFrame with at most `top_n` rows, including the 'route'
        column. Empty input yields an empty result.
    """
    # Create a normalized route key by sorting the two destinations.
    routes = [
        tuple(sorted(pair))
        for pair in zip(df['from destination'], df['to destination'])
    ]
    df['route'] = pd.Series(routes, index=df.index, dtype=object)

    if df.empty:
        return df.head(0)

    # Coerce to a numeric dtype first: a frame filled row by row may hold the
    # counts as generic objects, which idxmax may not accept.
    counts = pd.to_numeric(df['total count'])

    # For each route, the index label of the row with the highest count.
    best_rows = counts.groupby(df['route']).idxmax()

    return (
        df.loc[best_rows]
        .sort_values('total count', ascending=False)
        .head(top_n)
    )



In [6]:
import itertools

city_df = pd.read_csv("mapping.csv")
city_list = ["Đà Nẵng", "Quảng Nam", "Quảng Ngãi", "Bình Định", "Phú Yên", "Khánh Hòa", "Ninh Thuận", "Bình Thuận", "Kon Tum", "Gia Lai", "Đắk Lắk", "Đắk Nông", "Lâm Đồng", "Bình Phước", "Bình Dương", "Đồng Nai", "Tây Ninh", "Bà Rịa - Vũng Tàu", "Hồ Chí Minh", "Long An", "Tiền Giang", "Bến Tre", "Trà Vinh", "Vĩnh Long", "Đồng Tháp", "An Giang", "Kiên Giang", "Cần Thơ", "Hậu Giang", "Sóc Trăng", "Bạc Liêu", "Cà Mau"]

# Every ordered (from, to) pair among the first five cities. permutations()
# never pairs a position with itself, so no (city, city) filtering is needed.
pairs = itertools.permutations(city_list[:5], 2)
date = ["01-03-2025"]

# SECURITY: the API token must not be stored in the notebook. Read it from the
# environment instead; a token that was ever committed should be revoked.
token = os.environ.get("VEXERE_API_TOKEN")
if not token:
    raise RuntimeError("Set the VEXERE_API_TOKEN environment variable to a valid API token.")

route_rows = []       # one (from, to, total) record per successfully queried route
company_frames = []   # per-route company counts, merged after the loop
for from_city, to_city in pairs:
    outcome = get_bus_count(token, from_city, to_city, date)
    if outcome is None:
        # get_bus_count already printed why the lookup failed; skip this pair
        # instead of failing on tuple unpacking.
        continue
    from_name, to_name, total, company_counts = outcome
    route_rows.append((from_name, to_name, total))
    if not company_counts.empty:
        company_frames.append(company_counts)

# Build the frames once from the collected records rather than growing a
# DataFrame row by row (which is slow and leaves generic object dtypes).
df = pd.DataFrame(route_rows, columns=['from destination', 'to destination', 'total count'])
df = df.sort_values(by='total count', ascending=False, ignore_index=True)

if company_frames:
    results = pd.concat(company_frames, ignore_index=True)
    results = group_and_sum(results).sort_values(by='Count', ascending=False, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty summary.
    results = pd.DataFrame(columns=['Company Name', 'Count'])



Unnamed: 0,Company Name,Count
0,Hà Thảo,270
1,Kim Anh,180
2,Trâm Jet,120
3,Gia Phát Car,102
4,Hoàng Phúc,81
5,Tân Kim Chi,76
6,Vy Trần Car,60
7,Thuận Thảo,59
8,Diên Hồng (Đà Nẵng),57
9,Viet Nam Travel Bus,57


In [8]:
# Display the per-route totals built by the crawl cell (one row per
# from/to pair, sorted by 'total count' in descending order).
df

Unnamed: 0,from destination,to destination,total count
0,Quảng Ngãi,Đà Nẵng,222
1,Đà Nẵng,Quảng Ngãi,212
2,Quảng Nam,Đà Nẵng,194
3,Quảng Ngãi,Quảng Nam,171
4,Quảng Nam,Quảng Ngãi,166
5,Đà Nẵng,Quảng Nam,165
6,Quảng Nam,Bình Định,112
7,Bình Định,Quảng Nam,81
8,Phú Yên,Quảng Nam,63
9,Đà Nẵng,Bình Định,60


  from destination to destination  total count                    route
0       Quảng Ngãi        Đà Nẵng          238    (Quảng Ngãi, Đà Nẵng)
1          Đà Nẵng     Quảng Ngãi          216    (Quảng Ngãi, Đà Nẵng)
2        Quảng Nam        Đà Nẵng          203     (Quảng Nam, Đà Nẵng)
3       Quảng Ngãi      Quảng Nam          181  (Quảng Nam, Quảng Ngãi)
4          Đà Nẵng      Quảng Nam          179     (Quảng Nam, Đà Nẵng)
5        Quảng Nam     Quảng Ngãi          161  (Quảng Nam, Quảng Ngãi)
