In [1]:
!pip install requests
!pip install pymongo



In [2]:
import requests
import pandas as pd
import json
import time  # Để delay tránh spam

# Đọc CSV (cấu trúc: Route_Number, Route_Name, URL)
df_routes = pd.read_csv('tuyen_xe_buyt.csv')
print(f"Đọc CSV OK: {len(df_routes)} routes")

# Base URL API
base_url = "https://api.xebuyt.net/businfo/"

# Dict để lưu data: {route_num: {vars: [], stops: {var_id: []}, paths: {var_id: {lat: [], lng: []}}}}
all_data = {}

for index, row in df_routes.iterrows():
    route_num = row['Route_Number'].zfill(2)  # Đảm bảo '01' thay vì 1
    route_name = row['Route_Name']
    print(f"\n--- Crawling tuyến {route_num} - {route_name} ---")

    try:
        # API 1: Get vars by route (e.g., /getvarsbyroute/1_1 cho tuyến 01)
        vars_url = f"{base_url}getvarsbyroute/{route_num}_1"
        response_vars = requests.get(vars_url, timeout=10)
        if response_vars.status_code != 200:
            print(f"Lỗi fetch vars {vars_url}: {response_vars.status_code}")
            continue
        vars_data = response_vars.json()
        print(f"Vars extracted: {len(vars_data)} variants")
        all_data[route_num] = {'vars': vars_data, 'stops': {}, 'paths': {}}

        # Loop qua từng variant (thường 2: đi và về)
        for var in vars_data:
            var_id = var['RouteVarId']  # '1' cho đi, '2' cho về
            direction = var['RouteVarName']
            print(f"  Processing variant {var_id}: {direction}")

            # API 2: Get stops by var (e.g., /getstopsbyvar/1_1/1)
            stops_url = f"{base_url}getstopsbyvar/{route_num}_1/{var_id}"
            response_stops = requests.get(stops_url, timeout=10)
            if response_stops.status_code == 200:
                stops_data = response_stops.json()
                print(f"    Stops extracted: {len(stops_data)} stops")
                all_data[route_num]['stops'][var_id] = stops_data
            else:
                print(f"    Lỗi fetch stops {stops_url}: {response_stops.status_code}")

            # API 3: Get paths by var (e.g., /getpathsbyvar/1_1/1)
            paths_url = f"{base_url}getpathsbyvar/{route_num}_1/{var_id}"
            response_paths = requests.get(paths_url, timeout=10)
            if response_paths.status_code == 200:
                paths_data = response_paths.json()
                print(f"    Paths extracted: {len(paths_data.get('lat', []))} points")
                all_data[route_num]['paths'][var_id] = paths_data
            else:
                print(f"    Lỗi fetch paths {paths_url}: {response_paths.status_code}")

        # time.sleep(1)  # Delay
    except Exception as e:
        print(f"Lỗi tổng {route_num}: {e}")
        continue

# Lưu full data vào JSON
with open('api_crawl_data.json', 'w', encoding='utf-8') as f:
    json.dump(all_data, f, ensure_ascii=False, indent=4)
print("\nLưu data full vào api_crawl_data.json")

# Nếu muốn CSV flat cho stops (dễ xem)
stops_list = []
for route_num, data in all_data.items():
    for var_id, stops in data['stops'].items():
        for stop in stops:
            stops_list.append({
                'Route_Number': route_num,
                'RouteVarId': var_id,
                'Stop_Name': stop['Name'],
                'Stop_Code': stop['Code'],
                'Lat': stop['Lat'],
                'Lng': stop['Lng'],
                'Address': f"{stop['AddressNo']} {stop['Street']}",
                'Routes': stop['Routes'],
                'Status': stop['Status'],
                'StopType': stop['StopType']
            })

df_stops = pd.DataFrame(stops_list).drop_duplicates()
df_stops.to_csv('stops_crawl.csv', index=False, encoding='utf-8-sig')
print(f"Lưu stops vào stops_crawl.csv (Tổng {len(df_stops)} records)")

# Tương tự cho paths nếu cần, nhưng paths là coords dài, nên giữ JSON
print("Crawl xong! Check files để xem data.")

Đọc CSV OK: 140 routes

--- Crawling tuyến 01 - Bến Thành- BX Chợ Lớn ---
Vars extracted: 2 variants
  Processing variant 1: Lượt đi: Bến Thành - BX Chợ Lớn
    Stops extracted: 29 stops
    Paths extracted: 99 points
  Processing variant 2: Lượt về: BX Chợ Lớn - Bến Thành
    Stops extracted: 35 stops
    Paths extracted: 141 points

--- Crawling tuyến 02 - Bến Thành- BX Miền Tây ---
Vars extracted: 2 variants
  Processing variant 3: Lượt đi: Bến Thành - BX Miền Tây
    Stops extracted: 43 stops
    Paths extracted: 167 points
  Processing variant 4: Lượt về: BX Miền Tây - Bến Thành
    Stops extracted: 37 stops
    Paths extracted: 166 points

--- Crawling tuyến 03 - Bến Thành- Thạnh Lộc ---
Vars extracted: 2 variants
  Processing variant 5: Lượt đi: Bến Thành - Thạnh Lộc
    Stops extracted: 48 stops
    Paths extracted: 203 points
  Processing variant 6: Lượt về: Thạnh Lộc - Bến Thành
    Stops extracted: 48 stops
    Paths extracted: 193 points

--- Crawling tuyến 04 - Bến Thành- 

In [3]:
import requests
import pandas as pd
import time

# Đọc CSV
try:
    df_routes = pd.read_csv('tuyen_xe_buyt.csv')
    print(f"Đọc CSV OK: {len(df_routes)} routes")
except Exception as e:
    print(f"Lỗi đọc CSV: {e}")
    exit()

# Set của Route_Number để check khớp
route_numbers = set(df_routes['Route_Number'].astype(str))
print(f"Route numbers in CSV: {route_numbers}")

base_url = "https://api.xebuyt.net/businfo/"
stops_list = []
max_x = 200  # Max ID thử (mày có thể giảm xuống 100, 150 nếu biết range)

for x in range(1, max_x + 1):
    print(f"\n--- Trying X={x} ---")

    try:
        # API 1: Get vars
        vars_url = f"{base_url}getvarsbyroute/{x}_1"
        response_vars = requests.get(vars_url, timeout=10)
        if response_vars.status_code != 200:
            print(f"Lỗi fetch vars {vars_url}: {response_vars.status_code}")
            continue
        vars_data = response_vars.json()
        if not vars_data:  # Response rỗng
            print(f"No data for X={x}")
            continue

        # Check RouteId khớp với CSV
        route_id = vars_data[0].get('RouteId', '')
        if route_id not in route_numbers:
            print(f"RouteId {route_id} không trong CSV, skip")
            continue

        # Tìm route_name từ CSV
        route_name = df_routes[df_routes['Route_Number'] == route_id]['Route_Name'].iloc[0]
        print(f"Vars extracted for {route_id} - {route_name}: {len(vars_data)} variants")
        print("  Sample vars:")
        for var in vars_data:
            print(f"    ID {var['RouteVarId']}: {var['RouteVarName']} (Thời gian: {var['RunningTime']}p, Khoảng cách: {var['Distance']}m)")

        # Loop qua variants
        for var in vars_data:
            var_id = var['RouteVarId']
            direction = var['RouteVarName']
            print(f"  Processing variant {var_id}: {direction}")

            # API 2: Get stops
            stops_url = f"{base_url}getstopsbyvar/{x}_1/{var_id}"
            response_stops = requests.get(stops_url, timeout=10)
            if response_stops.status_code == 200:
                stops_data = response_stops.json()
                print(f"    Stops extracted: {len(stops_data)} stops")
                for stop in stops_data:
                    stops_list.append({
                        'Route_Number': route_id,
                        'RouteVarId': var_id,
                        'Direction': direction,
                        'Stop_Name': stop['Name'],
                        'Stop_Code': stop['Code'],
                        'Lat': stop['Lat'],
                        'Lng': stop['Lng'],
                        'Address': f"{stop['AddressNo']} {stop['Street']}",
                        'Routes': stop['Routes'],
                        'Status': stop['Status'],
                        'StopType': stop['StopType']
                    })
            else:
                print(f"    Lỗi fetch stops {stops_url}: {response_stops.status_code}")

            # API 3: Get paths
            paths_url = f"{base_url}getpathsbyvar/{x}_1/{var_id}"
            response_paths = requests.get(paths_url, timeout=10)
            if response_paths.status_code == 200:
                paths_data = response_paths.json()
                print(f"    Paths extracted: {len(paths_data.get('lat', []))} points")
            else:
                print(f"    Lỗi fetch paths {paths_url}: {response_paths.status_code}")

        time.sleep(1)
    except Exception as e:
        print(f"Lỗi tổng X={x}: {e}")
        continue

# Tạo DF
df_stops = pd.DataFrame(stops_list).drop_duplicates()

# Preview top 20
print("\n=== PREVIEW TOP 20 STOPS ===")
print(df_stops.head(20).to_string(index=False))

# In ~100 rows
print("\n=== DATAFRAME (KHOẢNG 100 ROWS) ===")
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)
print(df_stops.to_string(index=False))
pd.reset_option('all')

# Lưu CSV
df_stops.to_csv('stops_crawl.csv', index=False, encoding='utf-8-sig')
print(f"\nCrawl xong! Tổng {len(df_stops)} records unique, lưu tại stops_crawl.csv")

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m
          32          1                        Lượt đi: Bến xe Miền Tây - BÃI HẬU CẦN SỐ 1                                Trạm Đăng Kiểm       Q11 072       10.770437      106.645073                                                432 Lạc Long Quân                                                                                                          145, 148, 38      Đang khai thác  Trụ dừng
          32          1                        Lượt đi: Bến xe Miền Tây - BÃI HẬU CẦN SỐ 1                                         Âu Cơ       Q11 073       10.773831      106.647461                                                612 Lạc Long Quân                                                                                                          145, 148, 38      Đang khai thác   Nhà chờ
          32          1                        Lượt đi: Bến xe Miền Tây - BÃI HẬU CẦN SỐ 1                                  Ngã Tư Âu Cơ 

  pd.reset_option('all')
  pd.reset_option('all')


In [4]:
import requests

base_url = "https://api.xebuyt.net/businfo/getvarsbyroute/{}_1"

for route_id in range(1, 201):  # chạy từ 1 đến 200
    url = base_url.format(route_id)
    try:
        response = requests.get(url, timeout=5)  # timeout 5 giây

        if response.status_code == 200:
            print(f"[OK] RouteId {route_id} ✅")
        else:
            print(f"[FAIL] RouteId {route_id} ❌ (HTTP {response.status_code})")

    except requests.exceptions.RequestException as e:
        print(f"[ERROR] RouteId {route_id} ⚠️ ({e})")


[OK] RouteId 1 ✅
[OK] RouteId 2 ✅
[OK] RouteId 3 ✅
[OK] RouteId 4 ✅
[OK] RouteId 5 ✅
[OK] RouteId 6 ✅
[OK] RouteId 7 ✅
[OK] RouteId 8 ✅
[OK] RouteId 9 ✅
[OK] RouteId 10 ✅
[OK] RouteId 11 ✅
[OK] RouteId 12 ✅
[OK] RouteId 13 ✅
[OK] RouteId 14 ✅
[OK] RouteId 15 ✅
[OK] RouteId 16 ✅
[OK] RouteId 17 ✅
[OK] RouteId 18 ✅
[OK] RouteId 19 ✅
[OK] RouteId 20 ✅
[OK] RouteId 21 ✅
[OK] RouteId 22 ✅
[OK] RouteId 23 ✅
[OK] RouteId 24 ✅
[OK] RouteId 25 ✅
[OK] RouteId 26 ✅
[OK] RouteId 27 ✅
[OK] RouteId 28 ✅
[OK] RouteId 29 ✅
[OK] RouteId 30 ✅
[OK] RouteId 31 ✅
[OK] RouteId 32 ✅
[OK] RouteId 33 ✅
[OK] RouteId 34 ✅
[OK] RouteId 35 ✅
[OK] RouteId 36 ✅
[OK] RouteId 37 ✅
[OK] RouteId 38 ✅
[OK] RouteId 39 ✅
[OK] RouteId 40 ✅
[OK] RouteId 41 ✅
[OK] RouteId 42 ✅
[OK] RouteId 43 ✅
[OK] RouteId 44 ✅
[OK] RouteId 45 ✅
[OK] RouteId 46 ✅
[OK] RouteId 47 ✅
[OK] RouteId 48 ✅
[OK] RouteId 49 ✅
[OK] RouteId 50 ✅
[OK] RouteId 51 ✅
[OK] RouteId 52 ✅
[OK] RouteId 53 ✅
[OK] RouteId 54 ✅
[OK] RouteId 55 ✅
[OK] RouteId 56 ✅
[

In [5]:
import requests

base_url = "https://api.xebuyt.net/businfo/getvarsbyroute/{}_1"

have_data = []
no_data = []

for route_id in range(1, 201):
    url = base_url.format(route_id)
    print("=" * 80)
    print(f"🔹 Đang request RouteId {route_id}: {url}")

    try:
        response = requests.get(url, timeout=8)
        print(f"HTTP {response.status_code}")

        # In toàn bộ nội dung phản hồi
        print(response.text)

        if response.status_code == 200 and response.text.strip() not in ["", "null", "[]", "{}"]:
            have_data.append(route_id)
        else:
            no_data.append(route_id)

    except requests.exceptions.RequestException as e:
        print(f"⚠️ Lỗi khi request RouteId {route_id}: {e}")
        no_data.append(route_id)

print("\n" + "=" * 80)
print("✅ TỔNG KẾT")
print(f"Có dữ liệu ({len(have_data)}): {have_data}")
print(f"Không có dữ liệu hoặc lỗi ({len(no_data)}): {no_data}")


🔹 Đang request RouteId 1: https://api.xebuyt.net/businfo/getvarsbyroute/1_1
HTTP 200



[{"Distance":"7680","EndStop":"B\u1ebfn xe Ch\u1ee3 L\u1edbn","Outbound":"true","RouteId":"1","RouteNo":"1","RouteVarId":"1","RouteVarName":"L\u01b0\u1ee3t \u0111i: B\u1ebfn Th\u00e0nh - BX Ch\u1ee3 L\u1edbn","RouteVarShortName":"BX Ch\u1ee3 L\u1edbn","RunningTime":"35","StartStop":"C\u00f4ng Tr\u01b0\u1eddng M\u00ea Linh"},{"Distance":"9534","EndStop":"C\u00f4ng Tr\u01b0\u1eddng M\u00ea Linh","Outbound":"false","RouteId":"1","RouteNo":"1","RouteVarId":"2","RouteVarName":"L\u01b0\u1ee3t v\u1ec1: BX Ch\u1ee3 L\u1edbn - B\u1ebfn Th\u00e0nh","RouteVarShortName":"B\u1ebfn Th\u00e0nh","RunningTime":"30","StartStop":"B\u1ebfn xe Ch\u1ee3 L\u1edbn"}]
🔹 Đang request RouteId 2: https://api.xebuyt.net/businfo/getvarsbyroute/2_1
HTTP 200



[{"Distance":"13501","EndStop":"B\u1ebfn xe Mi\u1ec1n T\u00e2y","Outbound":"true","RouteId":"2","RouteNo":"2","RouteVarId":"3","RouteVarName":"L\u01b0\u1ee3t \u0111i: B\u1e

In [6]:
import requests

base_url = "https://api.xebuyt.net/businfo/getvarsbyroute/{}_1"

have_data = []
no_data = []

for route_id in range(1, 1000):
    url = base_url.format(route_id)
    print("=" * 80)
    print(f"🔹 Đang request RouteId {route_id}: {url}")

    try:
        response = requests.get(url, timeout=8)
        print(f"HTTP {response.status_code}")

        # In toàn bộ nội dung phản hồi
        print(response.text)

        if response.status_code == 200 and response.text.strip() not in ["", "null", "[]", "{}"]:
            have_data.append(route_id)
        else:
            no_data.append(route_id)

    except requests.exceptions.RequestException as e:
        print(f"⚠️ Lỗi khi request RouteId {route_id}: {e}")
        no_data.append(route_id)

print("\n" + "=" * 80)
print("✅ TỔNG KẾT")
print(f"Có dữ liệu ({len(have_data)}): {have_data}")
print(f"Không có dữ liệu hoặc lỗi ({len(no_data)}): {no_data}")

[1;30;43mKết quả truyền trực tuyến bị cắt bớt đến 5000 dòng cuối.[0m



[]
🔹 Đang request RouteId 287: https://api.xebuyt.net/businfo/getvarsbyroute/287_1
HTTP 200



[]
🔹 Đang request RouteId 288: https://api.xebuyt.net/businfo/getvarsbyroute/288_1
HTTP 200



[]
🔹 Đang request RouteId 289: https://api.xebuyt.net/businfo/getvarsbyroute/289_1
HTTP 200



[]
🔹 Đang request RouteId 290: https://api.xebuyt.net/businfo/getvarsbyroute/290_1
HTTP 200



[]
🔹 Đang request RouteId 291: https://api.xebuyt.net/businfo/getvarsbyroute/291_1
HTTP 200



[]
🔹 Đang request RouteId 292: https://api.xebuyt.net/businfo/getvarsbyroute/292_1
HTTP 200



[]
🔹 Đang request RouteId 293: https://api.xebuyt.net/businfo/getvarsbyroute/293_1
HTTP 200



[]
🔹 Đang request RouteId 294: https://api.xebuyt.net/businfo/getvarsbyroute/294_1
HTTP 200



[]
🔹 Đang request RouteId 295: https://api.xebuyt.net/businfo/getvarsbyroute/295_1
HTTP 200



[]
🔹 Đang request RouteId 296: https://api.xebuyt.net/businfo/getvarsby