In [22]:
import random
import pandas as pd
from datetime import datetime, timedelta

# --- Configuration ---
# Fixed seed so that Restart Kernel -> Run All regenerates the same dataset.
SEED = 42
# Number of synthetic trips (one entry in trip_dates per trip).
N_TRIPS = 2168

# Latitude / longitude bounds that random_coord() samples from.
min_lat, max_lat = 11.51, 11.61
min_lon, max_lon = 104.85, 104.97

# Inclusive simulation window: Feb 1 to Feb 28, 2026.
start_date = datetime(2026, 2, 1)
end_date = datetime(2026, 2, 28)
total_days = (end_date - start_date).days + 1  # 28 days

# Seed right before the first random draw so every later draw in the
# notebook follows a deterministic sequence.
random.seed(SEED)

# One uniformly chosen day per trip, formatted as "YYYY-MM-DD".
# Zero-padded ISO date strings sort in chronological order, so a plain
# string sort yields the dates in calendar order.
trip_dates = sorted(
    (start_date + timedelta(days=random.randint(0, total_days - 1))).strftime("%Y-%m-%d")
    for _ in range(N_TRIPS)
)

def random_coord():
    """Return a random ``(lat, lon)`` tuple inside the configured bounds.

    Latitude is drawn uniformly from ``[min_lat, max_lat]`` first, then
    longitude from ``[min_lon, max_lon]``; both are rounded to 6 decimals.
    """
    return (
        round(random.uniform(min_lat, max_lat), 6),
        round(random.uniform(min_lon, max_lon), 6),
    )

# --- Lookup tables for simulate_trip ---
# Defined once here instead of being rebuilt on every simulate_trip() call.

_VEHICLE_TYPES = ["Motor Dup", "Rickshaw", "Remork", "EV Car"]
_VEHICLE_WEIGHTS = [10, 53, 21, 16]

# Request hours 06..22 inclusive. The minute is drawn separately from 0..59,
# so the latest possible request time is 22:59 (not 22:00).
_REQUEST_HOURS = list(range(6, 23))
_REQUEST_HOUR_WEIGHTS = [3, 5, 5, 2, 1, 3, 4, 4, 2, 1, 3, 5, 5, 2, 2, 1, 1]

_RUSH_HOURS = frozenset({7, 8, 12, 13, 17, 18})
_SEMI_RUSH_HOURS = frozenset({6, 9, 11, 14, 19})

_TRAFFIC_LEVELS = ["Low", "Medium", "High", "Very High"]
_TRAFFIC_WEIGHTS_RUSH = [5, 15, 45, 35]
_TRAFFIC_WEIGHTS_SEMI_RUSH = [15, 40, 35, 10]
_TRAFFIC_WEIGHTS_OFF_PEAK = [50, 35, 10, 5]

# Divisor that turns trip distance (km) into the estimated minutes.
_EST_KM_PER_MIN = 0.3

# traffic level -> (delay_min, delay_max), both in minutes.
_TRAFFIC_DELAY = {
    "Low"      : (0, 2),
    "Medium"   : (1, 4),
    "High"     : (3, 8),
    "Very High": (4, 12),
}

# traffic level -> (min wait, max wait, weights for the first four wait values).
# Wait values beyond the first four each get _WAIT_TAIL_WEIGHT.
_WAIT_BY_TRAFFIC = {
    "Low"      : (1, 4,  [40, 35, 15, 10]),
    "Medium"   : (2, 7,  [20, 35, 30, 15]),
    "High"     : (3, 10, [10, 25, 40, 25]),
    "Very High": (5, 13, [5,  20, 40, 35]),
}
_WAIT_TAIL_WEIGHT = 2

# vehicle type -> (min, max) fare rate multiplied by the distance in km.
_FARE_RATES = {
    "Motor Dup": (0.25, 0.40),
    "Rickshaw" : (0.35, 0.55),
    "Remork"   : (0.45, 0.65),
    "EV Car"   : (0.60, 0.85),
}

_SURGE_LEVELS = [1.0, 1.1, 1.2, 1.3, 1.4, 1.5]
_SURGE_WEIGHTS_RUSH = [10, 15, 20, 20, 15, 20]
_SURGE_WEIGHTS_OFF_PEAK = [68, 7, 8, 7, 5, 5]

# Weights for [no tip, small tip, large tip].
_TIP_WEIGHTS = [59, 35, 6]

_RATINGS = [1, 2, 3, 4, 5]
_RATING_WEIGHTS = [2, 3, 8, 30, 57]

_WEATHER_TYPES = ["Clear", "Cloudy", "Windy", "Smog"]
_WEATHER_WEIGHTS = [50, 30, 10, 10]

# vehicle type -> (min, max) CO2 rate multiplied by the distance in km.
_CO2_RATES = {
    "Motor Dup": (60,  90),
    "Rickshaw" : (50,  80),
    "Remork"   : (80, 120),
    "EV Car"   : (0,   10),
}


def _pick(options, weights):
    """Return one element of ``options`` drawn with the given relative weights."""
    return random.choices(options, weights=weights, k=1)[0]


def _traffic_weights_for_hour(hr):
    """Return the traffic-level weights that apply to request hour ``hr``."""
    if hr in _RUSH_HOURS:
        return _TRAFFIC_WEIGHTS_RUSH
    if hr in _SEMI_RUSH_HOURS:
        return _TRAFFIC_WEIGHTS_SEMI_RUSH
    return _TRAFFIC_WEIGHTS_OFF_PEAK


def _wait_weights(w_min, w_max, head_weights):
    """Return one weight per wait value in ``w_min..w_max`` (inclusive).

    The first four values use ``head_weights``; any remaining values each
    get ``_WAIT_TAIL_WEIGHT``.
    """
    return head_weights + [_WAIT_TAIL_WEIGHT] * (w_max - w_min - 3)


def simulate_trip(trip_id, date):
    """Simulate one synthetic ride-hail trip and return it as a flat row.

    Args:
        trip_id: Identifier stored unchanged as the first element of the row.
        date: Trip date stored unchanged as the second element of the row.

    Returns:
        A list in this order: trip_id, date, dept_lat, dept_lon, arr_lat,
        arr_lon, vehicle_type, request_time ("HH:MM"), wait_time, dist_km,
        est_time, actual_time, arr_time ("HH:MM"), fare_usd, surge, tip,
        rating, traffic, weather, co2.

    The order of the random draws is significant: with a seeded ``random``
    module it fixes the generated values, so do not reorder the statements.
    """
    dept_lat, dept_lon = random_coord()
    arr_lat, arr_lon = random_coord()

    # NOTE(review): the distance is drawn independently of the two
    # coordinates above, so it need not match their geographic separation.
    dist_km = max(0.5, round(random.gauss(6, 2), 2))

    # --- Vehicle type (independent of every other field) ---
    vehicle_type = _pick(_VEHICLE_TYPES, _VEHICLE_WEIGHTS)

    # --- Request time (hour 06..22, minute 00..59) ---
    hr = _pick(_REQUEST_HOURS, _REQUEST_HOUR_WEIGHTS)
    mn = random.randint(0, 59)
    request_time = f"{hr:02d}:{mn:02d}"

    # --- Traffic level, weighted by how busy the request hour is ---
    traffic = _pick(_TRAFFIC_LEVELS, _traffic_weights_for_hour(hr))

    # --- Estimated duration in whole minutes ---
    est_time = round(dist_km / _EST_KM_PER_MIN)

    # --- Actual duration: estimate plus two traffic-dependent offsets ---
    # The first offset may be negative (down to -2); the result is floored
    # at 1 minute.
    delay_min, delay_max = _TRAFFIC_DELAY[traffic]
    actual_time = max(
        1,
        est_time + random.randint(-2, delay_max) + random.randint(0, delay_min),
    )

    # --- Arrival time = request time + actual duration ---
    # Only the clock time is kept, so an arrival past midnight wraps to 00:xx.
    arr_dt = datetime(2000, 1, 1, hr, mn) + timedelta(minutes=actual_time)
    arr_time = arr_dt.strftime("%H:%M")

    # --- Wait time in minutes, range and weights depend on traffic ---
    w_min, w_max, w_head = _WAIT_BY_TRAFFIC[traffic]
    wait_time = _pick(range(w_min, w_max + 1), _wait_weights(w_min, w_max, w_head))

    # --- Base fare: distance times a vehicle-specific rate ---
    rate_min, rate_max = _FARE_RATES[vehicle_type]
    base_fare = round(dist_km * random.uniform(rate_min, rate_max), 2)

    # --- Surge multiplier: higher values are likelier in rush hours ---
    surge_weights = _SURGE_WEIGHTS_RUSH if hr in _RUSH_HOURS else _SURGE_WEIGHTS_OFF_PEAK
    surge = _pick(_SURGE_LEVELS, surge_weights)

    fare_usd = round(base_fare * surge, 2)

    # --- Tip: none, small, or large ---
    # Both candidate amounts are drawn before choosing between them.
    small_tip = round(random.uniform(0.5, 2.0), 2)
    large_tip = round(random.uniform(2.5, 10.0), 2)
    tip = _pick([0.0, small_tip, large_tip], _TIP_WEIGHTS)

    # --- Rating and weather (independent draws) ---
    rating = _pick(_RATINGS, _RATING_WEIGHTS)
    weather = _pick(_WEATHER_TYPES, _WEATHER_WEIGHTS)

    # --- CO2: distance times a vehicle-specific rate ---
    co2_min, co2_max = _CO2_RATES[vehicle_type]
    co2 = round(dist_km * random.uniform(co2_min, co2_max), 2)

    return [
        trip_id, date,
        dept_lat, dept_lon, arr_lat, arr_lon,
        vehicle_type,
        request_time, wait_time,
        dist_km, est_time, actual_time, arr_time,
        fare_usd, surge, tip, rating,
        traffic, weather, co2
    ]

# Column names, in the same order as the row returned by simulate_trip().
columns = [
    "trip_id", "date",
    "dept_lat", "dept_lon", "arr_lat", "arr_lon",
    "vehicle_type",
    "request_time", "wait_time_min",
    "trip_distance_km", "est_time_min", "actual_time_min", "arr_time",
    "fare_usd", "surge_pricing", "tip_usd", "driver_rating",
    "traffic_level", "weather", "estimated_CO2_g"
]

# Generate one trip per entry of the sorted date list. Iterating trip_dates
# directly (trip ids start at 1) keeps the row count tied to the date list
# instead of repeating the trip count as a second hard-coded number.
data = [
    simulate_trip(trip_id, trip_date)
    for trip_id, trip_date in enumerate(trip_dates, start=1)
]
df = pd.DataFrame(data, columns=columns)

# Persist the dataset without the DataFrame index, then preview a few columns.
df.to_csv("synthetic_ride_hail_phnom_penh.csv", index=False)
print("Dataset created: synthetic_ride_hail_phnom_penh.csv")
print(df[["trip_id", "date", "vehicle_type", "request_time", "arr_time", "wait_time_min"]].head(10))

Dataset created: synthetic_ride_hail_phnom_penh.csv
   trip_id        date vehicle_type request_time arr_time  wait_time_min
0        1  2026-02-01     Rickshaw        06:39    07:09              7
1        2  2026-02-01       Remork        08:58    09:21              6
2        3  2026-02-01     Rickshaw        20:33    20:54              3
3        4  2026-02-01       Remork        08:40    08:51              9
4        5  2026-02-01       Remork        14:32    14:48              6
5        6  2026-02-01       Remork        07:59    08:29              8
6        7  2026-02-01     Rickshaw        14:16    14:43              4
7        8  2026-02-01     Rickshaw        06:36    07:10              5
8        9  2026-02-01       Remork        16:40    16:52              2
9       10  2026-02-01     Rickshaw        10:31    10:52              5


In [21]:
# Read the exported CSV back from disk and preview its first five rows.
PATH = 'synthetic_ride_hail_phnom_penh.csv'

data = pd.read_csv(filepath_or_buffer=PATH)
data.head(5)

Unnamed: 0,trip_id,date,dept_lat,dept_lon,arr_lat,arr_lon,request_time,trip_distance_km,est_time_min,actual_time_min,fare_usd,surge_pricing,tip_usd,driver_rating,traffic_level,weather,estimated_CO2_g
0,1,2026-02-01,11.60866,104.853425,11.550101,104.946397,08:14,4.61,15,25,2.88,1.2,0.0,3,High,Clear,325.05
1,2,2026-02-01,11.605624,104.857362,11.579519,104.873565,14:25,7.4,25,23,5.11,1.0,0.0,4,High,Clear,748.78
2,3,2026-02-01,11.604646,104.89127,11.577737,104.921429,15:50,6.64,22,25,3.94,1.0,0.0,5,Medium,Clear,679.89
3,4,2026-02-01,11.526998,104.917747,11.52874,104.914942,07:36,9.35,31,38,4.52,1.2,1.04,5,Medium,Clear,694.01
4,5,2026-02-01,11.530798,104.922414,11.604906,104.861407,08:20,8.87,30,30,6.75,1.5,0.0,5,Very High,Clear,814.99


In [14]:
import folium
import pandas as pd

# Load the trips written by the generation cell.
df = pd.read_csv("synthetic_ride_hail_phnom_penh.csv")

# Base map; every trip is drawn on top of it.
map_pp = folium.Map(location=[11.56, 104.92], zoom_start=12)

# Draw one straight segment per trip from departure to arrival.
# itertuples() yields lightweight namedtuples, avoiding the per-row Series
# that iterrows() constructs.
for trip in df.itertuples(index=False):
    folium.PolyLine(
        [(trip.dept_lat, trip.dept_lon),
         (trip.arr_lat, trip.arr_lon)],
        color="blue", weight=1, opacity=0.5
    ).add_to(map_pp)

map_pp.save("ride_hail_trips_pp.html")
