In [20]:
import random
import pandas as pd
from datetime import datetime, timedelta

# Latitude/longitude bounding box that trip endpoints are sampled from.
min_lat, max_lat = 11.51, 11.61
min_lon, max_lon = 104.85, 104.97

# Category labels for the weather and traffic-level columns.
weather_categories = ["Clear", "Cloudy", "Rain", "Smog"]
traffic_levels = ["Low", "Medium", "High", "Very High"]

# Seed the global RNG so that re-running the notebook from a fresh kernel
# regenerates exactly the same dataset.
SEED = 42
random.seed(SEED)

# Number of synthetic trips to generate.
N_TRIPS = 2168

# Trip dates cover Feb 1 to Feb 28, 2026 (both ends inclusive).
start_date = datetime(2026, 2, 1)
end_date = datetime(2026, 2, 28)
total_days = (end_date - start_date).days + 1  # 28 days

# Give every trip a uniformly random day in the range, then sort the
# "YYYY-MM-DD" strings so the list is in chronological order.
trip_dates = sorted(
    (start_date + timedelta(days=random.randint(0, total_days - 1))).strftime("%Y-%m-%d")
    for _ in range(N_TRIPS)
)

def random_coord():
    """Return a random ``(lat, lon)`` tuple inside the module-level bounding box.

    Each coordinate is drawn uniformly between its ``min_*``/``max_*`` bound
    and rounded to 6 decimal places. Latitude is drawn before longitude.
    """
    return (
        round(random.uniform(min_lat, max_lat), 6),
        round(random.uniform(min_lon, max_lon), 6),
    )

def simulate_trip(trip_id, date):
    """Generate one synthetic ride-hail trip record.

    Args:
        trip_id: Identifier stored as the first field of the record.
        date: Trip date, stored unchanged as the second field.

    Returns:
        A 17-element list, in this order: trip_id, date, dept_lat, dept_lon,
        arr_lat, arr_lon, request_time ("HH:MM"), dist_km, est_time,
        actual_time, fare_usd, surge, tip, rating, traffic, weather, co2.
    """
    dept_lat, dept_lon = random_coord()
    arr_lat, arr_lon = random_coord()

    # Distance is drawn independently of the two endpoints
    # (Gaussian, mean 6, standard deviation 2) and floored at 0.5.
    dist_km = max(0.5, round(random.gauss(6, 2), 2))

    # Request hour is one of 6..23, picked with one weight per hour.
    hours = list(range(6, 24))
    weights = [3, 5, 5, 2, 1, 2, 4, 4, 2, 1, 3, 5, 5, 2, 2, 2, 1, 1]
    hr = random.choices(hours, weights=weights, k=1)[0]
    mn = random.randint(0, 59)
    request_time = f"{hr:02d}:{mn:02d}"

    rush_hours = {7, 8, 12, 13, 17, 18}
    # The hour on either side of each rush window.
    semi_rush = {6, 9, 11, 14, 16, 19}

    # Estimated duration assumes 0.3 km per minute.
    est_time = round(dist_km / 0.3)
    # The deviation is drawn from [-2, upper], where upper is itself random in
    # [0, 10]. Clamp the result so a trip never lasts less than one minute.
    actual_time = max(1, est_time + random.randint(-2, random.randint(0, 10)))

    # Per-km rate drawn from [0.4, 0.7].
    base_fare = round(dist_km * random.uniform(0.4, 0.7), 2)

    # Surge multipliers above 1.0 are more likely during rush hours.
    if hr in rush_hours:
        surge_weights = [25, 30, 30, 15]
    else:
        surge_weights = [70, 15, 10, 5]
    surge = random.choices([1.0, 1.2, 1.3, 1.5], weights=surge_weights)[0]

    fare_usd = round(base_fare * surge, 2)

    # Tip: 69% no tip, 28% chance of $0.50–$2.00, 3% chance of $2.5-$10.00
    small_tip = round(random.uniform(0.5, 2.0), 2)
    large_tip = round(random.uniform(2.5, 10.00), 2)
    tip = random.choices([0.0, small_tip, large_tip], weights=[69, 28, 3])[0]

    # Discrete rating 1–5, mostly high but small chance of low
    rating = random.choices(
        [1, 2, 3, 4, 5],
        weights=[2, 3, 8, 30, 57]
    )[0]

    # "Rain" carries weight 0, so it is never selected.
    # NOTE(review): presumably deliberate for this date range — confirm.
    weather = random.choices(
        ["Clear", "Cloudy", "Rain", "Smog"],
        weights=[49, 28, 0, 23]
    )[0]

    # Traffic level correlated with time of day
    if hr in rush_hours:
        traffic_weights = [5, 15, 45, 35]     # mostly High/Very High
    elif hr in semi_rush:
        traffic_weights = [15, 40, 35, 10]    # mostly Medium/High
    else:
        traffic_weights = [50, 35, 10, 5]     # mostly Low/Medium
    traffic = random.choices(
        ["Low", "Medium", "High", "Very High"],
        weights=traffic_weights
    )[0]

    # Per-km emission factor drawn from [70, 110].
    co2 = round(dist_km * random.uniform(70, 110), 2)

    return [
        trip_id, date, dept_lat, dept_lon, arr_lat, arr_lon,
        request_time, dist_km, est_time, actual_time,
        fare_usd, surge, tip, rating, traffic, weather, co2
    ]

# Column names, in the same order as the list returned by simulate_trip.
columns = [
    "trip_id", "date", "dept_lat", "dept_lon",
    "arr_lat", "arr_lon", "request_time",
    "trip_distance_km", "est_time_min",
    "actual_time_min", "fare_usd",
    "surge_pricing", "tip_usd", "driver_rating",
    "traffic_level", "weather", "estimated_CO2_g"
]

# Generate one record per entry of trip_dates. Trip ids start at 1 and follow
# the order of the date list, so the row count always matches the dates.
data = [
    simulate_trip(trip_id, trip_date)
    for trip_id, trip_date in enumerate(trip_dates, start=1)
]
df = pd.DataFrame(data, columns=columns)

# Persist without the DataFrame index so the CSV holds only the named columns.
df.to_csv("synthetic_ride_hail_phnom_penh.csv", index=False)
print("Dataset created: synthetic_ride_hail_phnom_penh.csv")
print(df[["trip_id", "date"]].head(10))

Dataset created: synthetic_ride_hail_phnom_penh.csv
   trip_id        date
0        1  2026-02-01
1        2  2026-02-01
2        3  2026-02-01
3        4  2026-02-01
4        5  2026-02-01
5        6  2026-02-01
6        7  2026-02-01
7        8  2026-02-01
8        9  2026-02-01
9       10  2026-02-01


In [21]:
# Read the generated CSV back from disk and preview its first five rows.
PATH = 'synthetic_ride_hail_phnom_penh.csv'

data = pd.read_csv(filepath_or_buffer=PATH)
data.head(n=5)

Unnamed: 0,trip_id,date,dept_lat,dept_lon,arr_lat,arr_lon,request_time,trip_distance_km,est_time_min,actual_time_min,fare_usd,surge_pricing,tip_usd,driver_rating,traffic_level,weather,estimated_CO2_g
0,1,2026-02-01,11.60866,104.853425,11.550101,104.946397,08:14,4.61,15,25,2.88,1.2,0.0,3,High,Clear,325.05
1,2,2026-02-01,11.605624,104.857362,11.579519,104.873565,14:25,7.4,25,23,5.11,1.0,0.0,4,High,Clear,748.78
2,3,2026-02-01,11.604646,104.89127,11.577737,104.921429,15:50,6.64,22,25,3.94,1.0,0.0,5,Medium,Clear,679.89
3,4,2026-02-01,11.526998,104.917747,11.52874,104.914942,07:36,9.35,31,38,4.52,1.2,1.04,5,Medium,Clear,694.01
4,5,2026-02-01,11.530798,104.922414,11.604906,104.861407,08:20,8.87,30,30,6.75,1.5,0.0,5,Very High,Clear,814.99


In [14]:
import folium
import pandas as pd

# Load the generated trips and draw each one on a folium map as a straight
# segment from its departure point to its arrival point.
df = pd.read_csv("synthetic_ride_hail_phnom_penh.csv")

map_pp = folium.Map(location=[11.56, 104.92], zoom_start=12)

# Thin, semi-transparent lines so overlapping trips remain distinguishable.
segment_style = dict(color="blue", weight=1, opacity=0.5)

for row in df.itertuples(index=False):
    endpoints = [
        (row.dept_lat, row.dept_lon),
        (row.arr_lat, row.arr_lon),
    ]
    folium.PolyLine(endpoints, **segment_style).add_to(map_pp)

# Write the map as a standalone HTML file.
map_pp.save("ride_hail_trips_pp.html")
