In [None]:
import sqlite3
import pandas as pd
# Location of the SQLite database file this notebook reads from.
db_path = "workspace/database/gtfs_raw.db"
conn = sqlite3.connect(db_path)

# List the name of every table in the database; as the last expression of
# the cell, the resulting DataFrame is rendered as the cell output.
pd.read_sql_query(
    sql="SELECT name FROM sqlite_master WHERE type='table';",
    con=conn,
)

Unnamed: 0,name
0,TripUpdates
1,routes
2,agency
3,stops
4,trips
5,stops_poi
6,TripUpdates_poi
7,TripUpdates_poi_joint
8,stop_pairs


In [2]:
# Run the project's POI filter on the database at db_path (db_path is defined
# in the first cell of the notebook).
# NOTE(review): presumably this (re)builds the *_poi tables inside the
# database — confirm against src/poi_filter.py.
from src.poi_filter import filter_trip_updates_poi
filter_trip_updates_poi(db_path)

Filtered to 7751703 trip updates in POI area
Filtered to 212094 trip updates in POI area


In [2]:
from src.poi_joint import prepare_stop_pairs

# Arguments handed to prepare_stop_pairs (db_path comes from the first cell).
# NOTE(review): presumably gap is the number of stops between stop_i and
# stop_j — confirm against src/poi_joint.py.
gap = 1
# NOTE(review): 10 * 60 == 600; presumably a window in seconds — confirm.
ob_window = 10 * 60

prepare_stop_pairs(db_path, gap, ob_window)

created index for (trip_id, vehicle_id, start_date, stop_sequence)


In [3]:
# Pull a handful of rows from stop_pairs just to learn its schema, then print
# the column names one per line. Iterating a DataFrame yields its column
# labels, so list(df_pairs) is the same sequence as df_pairs.columns.
df_pairs = pd.read_sql_query(sql="SELECT * FROM stop_pairs limit 5", con=conn)
for column in list(df_pairs):
    print(column)

trip_id
start_date
vehicle_id
route_id
line_name
route_type
stop_i
seq_i
arr_delay_i
dep_delay_i
arr_time_i
duration_i
his_dwell_count_i
his_avg_delay_i
stop_j
seq_j
arr_delay_j
dep_delay_j
arr_time_j
duration_j
travel_time
delay_change
hour_of_day
day_of_week


In [None]:
import sqlite3
# Re-declare the database path and connection so this section can be run on
# its own. db_path has the same value as in the first cell; `pd` is NOT
# imported in this cell and must already be in scope from an earlier cell.
db_path = "workspace/database/gtfs_raw.db"
output_dir = "workspace/output"  # not used in this cell; read by later cells
conn = sqlite3.connect(db_path)

# Basic statistics over TripUpdates_poi, returned as a single row:
# total row count, number of distinct trip_id / stop_id / start_date values,
# and the smallest and largest start_date.
stats_query = """
SELECT 
    COUNT(*) as total_records,
    COUNT(DISTINCT trip_id) as n_trips,
    COUNT(DISTINCT stop_id) as n_stops,
    COUNT(DISTINCT start_date) as n_days,
    MIN(start_date) as first_date,
    MAX(start_date) as last_date
FROM TripUpdates_poi
"""
stats = pd.read_sql(stats_query, conn)
# to_string(index=False) prints the one-row frame as plain text without the
# DataFrame index column.
print(stats.to_string(index=False))


 total_records  n_trips  n_stops  n_days first_date last_date
        212094     3101      817      12   20250310  20250508


In [6]:
# Delay statistics over TripUpdates_poi, written in SQLite-compatible SQL.
# Rows with NULL arr_delay are excluded by the WHERE clause, so total_count
# counts only rows that have a delay value.
# Buckets use a +/-60 threshold on arr_delay:
#   late    : arr_delay > 60
#   early   : arr_delay < -60
#   on time : arr_delay BETWEEN -60 AND 60 (BETWEEN is inclusive on both ends)
# so the three buckets partition the non-NULL rows.
# COUNT(CASE WHEN ... THEN 1 END) counts only matching rows, because the CASE
# yields NULL otherwise and COUNT skips NULLs. Multiplying by 100.0 keeps the
# percentage division in floating point; ROUND(..., 2) keeps two decimals.
delay_stats = """
SELECT 
    AVG(arr_delay) as mean_delay,
    MIN(arr_delay) as min_delay,
    MAX(arr_delay) as max_delay,
    COUNT(CASE WHEN arr_delay > 60 THEN 1 END) as late_count,
    COUNT(CASE WHEN arr_delay < -60 THEN 1 END) as early_count,
    COUNT(CASE WHEN arr_delay BETWEEN -60 AND 60 THEN 1 END) as ontime_count,
    COUNT(*) as total_count,
    ROUND(100.0 * COUNT(CASE WHEN arr_delay > 60 THEN 1 END) / COUNT(*), 2) as late_pct,
    ROUND(100.0 * COUNT(CASE WHEN arr_delay < -60 THEN 1 END) / COUNT(*), 2) as early_pct,
    ROUND(100.0 * COUNT(CASE WHEN arr_delay BETWEEN -60 AND 60 THEN 1 END) / COUNT(*), 2) as ontime_pct
FROM TripUpdates_poi
WHERE arr_delay IS NOT NULL
"""

# Print a short legend, then the one-row result as plain text (no index).
# Relies on `pd` and `conn` already being defined by earlier cells.
print("\nDelay Statistics:")
print("(Negative = Early, Positive = Late, unit: seconds)")
delay_df = pd.read_sql(delay_stats, conn)
print(delay_df.to_string(index=False))

# Compute the standard deviation in pandas instead of SQL (presumably because
# SQLite has no built-in stddev aggregate — confirm). This loads every
# non-NULL arr_delay into memory. Series.std() defaults to ddof=1, i.e. the
# sample standard deviation.
all_delays = pd.read_sql("SELECT arr_delay FROM TripUpdates_poi WHERE arr_delay IS NOT NULL", conn)
print(f"\nStandard Deviation of Delays: {all_delays['arr_delay'].std():.2f} seconds")


Delay Statistics:
(Negative = Early, Positive = Late, unit: seconds)
 mean_delay  min_delay  max_delay  late_count  early_count  ontime_count  total_count  late_pct  early_pct  ontime_pct
  96.530934      -4097       9280      109240        25143         77711       212094     51.51      11.85       36.64

Standard Deviation of Delays: 219.17 seconds


In [None]:
# Shared locations for the analysis cells below, plus a guard that the output
# directory exists before anything tries to write into it.
#
# Path is imported here because no visible earlier cell brings it into scope;
# without this import the cell raises NameError on a fresh kernel
# (Restart Kernel -> Run All).
from pathlib import Path

db_path = "workspace/database/gtfs_raw.db"
output_dir = "workspace/output"

# parents=True creates missing intermediate directories; exist_ok=True makes
# the call a no-op when the directory is already there, so the cell can be
# re-run safely.
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [None]:
from src.statistical_learning import compare_methods

# Inputs for the gap = 1 run of the method comparison.
db_path = "workspace/database/gtfs_raw.db"
output_dir = "workspace/output"
gap = 1
# NOTE(review): ob_window is assigned here but is not passed to
# compare_methods below, and its value (20 * 60) differs from the 10 * 60
# given to prepare_stop_pairs in an earlier cell — confirm which is intended.
ob_window = 20 * 60

compare_methods(db_path, gap, output_dir)

created index for (trip_id, vehicle_id, start_date, stop_sequence)
Loaded 173,737 observations
Remained 173,737 observations after outlier filtering
Treatment: delay_i (continuous, in seconds)
  Mean: 106.01 seconds
  Std: 208.32 seconds
  Range: [-4097.00, 7537.00]

Outcome: delay_j (continuous, in seconds)
  Mean: 107.46 seconds
  Std: 209.89 seconds

Saved: /home/jinghan/projecttest/output/after_1_stop/treatment_outcome_distribution.png

IDENTIFYING CONFOUNDERS

Confounders included:
  - hour_of_day
  - his_dwell_count_i
  - his_avg_delay_i
  - his_avg_dwell_i
  - is_morning_peak
  - is_evening_peak

Clean dataset: 173,737 observations

METHOD 1: NAIVE COMPARISON (BASELINE)
Naive estimate: β = 0.9153
Interpretation: Each 1-second increase in delay_i is associated
                with 0.9153 seconds increase in delay_j

METHOD 2: REGRESSION ADJUSTMENT
Regression-adjusted estimate: β = 0.9223
Interpretation: Controlling for confounders, each 1-second increase
                in delay_

In [None]:
from src.statistical_learning import compare_methods

# Repeat the method comparison with gap = 2.
# db_path and output_dir are taken from earlier cells.
gap = 2
compare_methods(db_path, gap, output_dir)

created index for (trip_id, vehicle_id, start_date, stop_sequence)
Loaded 154,497 observations
Remained 154,497 observations after outlier filtering
Treatment: delay_i (continuous, in seconds)
  Mean: 107.15 seconds
  Std: 207.09 seconds
  Range: [-4097.00, 7523.00]

Outcome: delay_j (continuous, in seconds)
  Mean: 109.07 seconds
  Std: 210.82 seconds

Saved: /home/jinghan/projecttest/output/after_2_stop/treatment_outcome_distribution.png

IDENTIFYING CONFOUNDERS

Confounders included:
  - hour_of_day
  - his_dwell_count_i
  - his_avg_delay_i
  - his_avg_dwell_i
  - is_morning_peak
  - is_evening_peak

Clean dataset: 154,497 observations

METHOD 1: NAIVE COMPARISON (BASELINE)
Naive estimate: β = 0.8931
Interpretation: Each 1-second increase in delay_i is associated
                with 0.8931 seconds increase in delay_j

METHOD 2: REGRESSION ADJUSTMENT
Regression-adjusted estimate: β = 0.9010
Interpretation: Controlling for confounders, each 1-second increase
                in delay_

In [None]:
from src.statistical_learning import compare_methods

# Repeat the method comparison with gap = 3.
# db_path and output_dir are taken from earlier cells.
gap = 3
compare_methods(db_path, gap, output_dir)

created index for (trip_id, vehicle_id, start_date, stop_sequence)
Loaded 136,949 observations
Remained 136,949 observations after outlier filtering
Treatment: delay_i (continuous, in seconds)
  Mean: 108.01 seconds
  Std: 206.39 seconds
  Range: [-4097.00, 7523.00]

Outcome: delay_j (continuous, in seconds)
  Mean: 108.90 seconds
  Std: 212.26 seconds

Saved: /home/jinghan/projecttest/output/after_3_stop/treatment_outcome_distribution.png

IDENTIFYING CONFOUNDERS

Confounders included:
  - hour_of_day
  - his_dwell_count_i
  - his_avg_delay_i
  - his_avg_dwell_i
  - is_morning_peak
  - is_evening_peak

Clean dataset: 136,949 observations

METHOD 1: NAIVE COMPARISON (BASELINE)
Naive estimate: β = 0.8681
Interpretation: Each 1-second increase in delay_i is associated
                with 0.8681 seconds increase in delay_j

METHOD 2: REGRESSION ADJUSTMENT
Regression-adjusted estimate: β = 0.8776
Interpretation: Controlling for confounders, each 1-second increase
                in delay_