In [None]:
from datetime import datetime
import pandas as pd

from shared.constants import GAEVLE_LONGITUDE, GAEVLE_LATITUDE, OperatorsWithRT
from shared.file_logger import setup_logger
import weather.fetch as wf
import weather.parse as wp
import gtfs_regional.pipeline as gp
import shared.features as sf
import koda.koda_transform as kt

# Turn on pandas' copy-on-write mode for the whole notebook session.
pd.set_option("mode.copy_on_write", True)

# Operator whose data the cells below request.
OPERATOR = OperatorsWithRT.X_TRAFIK
# Run date as a "YYYY-MM-DD" string, read from the local clock when this cell
# executes -- so the notebook's results depend on the day it is run.
today = f"{datetime.now():%Y-%m-%d}"

In [None]:
# Load the data for `today` and `OPERATOR` (both defined in the configuration
# cell at the top of the notebook). The helper returns two objects:
#   rt_df  -- presumably the realtime stop-time updates for the day (verify in gp)
#   map_df -- a trip-level lookup; later cells merge it onto rt_df by trip_id
rt_df, map_df = gp.get_gtfr_data_for_day(today, OPERATOR)

# Show the freshly loaded frame as the cell output.
rt_df

In [None]:
# Fields of the loaded frame that the rest of the notebook works with.
columns_to_keep = [
    "trip_id", "start_date", "timestamp",
    "vehicle_id", "stop_sequence", "stop_id", "arrival_delay",
    "arrival_time", "departure_delay", "departure_time"
]

# Build the cleaned frame in one chained expression and rebind `rt_df` only
# once, at the end: nothing is mutated in place, and if any step raises,
# `rt_df` is left exactly as it was instead of half-transformed.
#
# NOTE: this cell consumes the frame produced by the loading cell. Once it has
# run, `arrival_time` is the index rather than a column, so re-running this
# cell alone raises a KeyError on the column selection -- re-run the loading
# cell first.
rt_df = (
    rt_df[columns_to_keep]
    .pipe(kt.keep_only_latest_stop_updates)
    # Inner merge: attaches map_df's trip-level columns (e.g. route_type) and
    # drops rows whose trip_id has no match in map_df.
    .merge(map_df, on="trip_id", how="inner")
    # Rows without an arrival time cannot be placed on the time index.
    .dropna(subset=["arrival_time"])
    # arrival_time is read as whole seconds since the Unix epoch. The integer
    # width is spelled out so the cast does not depend on the platform's
    # default `int` size.
    .assign(
        arrival_time=lambda frame: pd.to_datetime(
            frame["arrival_time"].astype("int64"), unit="s"
        )
    )
    # Chronological order, with arrival_time as the index / main datetime axis.
    .sort_values(by="arrival_time")
    .set_index("arrival_time")
)

rt_df

In [None]:
# Number of distinct trips in the cleaned frame.
print(rt_df["trip_id"].nunique())

# Number of distinct routes in the cleaned frame.
print(rt_df["route_id"].nunique())

# Time span covered by the index: latest value first, then earliest,
# each on its own line.
print(rt_df.index.max(), rt_df.index.min(), sep="\n")

In [None]:
# Build the feature table from the cleaned, time-indexed rt_df and the trip
# lookup map_df. What features are produced is defined in
# shared.features.build_feature_group -- see that module for the output schema.
final_metrics = sf.build_feature_group(rt_df, map_df)
# Show the result as the cell output.
final_metrics