In [2]:
import os
import pcre2  # Strictly something sane regex, not python regex

import pandas as pd
import pathlib

from sklearn.metrics import mean_squared_error
import numpy as np


In [3]:
# Data loader for the numbered data-<n>.csv files

class DataLoader:
    # dir = /app/data

    def __init__(self, dir: str):
        a = 0
        expr = r'^data-(?<n>\d+).csv$'
        pattern = pcre2.compile(expr, flags=pcre2.I, jit=True)
        for x in os.listdir(dir):
            match = pattern.match(x)
            if match:
                i = int(match["n"])
                if i > a:
                    a = i
        self._count = a
        self.path = pathlib.Path(dir)

    def open(self, i: int) -> pd.DataFrame | None:
        if self._count <= i:
            return pd.read_csv(self.path / "data-{}.csv".format(i))
        return None

    def count(self) -> int:
        return self._count + 1

    def all(self) -> pd.DataFrame:
        df = pd.read_csv(self.path / "data-0.csv").copy()
        for i in range(1, self.count()):
            df = pd.concat([df, self.open(i)])
        return df


# Notebook-wide loader instance; it scans the current working directory for the CSV files.
loader = DataLoader("./")

In [4]:
# Model interface
from abc import ABC, abstractmethod


class Model(ABC):
    """Abstract interface shared by every model in this notebook.

    Subclasses must implement both ``train`` and ``predict``; the class itself
    cannot be instantiated.
    """

    @abstractmethod
    def train(self, data):
        """Fit the model on ``data``. Implemented by subclasses."""

    @abstractmethod
    def predict(self, data) -> pd.DataFrame:
        """Produce predictions for ``data`` as a DataFrame. Implemented by subclasses."""

Create a function for generating timetables.

In [11]:
import json

# Load the route definitions used by generate_timetable.
# The encoding is explicit because JSON text is UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open('routes.json', 'r', encoding='utf-8') as f:
    routes_data = json.load(f)


def generate_timetable(model: Model, route_name: str, direction: bool, start_time='08:00:00'):
    """Build an arrival timetable for one route from predicted segment durations.

    The route is looked up in the module-level ``routes_data`` list by name and
    direction. Every pair of consecutive stops becomes a segment, the model
    predicts a duration for each segment, and the durations are accumulated
    onto ``start_time``. Only the time of day is reported, so a trip that runs
    past midnight wraps around.

    :param model: object whose ``predict`` accepts a DataFrame with ``from``
        and ``to`` columns and returns one row per segment, in the same order,
        with a ``predicted_duration`` column (interpreted as seconds)
    :param route_name: value compared with each route's ``name``
    :param direction: value compared with each route's ``direction``
    :param start_time: arrival time at the first stop, formatted ``HH:MM:SS``
    :return: DataFrame with ``stop`` and ``arrival_time`` columns, or ``None``
        (after printing an error) when no route matches
    :raises ValueError: if the model returns a different number of rows than
        there are segments, or a missing duration
    """
    selected_route = next(
        (
            route for route in routes_data
            if route['name'] == route_name and route['direction'] == direction
        ),
        None,
    )
    if selected_route is None:
        print(f"Error: Route '{route_name}' not found.")
        return None

    stops = selected_route['stops']

    # One row per consecutive stop pair. Passing `columns` keeps the 'from'/'to'
    # schema even when the route has fewer than two stops.
    segments_df = pd.DataFrame(list(zip(stops, stops[1:])), columns=['from', 'to'])
    print(f"Prepared {len(segments_df)} segments for prediction.")

    # Nothing to predict for a route without segments.
    if segments_df.empty:
        durations = []
    else:
        durations = model.predict(segments_df)['predicted_duration'].tolist()

    if len(durations) != len(segments_df):
        raise ValueError(
            f"Model returned {len(durations)} predictions for {len(segments_df)} segments."
        )

    current_time = pd.to_datetime(start_time, format='%H:%M:%S')
    timetable = []
    if stops:
        timetable.append({
            'stop': stops[0],
            'arrival_time': current_time.strftime('%H:%M:%S')
        })

    # Pair durations with stops by position. The index labels of the frame the
    # model returns are not guaranteed to be 0..n-1, so they are not used here.
    for next_stop, duration_seconds in zip(stops[1:], durations):
        if pd.isna(duration_seconds):
            raise ValueError(f"No predicted duration for the segment ending at '{next_stop}'.")
        current_time += pd.to_timedelta(duration_seconds, unit='s')
        timetable.append({
            'stop': next_stop,
            'arrival_time': current_time.strftime('%H:%M:%S')
        })

    return pd.DataFrame(timetable, columns=['stop', 'arrival_time'])


In [5]:
# Baseline


class BaselineModel(Model):
    """Baseline that predicts the mean observed duration of each from-to pair."""

    def __init__(self):
        # DataFrame with columns from / to / avg_duration; set by train().
        self.avg_durations = None

    def train(self, data: pd.DataFrame):
        """Compute the average duration of every from-to pair in ``data``.

        :param data: frame with ``from``, ``to`` and ``duration`` columns
        """
        # Use the argument, not the notebook-global `df`, so the model is
        # trained on whatever frame the caller actually passes in.
        self.avg_durations = (
            data
            .groupby(['from', 'to'], as_index=False)['duration']
            .mean()
            .rename(columns={'duration': 'avg_duration'})
        )

    def predict(self, data: pd.DataFrame) -> pd.DataFrame:
        """Look up the learned average duration for each row of ``data``.

        ``train`` must have been called first.

        :param data: frame with ``from`` and ``to`` columns
        :return: frame with ``from``, ``to`` and ``predicted_duration``, one row
            per input row in input order; pairs not seen during training get NaN
        """
        # Merge only the key columns so that any other column the caller's
        # frame carries cannot collide with the prediction column.
        predictions = pd.merge(
            data[['from', 'to']],
            self.avg_durations,
            on=['from', 'to'],
            how='left'
        ).rename(columns={'avg_duration': 'predicted_duration'})

        return predictions[['from', 'to', 'predicted_duration']]


Test the baseline model.

In [12]:

# Columns: from,to,duration,vehicle_model,vehicle_type,time
# Load everything and shuffle the rows. The seed is fixed so that a fresh
# "Restart & Run All" produces the same row order every time.
df = loader.all().sample(frac=1, random_state=42).reset_index(drop=True)

model = BaselineModel()
model.train(df)
predictions = model.predict(df)

# Evaluate with RMSE.
# NOTE: the model is scored on the same rows it was trained on, so this is a
# training error, not a held-out estimate.
rmse = np.sqrt(mean_squared_error(df['duration'], predictions['predicted_duration']))
print(f"RMSE: {rmse}")




RMSE: 43.535571183750776


In [106]:

def create_embeds(df: pd.DataFrame) -> dict:
    """
    Build embedding lookup tables for stop pairs and for vehicle types.

    For every distinct (from, to) pair in ``df`` the midpoint latitude and
    longitude of the two stops is looked up in the module-level ``stops``
    frame (columns ``stop_id``, ``stop_lat``, ``stop_lon``) and passed through
    ``sinusoidal_positional_encoding``. For every distinct vehicle type a
    deterministic pseudo-random 4-dimensional vector is generated.

    :param df: frame with ``from``, ``to`` and ``vehicle_type`` columns
    :return: dict with
        ``'stop_embed'``: one row per distinct (from, to) pair, keeping that
        pair's first row label from ``df``, with columns ``from``, ``to`` and
        ``lat_embed_<i>`` / ``lon_embed_<i>`` for ``i`` in 0..11 (all zeros
        when either stop is missing from ``stops``);
        ``'vehicle_embed'``: one row per vehicle type (index ``vehicle_type``)
        with columns ``vehicle_embed_0..3``
    """
    embed_dim = 12    # length requested from sinusoidal_positional_encoding
    vehicle_dim = 4   # length of each vehicle-type vector

    # 1. Stop Embeddings (Latitude and Longitude)
    unique_stop_ids = df[['from', 'to']].drop_duplicates()

    # Look coordinates up by stop id with Series.map. The result carries the
    # same row labels as unique_stop_ids, so every coordinate stays attached
    # to its own pair; an unknown stop simply yields NaN. (Merging instead
    # renumbers and may drop rows, which shifts values onto the wrong pairs.)
    coords = stops.drop_duplicates(subset='stop_id').set_index('stop_id')

    def midpoint(column):
        # Mean of the 'from' and 'to' stop coordinate; NaN if either is unknown.
        start = unique_stop_ids['from'].map(coords[column])
        end = unique_stop_ids['to'].map(coords[column])
        return (start + end) / 2

    def encode(values):
        # None marks pairs without a usable coordinate.
        return [
            None if pd.isna(value) else sinusoidal_positional_encoding(value, embed_dim)
            for value in values
        ]

    lat_codes = encode(midpoint('stop_lat'))
    lon_codes = encode(midpoint('stop_lon'))

    # Column order interleaves latitude and longitude: lat_0, lon_0, lat_1, ...
    embed_columns = {}
    for i in range(embed_dim):
        embed_columns['lat_embed_{}'.format(i)] = [0 if code is None else code[i] for code in lat_codes]
        embed_columns['lon_embed_{}'.format(i)] = [0 if code is None else code[i] for code in lon_codes]

    stop_embed = pd.concat(
        [unique_stop_ids, pd.DataFrame(embed_columns, index=unique_stop_ids.index)],
        axis=1,
    )

    # 2. Vehicle Type Embeddings (4 values per type)
    unique_vehicle_types = df['vehicle_type'].astype(int).unique()

    # Not used by the embeddings below (each type gets its own Generator);
    # retained because later cells may rely on the global seed being set here.
    np.random.seed(42)
    vehicle_embedding_dict = {}
    for vt in unique_vehicle_types:
        # Seeding with the type itself makes each vector deterministic.
        rng = np.random.default_rng(42 + int(vt))
        vehicle_embedding_dict[int(vt)] = rng.uniform(-1, 1, vehicle_dim)

    vehicle_embedding_features = pd.DataFrame.from_dict(
        vehicle_embedding_dict,
        orient='index',
        columns=[f'vehicle_embed_{i}' for i in range(vehicle_dim)]
    )
    vehicle_embedding_features.index.name = 'vehicle_type'

    return {
        'stop_embed': stop_embed,
        'vehicle_embed': vehicle_embedding_features
    }

# Build the embedding lookup tables from the notebook-global DataFrame `df`.
embeds = create_embeds(df)

In [110]:
# Preview the first rows of df.
df.head()

Unnamed: 0,from,to,duration,vehicle_model,vehicle_type,time
0,F02952,F02899,64,31.0,11,2025-10-18T14:36:56Z
1,F03214,F03210,75,11.0,3,2025-10-17T11:31:56Z
2,061188,008918,49,12.0,3,2025-10-18T11:36:45Z
3,F03545,F03542,66,11.0,3,2025-10-19T18:38:59Z
4,F00313,F00430,63,9.0,0,2025-10-19T12:36:53Z
