## Importing Libraries

In [2]:
import pandas as pd
import json
import folium
import numpy as np
import math
import json
from collections import OrderedDict
from shapely.geometry import Point, LineString
from catboost import CatBoostRegressor

## Loading All Datasets

In [None]:
# Load the public train/test splits into DataFrames.
train_df = pd.read_csv('dataset/public/train.csv')
test_df = pd.read_csv("dataset/public/test.csv")

# Load both JSON files into module-level globals. preprocess_json() and
# update_flowcoord_tracks() read these names via `global` and write
# modified entries back into them.
with open('dataset/public/routes.json') as f:
    routes = json.load(f)
with open('dataset/public/jalanraya_ui_flowcoord.json') as f:
    flowcoord = json.load(f)

## Helper Functions for Processing and Inference (Single Run)

In [4]:
import json
import numpy as np
from collections import OrderedDict

def preprocess_json():
    """Derive lookup tables from the already-loaded `routes` / `flowcoord` globals.

    Reads the module-level `routes` and `flowcoord` and (re)binds these globals:

    * `color_to_route` -- maps a bus color ('red'/'blue') to its key in `routes`.
    * `stops_data`     -- per route: {'names': [stop names], 'coords': ndarray of
                          the stop coordinate values in the same order}.
    * `track_data`     -- per track: ndarray of (latitude, longitude) pairs.

    Afterwards the stop "tempat_turun_asrama" is inserted immediately before
    "asrama_ui_01_end" in `routes["RUTE_MERAH"]` and `routes["RUTE_BIRU"]`
    (when those keys exist). `stops_data` is built *before* that insertion, so
    it does not contain the extra stop.
    """
    global routes, flowcoord, stops_data, track_data, color_to_route

    # Bus color -> route key used in the routes JSON.
    color_to_route = {'red': 'RUTE_MERAH', 'blue': 'RUTE_BIRU'}

    # Stop names and coordinate array for every route.
    stops_data = {
        route_name: {
            'names': list(stops.keys()),
            'coords': np.array(list(stops.values())),
        }
        for route_name, stops in routes.items()
    }

    # (lat, lon) array of the track points for every track.
    track_data = {
        track_name: np.array([(pt["latitude"], pt["longitude"]) for pt in points])
        for track_name, points in flowcoord.items()
    }

    # Details of the extra stop and the stop it must precede.
    extra_stop_name = "tempat_turun_asrama"
    extra_stop_coords = [-6.348499, 106.829623]
    anchor_stop = "asrama_ui_01_end"

    for route_key in ("RUTE_MERAH", "RUTE_BIRU"):
        if route_key not in routes:
            continue
        rebuilt = OrderedDict()
        for stop_name, stop_coords in routes[route_key].items():
            # Emit the extra stop right before the anchor stop.
            if stop_name == anchor_stop:
                rebuilt[extra_stop_name] = extra_stop_coords
            rebuilt[stop_name] = stop_coords
        routes[route_key] = rebuilt


In [5]:
def insert_stop_after_target_point(flow_list, target_lat, target_lon, new_stop_name, new_stop_coords):
    """Return a copy of `flow_list` with a new named point added.

    The new point is placed immediately after the first entry whose
    'latitude' and 'longitude' are both within 1e-6 of the target
    coordinates. If no entry matches, the new point is appended at the end.
    The input list itself is not modified.

    Args:
        flow_list: list of dicts with 'latitude' / 'longitude' keys.
        target_lat, target_lon: coordinates of the point to insert after.
        new_stop_name: value stored under "name" in the new point.
        new_stop_coords: (latitude, longitude) of the new point.

    Returns:
        A new list containing the original entries plus the new point.
    """
    extra_point = {
        "name": new_stop_name,
        "latitude": new_stop_coords[0],
        "longitude": new_stop_coords[1]
    }

    # Position of the first entry that matches the target within tolerance.
    match_pos = next(
        (
            pos
            for pos, pt in enumerate(flow_list)
            if abs(pt.get('latitude', 0) - target_lat) < 1e-6
            and abs(pt.get('longitude', 0) - target_lon) < 1e-6
        ),
        None,
    )

    result = list(flow_list)
    if match_pos is None:
        result.append(extra_point)
    else:
        result.insert(match_pos + 1, extra_point)
    return result

In [6]:
def update_flowcoord_tracks():
    """Insert the "tempat_turun_asrama" point into both tracks and build `flowcoord_dict`.

    Reads and updates the module-level `flowcoord`; (re)binds the module-level
    `flowcoord_dict` to {'red': TRACK_RED points, 'blue': TRACK_BLUE points}.

    The call is idempotent: a track that already contains a point named
    "tempat_turun_asrama" is left untouched, so re-running the pipeline in
    the same kernel does not accumulate duplicate points in `flowcoord`.

    Raises:
        KeyError: if `flowcoord` has no 'TRACK_RED' or 'TRACK_BLUE' entry.
    """
    global flowcoord, flowcoord_dict  # both globals are affected by this function

    # Point to insert after, and the details of the stop being inserted.
    target_lat = -6.3485456
    target_lon = 106.8300938
    new_stop_name = "tempat_turun_asrama"
    new_stop_coords = (-6.348499, 106.829623)

    for route_key in ["TRACK_RED", "TRACK_BLUE"]:
        if route_key not in flowcoord:
            continue
        # Skip tracks that already carry the stop (e.g. from a previous call).
        if any(pt.get("name") == new_stop_name for pt in flowcoord[route_key]):
            continue
        flowcoord[route_key] = insert_stop_after_target_point(
            flowcoord[route_key], target_lat, target_lon, new_stop_name, new_stop_coords
        )

    flowcoord_dict = {
            'red': flowcoord['TRACK_RED'],
            'blue': flowcoord['TRACK_BLUE']
    }

In [7]:
def update_nearest_halte(train_df, test_df):
    """Add a 'halte_distance' column (distance to the nearest stop) to both frames.

    For each bus color ('red', 'blue') the stop coordinates are looked up via
    the module-level `color_to_route` and `stops_data` (both set by
    `preprocess_json`). The distance uses a local flat-earth approximation:
    degree differences are scaled by 111320 m per degree, with the longitude
    difference additionally scaled by cos(latitude of the row).

    Both frames are modified in place and also returned. Rows whose 'color'
    is neither 'red' nor 'blue' keep NaN.

    Args:
        train_df, test_df: DataFrames with 'color', 'lat' and 'lon' columns.

    Returns:
        (train_df, test_df) -- the same objects, with 'halte_distance' set.
    """
    meters_per_degree = 111320.0

    def _fill_halte_distance(df):
        # Shared implementation for train and test (previously duplicated).
        df['halte_distance'] = np.nan  # distance to the nearest stop, in meters
        for color_val in ('red', 'blue'):
            route_key = color_to_route[color_val]           # e.g. 'red' -> 'RUTE_MERAH'
            stop_coords = stops_data[route_key]['coords']   # stop coordinate array
            mask = df['color'] == color_val
            lat_points = df.loc[mask, 'lat'].to_numpy()
            lon_points = df.loc[mask, 'lon'].to_numpy()
            if lat_points.size == 0:
                continue

            # Pairwise (row x stop) differences via broadcasting.
            lat_diff_m = (lat_points[:, None] - stop_coords[:, 0][None, :]) * meters_per_degree
            cos_lat = np.cos(np.deg2rad(lat_points))
            lon_diff_m = (lon_points[:, None] - stop_coords[:, 1][None, :]) * (
                meters_per_degree * cos_lat[:, None]
            )
            dist_sq = lat_diff_m**2 + lon_diff_m**2
            df.loc[mask, 'halte_distance'] = np.sqrt(np.min(dist_sq, axis=1))

    _fill_halte_distance(train_df)
    _fill_halte_distance(test_df)

    return train_df, test_df


In [8]:
def update_route_distance(train_df, test_df, track_data):
    """Add a 'route_distance' column (distance to the nearest track point) to both frames.

    For each bus color ('red', 'blue') the track 'TRACK_<COLOR>' is taken from
    `track_data` and the distance from every row to every track point is
    computed with a local flat-earth approximation (111320 m per degree,
    longitude scaled by cos(latitude of the row)). The minimum over the track
    points is stored. Only the track's vertices are used as reference
    points, not the segments between them.

    Both frames are modified in place and also returned. Rows whose 'color'
    is neither 'red' nor 'blue' keep NaN.

    Args:
        train_df, test_df: DataFrames with 'color', 'lat' and 'lon' columns.
        track_data: dict mapping 'TRACK_RED' / 'TRACK_BLUE' to arrays whose
            column 0 is latitude and column 1 is longitude.

    Returns:
        (train_df, test_df) -- the same objects, with 'route_distance' set.
    """
    meters_per_degree = 111320.0

    def _fill_route_distance(df):
        # Shared implementation for train and test (previously duplicated).
        df['route_distance'] = np.nan
        for color_val in ('red', 'blue'):
            track_key = 'TRACK_' + color_val.upper()   # e.g. 'red' -> 'TRACK_RED'
            track_coords = track_data[track_key]       # array of track points
            mask = df['color'] == color_val
            lat_points = df.loc[mask, 'lat'].to_numpy()
            lon_points = df.loc[mask, 'lon'].to_numpy()
            if lat_points.size == 0:
                continue

            # Pairwise (row x track point) differences via broadcasting.
            lat_diff_m = (lat_points[:, None] - track_coords[:, 0][None, :]) * meters_per_degree
            cos_lat = np.cos(np.deg2rad(lat_points))
            lon_diff_m = (lon_points[:, None] - track_coords[:, 1][None, :]) * (
                meters_per_degree * cos_lat[:, None]
            )
            dist_sq = lat_diff_m**2 + lon_diff_m**2
            # Distance to the closest track point.
            df.loc[mask, 'route_distance'] = np.sqrt(np.min(dist_sq, axis=1))

    _fill_route_distance(train_df)
    _fill_route_distance(test_df)

    return train_df, test_df


In [9]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in meters between two points.

    Inputs are latitudes/longitudes in degrees; the Earth radius used is
    6,371,000 m.
    """
    earth_radius_m = 6371000
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2

    # Haversine term and the corresponding central angle.
    chord = (
        math.sin(half_dlat) ** 2
        + math.cos(lat1_rad) * math.cos(lat2_rad) * math.sin(half_dlon) ** 2
    )
    central_angle = 2 * math.atan2(math.sqrt(chord), math.sqrt(1 - chord))
    return earth_radius_m * central_angle

def get_closest_point_on_route(lat, lon, flowcoord):
    """Project a position onto the track polyline and return the snapped point.

    Args:
        lat, lon: position to project.
        flowcoord: list of track points (dicts with "latitude" / "longitude").

    Returns:
        (latitude, longitude) of the closest point on the polyline built from
        `flowcoord` in list order.
    """
    # Shapely uses (x, y) = (longitude, latitude).
    route_line = LineString([(pt["longitude"], pt["latitude"]) for pt in flowcoord])
    distance_along = route_line.project(Point(lon, lat))
    snapped = route_line.interpolate(distance_along)
    return snapped.y, snapped.x

def get_nearest_stop_from_flowcoord(lat, lon, flowcoord):
    """Find the named track point (stop) closest to a position.

    Only entries with a truthy "name" are treated as stops. Distances are
    computed with `haversine`; on ties the earliest entry wins.

    Returns:
        (stop name, distance). `(None, inf)` when no named point exists.
    """
    best_name, best_distance = None, float('inf')
    for pt in flowcoord:
        if not pt["name"]:
            # Unnamed points are plain track vertices, not stops.
            continue
        candidate = haversine(lat, lon, pt["latitude"], pt["longitude"])
        if candidate < best_distance:
            best_name, best_distance = pt["name"], candidate
    return best_name, best_distance

def assign_nearest_stop_from_flowcoord(df, flowcoord_dict):
    """Return a copy of `df` with 'nearest_stop' and 'distance_to_stop' filled in.

    For every row the position is first projected onto the track of the row's
    color (lower-cased 'color' is the key into `flowcoord_dict`), then the
    nearest named point to that projected position is looked up.

    Args:
        df: DataFrame with 'color', 'lat' and 'lon' columns.
        flowcoord_dict: maps a color to its list of track points.

    Returns:
        A new DataFrame; the input is not modified.
    """
    result = df.copy()
    result['nearest_stop'] = None
    result['distance_to_stop'] = np.nan

    for label, record in result.iterrows():
        track_points = flowcoord_dict[record['color'].lower()]

        # Bus position snapped onto the track.
        snapped_lat, snapped_lon = get_closest_point_on_route(
            record['lat'], record['lon'], track_points
        )

        # Nearest stop measured from the snapped position.
        stop_name, stop_distance = get_nearest_stop_from_flowcoord(
            snapped_lat, snapped_lon, track_points
        )

        result.at[label, 'nearest_stop'] = stop_name
        result.at[label, 'distance_to_stop'] = stop_distance

    return result

In [10]:
def initialize_stop_columns(train_df, test_df):
    """Create empty 'nearest_stop' (None) and 'distance_to_stop' (NaN) columns.

    Both frames are modified in place and returned as `(train_df, test_df)`.
    """
    for frame in (train_df, test_df):
        frame['nearest_stop'] = None
        frame['distance_to_stop'] = np.nan
    return train_df, test_df


In [11]:
def convert_timestamps(train_df, test_df):
    """Convert the 'ts' column of both frames to pandas datetimes.

    Both frames are modified in place and returned as `(train_df, test_df)`.
    """
    for frame in (train_df, test_df):
        frame['ts'] = pd.to_datetime(frame['ts'])
    return train_df, test_df


In [12]:
def add_prev_next_stop_features(df, red_stops, blue_stops):
    """Return a copy of `df` with distances to the stops around 'nearest_stop'.

    For each row, `nearest_stop` is located in the stop list of the row's
    route (`red_stops` when color == 'red', otherwise `blue_stops`). The
    haversine distance from the row's (lat, lon) to the stop before it and
    the stop after it is stored in 'distance_to_prev_stop' and
    'distance_to_next_stop'. The stop list is treated as circular: the first
    stop's predecessor is the last stop and vice versa.

    Rows with a missing `nearest_stop`, or one that is not in the stop list,
    keep NaN in both columns.

    Args:
        df: DataFrame with 'color', 'lat', 'lon' and 'nearest_stop' columns.
        red_stops, blue_stops: ordered lists of (stop name, coords) pairs,
            where coords[0] / coords[1] are passed to `haversine` as lat / lon.
    """
    df = df.copy()
    df['distance_to_prev_stop'] = np.nan
    df['distance_to_next_stop'] = np.nan

    for label, row in df.iterrows():
        # Choose the stop list from the route color.
        route_stops = red_stops if row['color'] == 'red' else blue_stops
        nearest = row.get('nearest_stop')
        if pd.isnull(nearest):
            continue

        # Position of the first stop with a matching name.
        pos = next(
            (i for i, (name, _coords) in enumerate(route_stops) if name == nearest),
            None,
        )
        if pos is None:
            continue

        bus_lat, bus_lon = row['lat'], row['lon']

        # Index -1 wraps to the last stop when pos == 0.
        prev_coords = route_stops[pos - 1][1]
        df.at[label, 'distance_to_prev_stop'] = haversine(
            bus_lat, bus_lon, prev_coords[0], prev_coords[1]
        )

        # Modulo wraps to the first stop when pos is the last index.
        next_coords = route_stops[(pos + 1) % len(route_stops)][1]
        df.at[label, 'distance_to_next_stop'] = haversine(
            bus_lat, bus_lon, next_coords[0], next_coords[1]
        )

    return df

In [13]:
def compute_time_diff(df, boole=False):
    """Return a copy of `df` with 'ts' as datetime and a 'time_diff' column.

    'time_diff' is the number of seconds between each row's timestamp and
    the previous row's (NaN for the first row). The input frame is not
    modified. `boole` is accepted but not used.
    """
    stamps = pd.to_datetime(df['ts'])
    return df.copy().assign(
        ts=stamps,
        time_diff=stamps.diff().dt.total_seconds(),
    )

In [14]:
def compute_rta(df):
    """Compute the 'rta' column: seconds until the next "marker" row of the same day.

    Rows are grouped by calendar date of 'ts'. Within a day, every run of
    consecutive rows sharing the same 'nearest_stop' contributes one marker:
    the row of that run with the smallest 'halte_distance' (the earliest such
    row on ties). Each row's 'rta' is the time in seconds from its 'ts' to the
    'ts' of the first marker whose index label is greater than the row's own
    label, or 0 when there is no such marker.

    Requirements visible in the code:
      * 'ts' must be a datetime column (the `.dt` accessor is used).
      * 'nearest_stop', 'halte_distance' and 'time_diff' columns must exist;
        'time_diff' is dropped from the result.
      * Markers are compared with `>` on index labels, so labels need to be
        orderable and increasing in row order for "next" to be meaningful.

    Side effect: the helper column 'tanggal' is written onto the DataFrame
    object that was passed in; it is dropped only from the returned frame.

    Returns:
        A new DataFrame with a fresh 0..n-1 index, an added 'rta' column, and
        without 'tanggal' / 'time_diff'.
    """
    

    # Add a date column so that each day is processed separately
    df['tanggal'] = df['ts'].dt.date

    # Per-day routine
    def process_daily_group(daily_df):
        # Index labels of the marker rows, in iteration order.
        marker_indices = []
        # State of the current run of identical 'nearest_stop' values.
        current_halte = None
        current_min_dist = None
        current_min_idx = None

        for i, row in daily_df.iterrows():
            halte = row['nearest_stop']
            dist = row['halte_distance']

            if halte != current_halte:
                # A new run starts: record the marker of the run that just ended.
                if current_min_idx is not None:
                    marker_indices.append(current_min_idx)

                current_halte = halte
                current_min_dist = dist
                current_min_idx = i
            else:
                # Same run: keep the row with the strictly smallest distance.
                if dist < current_min_dist:
                    current_min_dist = dist
                    current_min_idx = i

        # Record the marker of the final run.
        if current_min_idx is not None:
            marker_indices.append(current_min_idx)

        def get_next_marker(idx):
            # First marker whose index label is greater than `idx`, else None.
            for marker in marker_indices:
                if marker > idx:
                    return marker
            return None

        def compute_rta_row(row):
            # `row.name` is the row's index label.
            next_marker = get_next_marker(row.name)
            if next_marker is not None:
                return (daily_df.loc[next_marker, 'ts'] - row['ts']).total_seconds()
            else:
                # No later marker on this day.
                return 0

        daily_df['rta'] = daily_df.apply(compute_rta_row, axis=1)

        return daily_df

    # Process each day separately
    df = df.groupby('tanggal', group_keys=False).apply(process_daily_group).reset_index(drop=True)

    # These drops act on the new frame produced above, not on the caller's object.
    df.drop(columns=['tanggal'], inplace=True)
    df.drop(columns = ["time_diff"], inplace = True)

    return df

In [15]:
def add_time_series_features(df, ts_column='ts'):
    """Add sine/cosine encodings of the timestamp's calendar components.

    The timestamp column is converted to datetime, the components year, month,
    day, weekday, ISO week of year, hour, minute, second and quarter are
    written as columns, and sin/cos pairs are derived for hour (period 24),
    minute (60), second (60), weekday (7), month (12) and weekofyear (52).
    The plain component columns are then dropped from the returned frame.

    Side effect: all of the above columns (including the plain components)
    are written onto the DataFrame that was passed in; only the returned
    frame has the plain components removed.

    Args:
        df: DataFrame containing `ts_column`.
        ts_column: name of the timestamp column.

    Returns:
        A new DataFrame with the `<component>_sin` / `<component>_cos` columns.
    """
    df[ts_column] = pd.to_datetime(df[ts_column])
    stamps = df[ts_column].dt

    # Plain calendar components (removed again before returning).
    basic_parts = {
        'year': stamps.year,
        'month': stamps.month,
        'day': stamps.day,
        'weekday': stamps.weekday,
        'weekofyear': stamps.isocalendar().week.astype(int),
        'hour': stamps.hour,
        'minute': stamps.minute,
        'second': stamps.second,
        'quarter': stamps.quarter,
    }
    for part_name, part_values in basic_parts.items():
        df[part_name] = part_values

    # Cyclical encodings: (component, period), in output column order.
    cycles = (
        ('hour', 24),
        ('minute', 60),
        ('second', 60),
        ('weekday', 7),
        ('month', 12),
        ('weekofyear', 52),
    )
    for part_name, period in cycles:
        df[f'{part_name}_sin'] = np.sin(2 * np.pi * df[part_name] / period)
        df[f'{part_name}_cos'] = np.cos(2 * np.pi * df[part_name] / period)

    return df.drop(columns=list(basic_parts))

In [16]:
def has_passed_nearest_stop(row, flowcoord_dict):
    """Tell whether the bus is at or beyond its nearest stop along the track.

    The track of the row's color (lower-cased 'color') is turned into a
    polyline; both the stop named by `row['nearest_stop']` and the bus
    position are projected onto it and their distances along the line are
    compared.

    Args:
        row: mapping with 'color', 'nearest_stop', 'lat' and 'lon'.
        flowcoord_dict: maps a color to its list of track points.

    Returns:
        True when the bus projection is >= the stop projection, False
        otherwise; `np.nan` when the color has no track or the stop name is
        not found among the track points.
    """
    track_points = flowcoord_dict.get(row['color'].lower())
    if track_points is None:
        return np.nan  # no track data for this color

    # Shapely expects (x, y) = (longitude, latitude).
    track_line = LineString([(pt["longitude"], pt["latitude"]) for pt in track_points])

    # First track point whose name matches the nearest stop.
    stop_marker = next(
        (pt for pt in track_points if pt.get("name") == row['nearest_stop']),
        None,
    )
    if stop_marker is None:
        return np.nan

    # Distance along the line for the stop and for the bus.
    stop_offset = track_line.project(Point(stop_marker["longitude"], stop_marker["latitude"]))
    bus_offset = track_line.project(Point(row['lon'], row['lat']))

    return bus_offset >= stop_offset

In [17]:
def adjust_local_min_rta_improved(df, dwell_time=15, tolerance_distance=10):
    """Overwrite 'rta' on the closest-to-stop row of each run of identical stops.

    The frame is scanned in consecutive runs of rows sharing the same
    'nearest_stop'. In each run the row with the smallest 'halte_distance' is
    selected; if that row has not passed the stop, or has passed it but is
    closer than `tolerance_distance`, its 'rta' is replaced by
    `halte_distance * 3.6 / 8`.
    NOTE(review): that expression looks like a travel time in seconds at
    8 km/h for a distance in meters -- confirm the intended speed.

    Rows are addressed with `df.loc[<position>]`, so `df` is expected to have
    a default 0..n-1 index. The frame is modified in place and returned.

    Fix over the previous version: a run now always contains at least its
    first row. Previously a NaN 'nearest_stop' (NaN != NaN) produced an empty
    run, the cursor never advanced, and the function looped forever.

    Args:
        df: DataFrame with 'nearest_stop', 'halte_distance', 'passed_stop'
            and 'rta' columns.
        dwell_time: accepted for compatibility; not used.
        tolerance_distance: distance below which a row that already passed
            the stop is still adjusted.
    """
    n = len(df)
    start = 0
    while start < n:
        current_halte = df.loc[start, 'nearest_stop']
        # The row at `start` always belongs to its own run; this guarantees
        # progress even when the stop value does not compare equal to itself.
        end = start + 1
        while end < n and df.loc[end, 'nearest_stop'] == current_halte:
            end += 1

        distances = df.iloc[start:end]['halte_distance']
        # Skip runs without any usable distance (idxmin is undefined there).
        if distances.notna().any():
            idx_min = distances.idxmin()
            passed = df.loc[idx_min, 'passed_stop']
            # Adjust when not yet passed, or passed but still within tolerance.
            if (not passed) or (passed and df.loc[idx_min, 'halte_distance'] < tolerance_distance):
                df.loc[idx_min, 'rta'] = df.loc[idx_min, 'halte_distance'] * 3.6 / 8
        start = end
    return df


In [18]:
def separate_outside_and_short_route(df, flowcoord_dict, start_stop='tempat_turun_asrama', end_stop='asrama_ui_01_end', threshold_deg=0.0005):
    """Split rows into off-track, short-route and normal-route groups.

    For every color in `flowcoord_dict` two polylines are built: the full
    track, and the "short route" running from the point named `start_stop` to
    the first subsequent point named `end_stop`. Each row is then classified
    by planar distance (in degrees) from its (lon, lat) position:

    * farther than `threshold_deg` from the full track  -> outside
    * otherwise within `threshold_deg` of the short route -> short route
    * otherwise                                           -> normal route

    Args:
        df: DataFrame with 'color', 'lat' and 'lon' columns.
        flowcoord_dict: maps a lower-case color to its list of track points
            (dicts with 'name', 'latitude', 'longitude').
        start_stop, end_stop: names delimiting the short route.
        threshold_deg: distance threshold in degrees (0.0005 is roughly 50 m).

    Returns:
        (df_outside_flowcoord, df_short_route, df_normal_route), each a subset
        of a copy of `df`.

    Raises:
        ValueError: if a track does not contain `start_stop` followed by
            `end_stop`. Previously this case either raised an opaque
            TypeError or silently used the beginning of the track as the
            short route.
        KeyError: if a row's color has no entry in `flowcoord_dict`.
    """
    df = df.copy()

    # Polylines of the full track and of the short segment, per color.
    full_routes = {}
    short_routes = {}

    for color, coords in flowcoord_dict.items():
        full_routes[color] = LineString([(pt['longitude'], pt['latitude']) for pt in coords])

        # Locate the short segment: last start_stop seen before the first end_stop.
        start_idx, end_idx = None, None
        for i, pt in enumerate(coords):
            if pt['name'] == start_stop:
                start_idx = i
            elif pt['name'] == end_stop:
                end_idx = i
                break

        if start_idx is None or end_idx is None:
            raise ValueError(
                f"Track '{color}' must contain stop '{start_stop}' followed by "
                f"'{end_stop}' (found start index {start_idx}, end index {end_idx})."
            )

        # The scan above guarantees start_idx < end_idx at this point.
        short_route_coords = coords[start_idx:end_idx + 1]
        short_routes[color] = LineString(
            [(pt['longitude'], pt['latitude']) for pt in short_route_coords]
        )

    # Index labels of the rows in each group.
    outside_flowcoord = []
    short_route_data = []
    normal_route_data = []

    for idx, row in df.iterrows():
        bus_point = Point(row['lon'], row['lat'])
        color = row['color'].lower()

        # Too far from the full track: treat as off-track.
        if bus_point.distance(full_routes[color]) > threshold_deg:
            outside_flowcoord.append(idx)
            continue

        # On the track: check whether it is on the special short segment.
        if bus_point.distance(short_routes[color]) <= threshold_deg:
            short_route_data.append(idx)
        else:
            normal_route_data.append(idx)

    df_outside_flowcoord = df.loc[outside_flowcoord]
    df_short_route = df.loc[short_route_data]
    df_normal_route = df.loc[normal_route_data]

    return df_outside_flowcoord, df_short_route, df_normal_route

In [19]:
def filter_train_data(df_normal, df_short_route, df_outside):
    """Filter the three training groups on 'rta' and 'speed'.

    * Normal group: keep rows with 0 <= rta <= 500 and 0 <= speed <= 100.
    * Short-route and outside groups: keep rows with rta >= 0 (no upper
      bound on rta) and 0 <= speed <= 100.

    Returns:
        (train_df_normal, train_df_short_route, train_df_outside) as
        independent copies.
    """
    def _speed_in_range(frame):
        return frame['speed'].between(0, 100)

    keep_normal = df_normal['rta'].between(0, 500) & _speed_in_range(df_normal)
    keep_short = df_short_route['rta'].ge(0) & _speed_in_range(df_short_route)
    keep_outside = df_outside['rta'].ge(0) & _speed_in_range(df_outside)

    return (
        df_normal[keep_normal].copy(),
        df_short_route[keep_short].copy(),
        df_outside[keep_outside].copy(),
    )


In [20]:
def cut_excess_same_group_speed_zero(
    df,
    group_cols = [
        'speed',
        'halte_distance',
        'route_distance',
        'nearest_stop',
        'distance_to_stop',
        'distance_to_prev_stop',
        'distance_to_next_stop'
    ],
    max_rows=2
):
    """Thin out repeated standstill rows.

    Rows with speed == 0 are grouped by `group_cols` (NaN keys included); in
    every group with more than `max_rows` rows only the `max_rows` rows with
    the smallest 'rta' are kept. Rows with speed != 0 are left untouched.

    The result has a fresh 0..n-1 index: first the non-zero-speed rows in
    their original order, then the kept zero-speed rows group by group.

    Fix over the previous version: when no row has speed == 0 there is
    nothing to concatenate, and `pd.concat([])` raised ValueError. That case
    now returns the non-zero rows with a dropped count of 0.

    Args:
        df: DataFrame containing 'speed', 'rta' and all of `group_cols`.
        group_cols: columns that define "identical" standstill rows.
        max_rows: maximum rows kept per group.

    Returns:
        (filtered DataFrame, number of rows dropped).
    """
    # Zero-speed rows are candidates for thinning; the rest passes through.
    df_speed_zero = df[df['speed'] == 0].copy()
    df_not_zero = df[df['speed'] != 0]

    filtered_groups = []
    # list(...) so the shared default list is never handed to pandas directly.
    for _, group_df in df_speed_zero.groupby(list(group_cols), dropna=False):
        if len(group_df) > max_rows:
            # Keep the rows with the smallest rta.
            group_df = group_df.sort_values('rta', ascending=True).head(max_rows)
        filtered_groups.append(group_df)

    if filtered_groups:
        df_speed_zero_filtered = pd.concat(filtered_groups, ignore_index=True)
        df_filtered = pd.concat([df_not_zero, df_speed_zero_filtered], ignore_index=True)
    else:
        # No standstill rows at all: nothing to thin out.
        df_filtered = df_not_zero.reset_index(drop=True)

    dropped_count = len(df) - len(df_filtered)

    return df_filtered, dropped_count

In [21]:
def remove_outliers_iqr(df, column, factor=1.5):
    """Return a copy of the rows whose `column` value lies within the IQR fence.

    The fence is [Q1 - factor * IQR, Q3 + factor * IQR], bounds inclusive,
    where Q1 / Q3 are the 25th / 75th percentiles of `column`.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    margin = factor * (q3 - q1)
    inside_fence = df[column].between(q1 - margin, q3 + margin)
    return df[inside_fence].copy()

## Process

In [22]:
# Load and define stops and flowcoord dictionary


def process(train_df, test_df):
    """Run the full feature pipeline on the train and test frames.

    Relies on module-level state: `preprocess_json()` and
    `update_flowcoord_tracks()` are called first and set the globals
    (`routes`, `track_data`, `flowcoord_dict`) that the later steps read.

    Steps, in order: timestamp conversion, distance to nearest stop and to
    the track, nearest-stop assignment, previous/next-stop distances,
    time differences and 'rta' (computed for both train and test), cyclical
    time features, and the 'passed_stop' flag. The train frame is then split
    into outside / short-route / normal groups, each group is filtered,
    thinned of repeated standstill rows and stripped of 'rta' IQR outliers.
    The test frame is only split into the same three groups.

    A summary of dropped rows and group shapes is printed.

    Returns:
        dict with the cleaned train groups ('train_df_normal_clean',
        'train_df_short_route_clean', 'train_df_outside_clean'), the test
        groups ('test_normal', 'test_short_route', 'test_outside') and the
        fully processed 'train_df' / 'test_df'.
    """
    
    # Build lookup globals and insert the extra stop into routes / tracks.
    preprocess_json()
    update_flowcoord_tracks()
    # Ordered (stop name, coords) pairs per route.
    red_stops = list(routes["RUTE_MERAH"].items())
    blue_stops = list(routes["RUTE_BIRU"].items())
    train_df, test_df = convert_timestamps(train_df, test_df)
    train_df, test_df = update_nearest_halte(train_df, test_df)
    train_df, test_df = update_route_distance(train_df, test_df, track_data)
    train_df, test_df = initialize_stop_columns(train_df, test_df)
    train_df = assign_nearest_stop_from_flowcoord(train_df, flowcoord_dict)
    test_df = assign_nearest_stop_from_flowcoord(test_df, flowcoord_dict)
    test_df = add_prev_next_stop_features(test_df, red_stops, blue_stops)
    train_df = add_prev_next_stop_features(train_df, red_stops, blue_stops)
    # compute_rta drops the 'time_diff' column that compute_time_diff adds.
    train_df = compute_time_diff(train_df)
    train_df = compute_rta(train_df)
    
    test_df = compute_time_diff(test_df)
    test_df = compute_rta(test_df)
    
    train_df = add_time_series_features(train_df)
    test_df = add_time_series_features(test_df)
    train_df['passed_stop'] = train_df.apply(lambda row: has_passed_nearest_stop(row, flowcoord_dict), axis=1)
    test_df['passed_stop'] = test_df.apply(lambda row: has_passed_nearest_stop(row, flowcoord_dict), axis=1)
    
    # Separate training data into groups
    df_outside, df_short_route, df_normal = separate_outside_and_short_route(train_df, flowcoord_dict)
    
    # Filter train data on rta and speed ranges
    train_df_normal, train_df_short_route, train_df_outside = filter_train_data(df_normal, df_short_route, df_outside)
    
    # Remove excessive rows (based on speed==0 block logic)
    train_df_normal, dropped_count = cut_excess_same_group_speed_zero(train_df_normal)
    train_df_short_route, dropped_count2 = cut_excess_same_group_speed_zero(train_df_short_route)
    train_df_outside, dropped_count3 = cut_excess_same_group_speed_zero(train_df_outside)
    
    # Remove outliers on 'rta'
    train_df_normal_clean = remove_outliers_iqr(train_df_normal, 'rta', factor=1.5)
    train_df_short_route_clean = remove_outliers_iqr(train_df_short_route, 'rta', factor=1.5)
    train_df_outside_clean = remove_outliers_iqr(train_df_outside, 'rta', factor=1.5)
    
    # Separate test data into groups using flowcoord
    test_outside, test_short_route, test_normal = separate_outside_and_short_route(test_df, flowcoord_dict)
    
    # "before" shapes are the groups prior to filtering, thinning and outlier removal.
    print(f"Number of rows dropped: {dropped_count} {dropped_count2} {dropped_count3}")
    print("Normal group shape before:", df_normal.shape, "after:", train_df_normal_clean.shape)
    print("Short route group shape before:", df_short_route.shape, "after:", train_df_short_route_clean.shape)
    print("Outside group shape before:", df_outside.shape, "after:", train_df_outside_clean.shape)
    
    return {
        'train_df_normal_clean': train_df_normal_clean,
        'train_df_short_route_clean': train_df_short_route_clean,
        'train_df_outside_clean': train_df_outside_clean,
        'test_normal': test_normal,
        'test_short_route': test_short_route,
        'test_outside': test_outside,
        'train_df': train_df,  # full processed train_df if needed
        'test_df': test_df     # full processed test_df if needed
    }

## Models

In [23]:
from catboost import CatBoostRegressor

def build_models_with_best_params(train_processed, best_params, 
                                  cat_features=['color', 'nearest_stop', 'passed_stop'], 
                                  exclude_cols=['rta', 'imei', 'ts']):
    """Train one CatBoost regressor per route group with the given parameters.

    Args:
        train_processed: dict with 'df_normal', 'df_short_route' and
            'df_outside' training frames, each containing an 'rta' target.
        best_params: dict with 'normal', 'short' and 'outside' keyword
            dictionaries passed to `CatBoostRegressor`.
        cat_features: columns treated as categorical by every model.
        exclude_cols: columns left out of the feature set.

    Returns:
        (models, feature_lists) where `models` maps 'normal' / 'short' /
        'outside' to the fitted model and `feature_lists` maps
        'features_normal' / 'features_short' / 'features_outside' to the
        feature column names used for that group.
    """
    # (model key, key of the group's frame in train_processed)
    groups = (
        ('normal', 'df_normal'),
        ('short', 'df_short_route'),
        ('outside', 'df_outside'),
    )

    # Feature columns: everything except the excluded columns.
    features_by_group = {
        group: [col for col in train_processed[frame_key].columns if col not in exclude_cols]
        for group, frame_key in groups
    }

    # Feature matrix and target per group.
    training_sets = {
        group: (
            train_processed[frame_key][features_by_group[group]],
            train_processed[frame_key]['rta'],
        )
        for group, frame_key in groups
    }

    # Instantiate every model before any fitting starts.
    models = {
        group: CatBoostRegressor(
            **best_params[group], random_seed=42, verbose=100, cat_features=cat_features
        )
        for group, _ in groups
    }

    for group, _ in groups:
        features, target = training_sets[group]
        models[group].fit(features, target)

    feature_lists = {
        f'features_{group}': features_by_group[group] for group, _ in groups
    }

    return models, feature_lists




## Infer


In [24]:
def infer(models, train_processed, test_processed):
    """Predict 'rta' for each test group with its matching model.

    Feature columns are derived from the training frames (all columns except
    'rta', 'imei' and 'ts'). Each test group is copied, scored into a
    'predicted_rta' column, and the three groups are concatenated.

    Args:
        models: dict with 'normal', 'short' and 'outside' fitted models.
        train_processed: dict with 'df_normal', 'df_short_route', 'df_outside'.
        test_processed: dict with 'test_normal', 'test_short_route',
            'test_outside'.

    Returns:
        One DataFrame with all test rows, sorted by 'Id' when that column
        exists (otherwise by 'ts'), with a fresh 0..n-1 index.
    """
    exclude_cols = ['rta', 'imei', 'ts']

    # (model key, training-frame key, test-frame key)
    groups = (
        ('normal', 'df_normal', 'test_normal'),
        ('short', 'df_short_route', 'test_short_route'),
        ('outside', 'df_outside', 'test_outside'),
    )

    feature_cols = {
        group: [col for col in train_processed[train_key].columns if col not in exclude_cols]
        for group, train_key, _ in groups
    }

    # Work on copies so the caller's test frames stay untouched.
    scored = {group: test_processed[test_key].copy() for group, _, test_key in groups}

    for group, _, _ in groups:
        frame = scored[group]
        frame['predicted_rta'] = models[group].predict(frame[feature_cols[group]])

    final_predictions = pd.concat([scored[group] for group, _, _ in groups])

    sort_column = 'Id' if 'Id' in final_predictions.columns else 'ts'
    return final_predictions.sort_values(by=sort_column).reset_index(drop=True)

## Full run

In [25]:
# Run the whole feature pipeline once on the loaded train/test frames.
processed = process(train_df, test_df)

# Cleaned training groups, keyed the way build_models_with_best_params() and
# infer() expect them.
train_processed = {
    group_key: processed[f'train_{group_key}_clean']
    for group_key in ('df_normal', 'df_short_route', 'df_outside')
}

# Test groups, passed through under their original keys.
test_processed = {
    group_key: processed[group_key]
    for group_key in ('test_normal', 'test_short_route', 'test_outside')
}

# CatBoost hyperparameters per route group.
best_params = {
    'normal': {
        'iterations': 935,
        'learning_rate': 0.026969688839420358,
        'depth': 9,
        'l2_leaf_reg': 0.26599614070518346,
    },
    'short': {
        'iterations': 707,
        'learning_rate': 0.011882972946608549,
        'depth': 10,
        'l2_leaf_reg': 0.014346258174116461,
    },
    'outside': {
        'iterations': 623,
        'learning_rate': 0.04864367507269438,
        'depth': 7,
        'l2_leaf_reg': 2.0705930414149356,
    },
}

# Train one model per group, then score the test groups.
models, feature_lists = build_models_with_best_params(train_processed, best_params)

final_predictions = infer(models, train_processed, test_processed)

  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)
  df = df.groupby('tanggal', group_keys=False).apply(process_daily_group).reset_index(drop=True)
  df = df.groupby('tanggal', group_keys=False).apply(process_daily_group).reset_index(drop=True)
  return lib.line_locate_point(line, other)
  return lib.line_locate_point(line, other)


Number of rows dropped: 3033 1619 64
Normal group shape before: (31546, 26) after: (26481, 26)
Short route group shape before: (3913, 26) after: (1964, 26)
Outside group shape before: (1932, 26) after: (1740, 26)
0:	learn: 40.9272727	total: 82ms	remaining: 1m 16s
100:	learn: 30.4854836	total: 2.07s	remaining: 17.1s
200:	learn: 28.5723483	total: 4.1s	remaining: 15s
300:	learn: 27.2124939	total: 6.13s	remaining: 12.9s
400:	learn: 26.1121624	total: 8.22s	remaining: 10.9s
500:	learn: 25.1301165	total: 10.3s	remaining: 8.88s
600:	learn: 24.1734010	total: 12.3s	remaining: 6.83s
700:	learn: 23.3346100	total: 14.3s	remaining: 4.77s
800:	learn: 22.5462028	total: 16.4s	remaining: 2.74s
900:	learn: 21.8412845	total: 18.4s	remaining: 695ms
934:	learn: 21.6200958	total: 19.1s	remaining: 0us
0:	learn: 206.0527040	total: 18ms	remaining: 12.7s
100:	learn: 153.1890502	total: 1.43s	remaining: 8.57s
200:	learn: 120.4697874	total: 2.83s	remaining: 7.13s
300:	learn: 98.0825585	total: 4.25s	remaining: 5.73s

In [26]:
# Write each cleaned training group to "<group key>.csv" in the working
# directory (df_normal.csv, df_short_route.csv, df_outside.csv).
# Note: `key` and `df` remain defined as notebook globals after the loop.
for key, df in train_processed.items():
    df.to_csv(f"{key}.csv", index=False)


In [27]:
# Build the submission table; negative predictions are clipped to 0.
# Requires an 'Id' column in the test data.
submission = pd.DataFrame({
    'id': final_predictions['Id'],
    'rta': final_predictions['predicted_rta'].clip(lower=0),
})

submission.to_csv("submission_pake_3model_highest_ditambainmethodrta_tuned_asrama.csv", index=False)
print("Submission preview:")
print(submission.head())


Submission preview:
   id         rta
0   0   93.588947
1   1   94.940226
2   2  102.297535
3   3   72.651790
4   4   86.034883
