In [1]:
import os
import numpy as np
from pathlib import Path
import pandas as pd

# Project root (auto-detect)
BASE_PATH = Path.cwd().parents[0]

# Data paths
SILVER_PATH = BASE_PATH / "data" / "silver"
GOLD_PATH = BASE_PATH / "data" / "gold"

# Ensure gold directory exists
GOLD_PATH.mkdir(parents=True, exist_ok=True)

# Read silver dataset
port_events = pd.read_csv(SILVER_PATH / "port_events.csv")



In [2]:
time_cols = ["etaSchedule", "eta", "ata", "etdSchedule", "etd", "atd"]
for col in time_cols:
    if col in port_events.columns:
        port_events[col] = pd.to_datetime(port_events[col], errors="coerce")


## Temporal Traffic Features

In [3]:
port_events["arrival_hour"] = port_events["ata"].dt.hour
port_events["arrival_dayofweek"] = port_events["ata"].dt.dayofweek
port_events["is_weekend"] = (port_events["arrival_dayofweek"] >= 5).astype(int)


## Delay & Early-Arrival Logic

In [4]:
# Preserve early-arrival information
port_events["early_arrival_flag"] = (port_events["arrival_delay_hrs"] < 0).astype(int)

# Clip delays (schedule deviation only)
for col in ["arrival_delay_hrs", "departure_delay_hrs"]:
    port_events[col] = port_events[col].clip(lower=0)

# Port stay directly captures congestion
port_events["port_stay_hrs"] = port_events["port_stay_hrs"].clip(lower=0)


In [5]:
port_events.head()

Unnamed: 0,vessel_name,vessel_id,departure_port,arrival_port,etaSchedule,eta,ata,etdSchedule,etd,atd,arrival_delay_hrs,departure_delay_hrs,port_stay_hrs,congestion_flag,arrival_hour,arrival_dayofweek,is_weekend,early_arrival_flag
0,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:23:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1
1,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:29:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1
2,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:30:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1
3,Star,9364722,EETLL,FIHEL,2018-05-04 21:30:00,2018-04-05 21:26:00,2018-04-05 21:46:00,2018-05-04 19:30:00,2018-07-04 15:25:00,2018-04-05 19:21:17,0.0,0.0,0.0,0,21,3,0,1
4,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:32:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1


In [6]:
q50 = port_events["port_stay_hrs"].quantile(0.50)
q80 = port_events["port_stay_hrs"].quantile(0.80)

def congestion_label_quantile(hours):
    if hours <= q50:
        return 0  # Low
    elif hours <= q80:
        return 1  # Medium
    else:
        return 2  # High

port_events["congestion_label"] = port_events["port_stay_hrs"].apply(congestion_label_quantile)


In [7]:
port_events["congestion_label"].value_counts()


congestion_label
0    319427
2      2769
Name: count, dtype: int64

In [8]:
port_events["is_congested"] = (
    port_events["congestion_label"] == 2
).astype(int)


In [9]:
port_events.head()

Unnamed: 0,vessel_name,vessel_id,departure_port,arrival_port,etaSchedule,eta,ata,etdSchedule,etd,atd,arrival_delay_hrs,departure_delay_hrs,port_stay_hrs,congestion_flag,arrival_hour,arrival_dayofweek,is_weekend,early_arrival_flag,congestion_label,is_congested
0,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:23:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1,0,0
1,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:29:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1,0,0
2,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:30:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1,0,0
3,Star,9364722,EETLL,FIHEL,2018-05-04 21:30:00,2018-04-05 21:26:00,2018-04-05 21:46:00,2018-05-04 19:30:00,2018-07-04 15:25:00,2018-04-05 19:21:17,0.0,0.0,0.0,0,21,3,0,1,0,0
4,Megastar,9773064,FIHEL,EETLL,2018-05-04 21:30:00,2018-04-05 21:25:00,2018-04-05 21:32:00,2018-05-04 19:30:00,2018-07-04 15:29:00,2018-04-05 19:18:20,0.0,0.0,0.0,0,21,3,0,1,0,0


In [10]:
port_events = port_events.drop(['congestion_label'], axis = 1)

## Port Encoding

In [11]:
port_events = pd.get_dummies(
    port_events,
    columns=["arrival_port"],
    drop_first=True, dtype = int
)


## Final Feature Set

In [12]:
FEATURE_COLS = [
    "arrival_hour",
    "arrival_dayofweek",
    "is_weekend",
    "arrival_delay_hrs",
    "departure_delay_hrs",
    "port_stay_hrs",
    "early_arrival_flag"
]

PORT_COLS = [c for c in port_events.columns if c.startswith("arrival_port_")]

TARGET_COL = "is_congested"

X = port_events[FEATURE_COLS + PORT_COLS].copy()
y = port_events[TARGET_COL]


## Minimal Feature Filtering

In [13]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0.0)
X = X[X.columns[vt.fit(X).get_support()]]


## Save Gold Dataset

In [14]:
X_final = X.copy()
X_final[TARGET_COL] = y

In [15]:
X_final.head()

Unnamed: 0,arrival_hour,arrival_dayofweek,is_weekend,arrival_delay_hrs,departure_delay_hrs,early_arrival_flag,arrival_port_FIHEL,is_congested
0,21,3,0,0.0,0.0,1,0,0
1,21,3,0,0.0,0.0,1,0,0
2,21,3,0,0.0,0.0,1,0,0
3,21,3,0,0.0,0.0,1,1,0
4,21,3,0,0.0,0.0,1,0,0


In [16]:
X_final.shape

(322196, 8)

In [17]:
X_final.to_csv(
    os.path.join(GOLD_PATH, "ml_features_binary.csv"),
    index=False
)