# Voyage Creator

Builds a voyage dataset from AIS tracking data using `VoyageCreator` from `src/voyage_creator.py`.

**Pipeline:**
1. Load AIS data (cargo ships, vessel_type 70–79) and port reference data
2. Detect individual port visits (spatial proximity + speed + gap-based splitting)
3. Label every ping with `current_port`, `origin_port`, and `destination_port`
4. Group into voyages — one voyage per sea leg between consecutive port visits

In [1]:
import sys
sys.path.insert(0, '..')
from pathlib import Path

import pandas as pd
import geopandas as gpd
from shapely.geometry import Point

from src.methods import dms_to_dd
from src.voyage_creator import VoyageCreator

## 1. Load data

In [2]:
# Load every AIS CSV in the data folder, keep only cargo vessels, and order
# the pings chronologically per vessel.
folder = Path("../data/ais")

# Sorted so the concatenation order is reproducible from run to run.
csv_files = sorted(folder.glob("*.csv"))
if not csv_files:
    # pd.concat([]) would otherwise fail with an opaque
    # "No objects to concatenate" error that hides the real cause.
    raise FileNotFoundError(f"No AIS CSV files found in {folder.resolve()}")

df_ais = pd.concat(
    [pd.read_csv(csv_file) for csv_file in csv_files],
    ignore_index=True,
)

# vessel_type 70–79 is the cargo-ship range this notebook works with.
# .copy() so the column assignment below does not write into a view of df_ais.
df_cargo = df_ais[df_ais["vessel_type"].between(70, 79)].copy()
df_cargo["base_date_time"] = pd.to_datetime(df_cargo["base_date_time"])
df_cargo = df_cargo.sort_values(["mmsi", "base_date_time"]).reset_index(drop=True)

print(f"Cargo ship pings : {len(df_cargo):,}")
print(f"Unique vessels   : {df_cargo['mmsi'].nunique():,}")
print(f"Date range       : {df_cargo['base_date_time'].min()} → {df_cargo['base_date_time'].max()}")

Cargo ship pings : 9,974
Unique vessels   : 3,029
Date range       : 2025-01-01 00:00:00 → 2025-05-28 20:06:49


In [3]:
# Load the port reference table and turn it into a GeoDataFrame of points.
df_ports = pd.read_csv("../data/ports/ports.csv")

# Convert the raw coordinate columns with the project helper dms_to_dd
# (presumably degrees-minutes-seconds -> decimal degrees; confirm in src/methods.py).
df_ports["lat_dd"] = df_ports["latitude"].apply(dms_to_dd)
df_ports["lon_dd"] = df_ports["longitude"].apply(dms_to_dd)

# Build all geometries in one vectorized call instead of constructing a
# shapely Point per row via DataFrame.apply(axis=1); x = longitude, y = latitude.
df_ports["geometry"] = gpd.points_from_xy(df_ports["lon_dd"], df_ports["lat_dd"])
gdf_ports = gpd.GeoDataFrame(df_ports, geometry="geometry", crs="EPSG:4326")

print(f"Ports loaded: {len(gdf_ports):,}")

Ports loaded: 657


## 2. Detect port visits

In [4]:
# Settings for port-visit detection, passed straight through to VoyageCreator.
visit_settings = {
    "radius_nm": 10,
    "max_speed_knots": 1.5,
    "gap_threshold_h": 24,
}
creator = VoyageCreator(gdf_ports, **visit_settings)

port_visits = creator.find_port_visits(df_cargo)

# Headline counts for the detected visits, followed by a preview of the table.
n_visits = len(port_visits)
n_visit_vessels = port_visits["mmsi"].nunique()
n_visit_ports = port_visits["portName"].nunique()

print(f"Port visits    : {n_visits:,}")
print(f"Unique vessels : {n_visit_vessels:,}")
print(f"Unique ports   : {n_visit_ports:,}")
display(port_visits.head(10))

Port visits    : 4,847
Unique vessels : 1,504
Unique ports   : 214


Unnamed: 0,mmsi,portName,entry_time,exit_time,duration_hours
0,205700000,Gretna,2025-01-01 00:02:14,2025-01-01 00:02:14,0.0
1,205700000,New Orleans,2025-01-01 00:02:14,2025-01-01 00:02:14,0.0
2,205728000,Gretna,2025-03-23 00:02:39,2025-03-23 00:02:39,0.0
3,205728000,New Orleans,2025-03-23 00:02:39,2025-03-23 00:02:39,0.0
4,207138000,Savannah,2025-01-01 00:02:15,2025-01-01 00:02:15,0.0
5,209207000,Port Everglades,2025-05-28 00:00:59,2025-05-28 00:00:59,0.0
6,209277000,Convent,2025-01-01 00:01:37,2025-01-01 00:01:37,0.0
7,209277000,St. James,2025-01-01 00:01:37,2025-01-01 00:01:37,0.0
8,209390000,Creosote,2025-01-10 00:02:52,2025-01-10 00:02:52,0.0
9,209390000,Fort Ward,2025-01-10 00:02:52,2025-01-10 00:02:52,0.0


## 3. Label every ping

| Column | At port | At sea |
|---|---|---|
| `current_port` | port name | `NaN` |
| `origin_port` | `NaN` | last port departed (may be `NaN` if unknown) |
| `destination_port` | `NaN` | next port to arrive at (may be `NaN` if unknown) |

In [5]:
df_labeled = creator.label_pings(df_cargo, port_visits)

# A ping counts as "at port" when its current_port label is filled in;
# everything else is treated as "at sea".
in_port_mask = df_labeled["current_port"].notna()
at_port = in_port_mask.sum()
at_sea = (~in_port_mask).sum()
with_destination = df_labeled["destination_port"].notna().sum()
port_share = at_port / len(df_labeled) * 100

print(f"Pings at port          : {at_port:,} ({port_share:.1f}%)")
print(f"Pings at sea           : {at_sea:,}")
print(f"Pings with destination : {with_destination:,}")

# Preview only the identifying columns and the three new labels.
preview_columns = [
    "mmsi",
    "base_date_time",
    "sog",
    "current_port",
    "origin_port",
    "destination_port",
]
display(df_labeled[preview_columns].head(20))

Pings at port          : 3,018 (30.3%)
Pings at sea           : 6,956
Pings with destination : 813


Unnamed: 0,mmsi,base_date_time,sog,current_port,origin_port,destination_port
0,205221000,2025-05-28 00:01:51,11.1,,,
1,205700000,2025-01-01 00:02:14,0.1,Gretna,,
2,205728000,2025-03-23 00:02:39,0.0,Gretna,,
3,205789000,2025-01-10 00:01:14,0.1,,,
4,205789000,2025-01-10 00:04:14,0.1,,,
5,205789000,2025-05-28 00:00:05,8.6,,,
6,205789000,2025-05-28 00:01:14,8.6,,,
7,205789000,2025-05-28 00:02:15,8.5,,,
8,205789000,2025-05-28 00:03:16,8.4,,,
9,205790000,2025-05-28 20:06:05,11.7,,,


## 4. Build voyage records

In [6]:
# build_voyages returns the labeled pings (rebound to df_labeled here) together
# with the voyage table; the "Explore" cell below reads a voyage_id column
# from df_labeled after this step.
df_labeled, df_voyages = creator.build_voyages(df_labeled, port_visits)

# Thousands separator added to match the count formatting used by the earlier cells.
print(f"Voyages constructed : {len(df_voyages):,}")
display(df_voyages.head(10))

Voyages constructed : 487


Unnamed: 0,voyage_id,mmsi,departure_port,departure_time,arrival_port,arrival_time,duration_hours,ping_count
0,0,209390000,Winslow,2025-01-10 00:02:52,Quartermaster Harbor,2025-03-23 00:02:10,1727.988333,0
1,1,210065000,Port Manatee,2025-02-20 00:00:47,Port Manatee,2025-05-28 00:01:00,2328.003611,0
2,2,210959000,Panama City,2025-01-10 00:01:58,Port Everglades,2025-02-20 00:00:20,983.972778,0
3,3,210959000,Port Everglades,2025-02-20 00:00:20,Port Everglades,2025-05-28 00:00:07,2327.996389,0
4,4,212205000,Newport News,2025-02-20 00:05:16,Boston,2025-05-28 00:02:43,2327.9575,0
5,5,212275000,New Orleans,2025-01-10 00:02:57,Destrehan,2025-05-28 00:01:49,3311.981111,0
6,6,212533000,New Orleans,2025-02-20 00:03:44,Gretna,2025-03-23 00:01:08,743.956667,0
7,7,215139000,Port Manatee,2025-01-01 00:02:41,Port Manatee,2025-01-10 00:00:41,215.966667,0
8,8,215196000,Baytown,2025-01-01 00:01:58,Miami,2025-01-10 00:02:49,216.014167,0
9,9,215196000,Miami,2025-01-10 00:02:49,Creosote,2025-05-28 00:01:34,3311.979167,0


## 5. Explore

In [7]:
# High-level summary of the voyage table: counts first, then the distribution
# of voyage durations. Counts use the same thousands-separator formatting as
# the summaries printed by the earlier cells.
print("=== Voyage dataset summary ===")
print(f"Total voyages   : {len(df_voyages):,}")
print(f"Unique vessels  : {df_voyages['mmsi'].nunique():,}")
# A "route" is a distinct (departure_port, arrival_port) pair.
print(f"Unique routes   : {df_voyages.groupby(['departure_port', 'arrival_port']).ngroups:,}")
print()
print("Duration statistics (hours):")
display(df_voyages["duration_hours"].describe().round(1))

=== Voyage dataset summary ===
Total voyages   : 487
Unique vessels  : 298
Unique routes   : 231

Duration statistics (hours):


count     487.0
mean     1104.5
std       799.9
min       215.9
25%       216.0
50%       983.9
75%      1584.0
max      3528.0
Name: duration_hours, dtype: float64

In [8]:
# Rank (departure_port, arrival_port) pairs by how many voyages they have and
# report the mean duration for each pair.
route_stats = df_voyages.groupby(["departure_port", "arrival_port"]).agg(
    count=("voyage_id", "count"),
    avg_duration_h=("duration_hours", "mean"),
)
top_routes = (
    route_stats.reset_index()
    .sort_values("count", ascending=False)
    .head(10)
    .round(1)
)

print("Most common routes:")
display(top_routes)

# Pings that were assigned to a voyage, i.e. rows whose voyage_id is set.
has_voyage = df_labeled["voyage_id"].notna()
sea_leg_columns = [
    "mmsi",
    "base_date_time",
    "sog",
    "origin_port",
    "destination_port",
    "voyage_id",
]

print("\nSample sea-leg pings:")
display(df_labeled.loc[has_voyage, sea_leg_columns].head(20))

Most common routes:


Unnamed: 0,departure_port,arrival_port,count,avg_duration_h
59,Los Angeles,Long Beach,19,1328.8
77,Morgan City,Morgan City,16,1054.5
10,Baltimore,Baltimore,16,1111.5
198,Texas City,Galveston,15,940.8
70,Miami,Miami,12,1012.0
208,Vancouver,Portland,11,1138.9
27,Cape Charles,Cape Charles,9,706.7
137,Portsmouth,Norfolk,9,741.3
125,Port Neches,Beaumont,9,1000.0
2,Anacortes,Anacortes,9,975.9



Sample sea-leg pings:


Unnamed: 0,mmsi,base_date_time,sog,origin_port,destination_port,voyage_id
865,255801370,2025-02-20 00:00:01,10.9,Seattle,Seattle,31
866,255801370,2025-02-20 00:01:12,11.0,Seattle,Seattle,31
867,255801370,2025-02-20 00:02:22,11.1,Seattle,Seattle,31
868,255801370,2025-02-20 00:03:31,10.9,Seattle,Seattle,31
1800,309822000,2025-02-20 00:00:04,11.0,Oakland,Oakland,75
1801,309822000,2025-02-20 00:01:13,11.1,Oakland,Oakland,75
1802,309822000,2025-02-20 00:02:24,11.2,Oakland,Oakland,75
1803,309822000,2025-02-20 00:03:34,11.2,Oakland,Oakland,75
1901,311000808,2025-03-23 00:00:18,9.7,Port Manatee,Tampa,79
1902,311000808,2025-03-23 00:01:27,9.8,Port Manatee,Tampa,79
