In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from datetime import timedelta, datetime, timezone
from tqdm import tqdm

In [4]:
def days_to_secs(days):
    return days * 24 * 3600

In [5]:
# Parameters 
timedelta_threshold_days = 100

In [6]:
sched_df = pd.read_csv('../datasets/schedules_to_may_2024.csv', sep='|')

In [7]:
train = pd.read_csv('../datasets/ais_train.csv', sep='|')
train['time'] = pd.to_datetime(train['time'])

In [8]:
sched_df = pd.read_csv('../datasets/schedules_to_may_2024.csv', sep='|')
sched_df['arrivalDate'] = pd.to_datetime(sched_df['arrivalDate'])
sched_df['sailingDate'] = pd.to_datetime(sched_df['sailingDate'])
sched_df = sched_df.sort_values(by=['vesselId', 'arrivalDate'])

In [92]:
def get_next_dest_and_time_left(row, prev_pos_avail=True):
    vesselId, timestamp = row['vesselId'], row['time']
    dest_lon = None
    dest_lat = None
    time_left = None
    for idx, r in sched_df[sched_df['vesselId'] == vesselId].iterrows():
        if timestamp < r['arrivalDate'].replace(tzinfo=None):
            dest_lon = r['portLongitude']
            dest_lat = r['portLatitude']
            time_left = r['arrivalDate'].replace(tzinfo=None) - timestamp
            ret_is_valid = int(not np.isnan(dest_lon) and not np.isnan(dest_lat))
            if not ret_is_valid:
                if not prev_pos_avail:
                    lon = row['longitude']
                    lat = row['latitude']
                else:
                    lon = row['prev_lon']
                    lat = row['prev_lat']
                return lon, lat, 1, ret_is_valid
            return dest_lon, dest_lat, time_left.total_seconds(), ret_is_valid
    if not prev_pos_avail:
        lon = row['longitude']
        lat = row['latitude']
    else:
        lon = row['prev_lon']
        lat = row['prev_lat']
    return lon, lat, 1, 0

def should_be_moored(row):
    vesselId, timestamp = row['vesselId'], row['time']
    for idx, r in sched_df[(sched_df['vesselId'] == vesselId) & (sched_df['arrivalDate'] < timestamp.replace(tzinfo=timezone.utc)) & (sched_df['arrivalDate'] > datetime(2023, 12, 1, 0, 0, tzinfo=timezone.utc))].iterrows():
        if timestamp > r['arrivalDate'].replace(tzinfo=None):
            if timestamp < r['sailingDate'].replace(tzinfo=None):
                return 1 
        else:
            return 0
    return 0

In [60]:
filepath_train = '../datasets/ais_train.csv'
filepath_test = '../datasets/ais_test.csv'

# Load AIS historical data
train = pd.read_csv(filepath_train, sep ='|')  # Replace with your dataset

# Preprocessing
train['time'] = pd.to_datetime(train['time'])
train.sort_values(by=['vesselId', 'time'], inplace=True)

# Feature Engineering
train['prev_lat'] = train.groupby('vesselId')['latitude'].shift(1)
train['prev_lon'] = train.groupby('vesselId')['longitude'].shift(1)
train.dropna(inplace=True)

test1 = train[train['vesselId'] == '61e9f3aab937134a3c4bfe0f'][:100]

# Apply the function and assign the result to multiple columns
test1[['dest_lon', 'dest_lat', 'time_left', 'sched_data_available']] = test1.apply(get_next_dest_and_time_left, axis=1, result_type='expand')
test1.head(100)


Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,prev_lat,prev_lon,dest_lon,dest_lat,time_left,sched_data_available
8129,2024-01-01 17:35:40,4.9,12.4,0,5,0,01-01 20:00,33.96656,134.97752,61e9f3aab937134a3c4bfe0f,61d37a091366c3998241d8f6,33.89162,134.96944,,,1.0,0.0
8254,2024-01-01 17:55:50,311.2,7.8,0,314,0,01-01 20:00,34.01504,134.93053,61e9f3aab937134a3c4bfe0f,61d37a091366c3998241d8f6,33.96656,134.97752,,,1.0,0.0
8398,2024-01-01 18:14:50,323.8,0.7,0,18,0,01-01 20:00,34.02601,134.91892,61e9f3aab937134a3c4bfe0f,61d37a091366c3998241d8f6,34.01504,134.93053,,,1.0,0.0
8657,2024-01-01 18:36:51,27.4,5.0,127,34,0,01-01 20:00,34.03284,134.91423,61e9f3aab937134a3c4bfe0f,61d37a091366c3998241d8f6,34.02601,134.91892,,,1.0,0.0
8816,2024-01-01 18:57:02,50.7,5.6,0,51,0,01-01 20:00,34.05790,134.94989,61e9f3aab937134a3c4bfe0f,61d37a091366c3998241d8f6,34.03284,134.91423,,,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123690,2024-01-11 22:38:55,222.5,0.4,0,44,1,01-11 22:00,1.25689,103.88288,61e9f3aab937134a3c4bfe0f,61d37ee429b60f6113c89d01,1.22333,103.89428,33.642778,34.924167,1038065.0,1.0
123743,2024-01-11 22:56:03,249.6,0.2,0,23,1,01-11 22:00,1.25676,103.88269,61e9f3aab937134a3c4bfe0f,61d37ee429b60f6113c89d01,1.25689,103.88288,33.642778,34.924167,1037037.0,1.0
123905,2024-01-11 23:17:03,21.4,0.0,0,32,1,01-11 22:00,1.25670,103.88220,61e9f3aab937134a3c4bfe0f,61d37ee429b60f6113c89d01,1.25676,103.88269,33.642778,34.924167,1035777.0,1.0
124053,2024-01-11 23:32:03,45.4,0.0,0,32,1,01-11 22:00,1.25659,103.88222,61e9f3aab937134a3c4bfe0f,61d37ee429b60f6113c89d01,1.25670,103.88220,33.642778,34.924167,1034877.0,1.0


In [45]:
sched_df[sched_df['vesselId'] == '61e9f39eb937134a3c4bfdc9'].sort_values(by='arrivalDate').iloc[-1]


vesselId             61e9f3aab937134a3c4bfe0f
shippingLineId       61a8e672f9cba188601e84ab
shippingLineName             Hoegh Autoliners
arrivalDate         2024-09-01 22:00:00+00:00
sailingDate         2024-09-01 22:00:00+00:00
portName                   Port of Port Louis
portId               61d37a981366c3998241d9d9
portLatitude                       -20.148056
portLongitude                       57.493611
Name: 94700, dtype: object

In [73]:
sched_df[(sched_df['vesselId'] == '61e9f39eb937134a3c4bfdc9') & (sched_df['arrivalDate'] > datetime(2023, 12, 30, 0, 0, tzinfo=timezone.utc))]

Unnamed: 0,vesselId,shippingLineId,shippingLineName,arrivalDate,sailingDate,portName,portId,portLatitude,portLongitude
28208,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2023-12-31 00:00:00+00:00,2022-01-01 00:00:00+00:00,Port of Brunswick,61d38499b7b7526e1adf3d54,31.140556,-81.496667
53353,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2023-12-31 00:00:00+00:00,2024-01-03 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
68749,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2023-12-31 00:00:00+00:00,2024-01-03 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
88893,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2023-12-31 00:00:00+00:00,2024-01-03 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
91557,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2023-12-31 00:00:00+00:00,2024-01-03 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
...,...,...,...,...,...,...,...,...,...
135593,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2024-02-09 00:00:00+00:00,2024-02-10 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
59940,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2024-02-10 00:00:00+00:00,2024-02-12 00:00:00+00:00,Port of Bremerhaven,61d375e793c6feb83e5eb3e2,53.563611,8.554722
27663,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2024-02-12 00:00:00+00:00,2024-02-13 00:00:00+00:00,Port of Bruges-Zeebrugge,61d36f9a0a1807568ff9a156,51.336389,3.207222
135588,61e9f39eb937134a3c4bfdc9,61a8e673f9cba188601e84ae,K-Line,2024-02-12 00:00:00+00:00,2024-02-13 00:00:00+00:00,Port of Bruges-Zeebrugge,61d36f9a0a1807568ff9a156,51.336389,3.207222


In [75]:
filepath_train = '../datasets/ais_train.csv'
filepath_test = '../datasets/ais_test.csv'

# Load AIS historical data
train = pd.read_csv(filepath_train, sep ='|')  # Replace with your dataset
test = pd.read_csv(filepath_test, sep = ',')

# Preprocessing
train['time'] = pd.to_datetime(train['time'])
train.sort_values(by=['vesselId', 'time'], inplace=True)

test['time'] = pd.to_datetime(test['time'])
test.sort_values(by=['vesselId', 'time'], inplace=True)

# Feature Engineering
train['prev_lat'] = train.groupby('vesselId')['latitude'].shift(1)
train['prev_lon'] = train.groupby('vesselId')['longitude'].shift(1)
train['prev_speed'] = train.groupby('vesselId')['sog'].shift(1)
train['prev_course'] = (train.groupby('vesselId')['cog'].shift(1) / 180) - 1        # normalized
train['prev_rotation'] = train.groupby('vesselId')['rot'].shift(1) 
train['prev_heading'] = (train.groupby('vesselId')['heading'].shift(1)/ 180) - 1 
train['hour'] = train['time'].dt.hour
# Could change this to is_weekend
train['day_of_week'] = train['time'].dt.dayofweek
# Adding timedelta as a feature
train['time_diff'] = train['time'].diff()
train['time_diff_seconds'] = train['time_diff'].dt.total_seconds()

# --------------------------------- prev_rot-related stuff
# Replace special values with NaN
train['prev_rotation'] = train['prev_rotation'].replace({127: np.nan, -127: np.nan, -128: np.nan})
# Optional: Create a new column for turn information availability
train['turn_info_available'] = np.where(train['prev_rotation'] == -128, 0, 1)

# Create binary columns for turn direction and magnitude
train['turn_direction'] = np.where(train['prev_rotation'] > 0, 'right', 'left')
train['turn_magnitude'] = train['prev_rotation'].abs()

# Fill missing values (optional, using forward fill) Uses most recent non-null value from the row above.
train['prev_rotation'].fillna(method='ffill', inplace=True)


rows_w_too_large_timedelta_idx = train[train['time_diff_seconds'] > days_to_secs(timedelta_threshold_days)].index.to_list()
earliest_idx = train.groupby('vesselId')['time'].idxmin().tolist()
idx_to_drop = rows_w_too_large_timedelta_idx + earliest_idx
train = train.drop(idx_to_drop).reset_index(drop=True)

# Drop rows with missing values
train.dropna(inplace=True)

# Add schedule related stuff
train['should_be_moored'] = train.apply(should_be_moored, axis=1)
# Apply the function and assign the result to multiple columns
train[['dest_lon', 'dest_lat', 'time_left', 'sched_data_available']] = train.apply(get_next_dest_and_time_left, axis=1, result_type='expand')

train.dropna(inplace=True)

print(f"Length of dataset after preprocessing: {len(train)}")

train.to_csv('../datasets/ais_train_preprocessed.csv')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['prev_rotation'].fillna(method='ffill', inplace=True)
  train['prev_rotation'].fillna(method='ffill', inplace=True)


Length of dataset after preprocessing: 1497400


In [76]:
# Define features and target variables
X = train[['prev_lat', 'prev_lon', 'prev_speed', 'prev_course','prev_rotation', 'turn_info_available', 'prev_heading', 'time_diff_seconds', 'dest_lon', 'dest_lat', 'time_left', 'sched_data_available', 'should_be_moored']]
y_lat = train['latitude']
y_lon = train['longitude']
X_test = test

X_lat_train, X_lat_val, y_lat_train, y_lat_val = train_test_split(X, y_lat, test_size=0.1, random_state=42)
X_lon_train, X_lon_val, y_lon_train, y_lon_val = train_test_split(X, y_lon, test_size=0.1, random_state=42)


In [79]:
# Train the model
model_lat = RandomForestRegressor(n_estimators=20, verbose=3, random_state=42)
model_lat.fit(X_lat_train.values, y_lat_train.values)

model_lon = RandomForestRegressor(n_estimators=20, verbose=3, random_state=42)
model_lon.fit(X_lon_train.values, y_lon_train.values)

# Make predictions on the validation set
y_lat_pred_val = model_lat.predict(X_lat_val)
y_lon_pred_val = model_lon.predict(X_lon_val)

# Evaluate performance on the validation set
mae_lat = mean_absolute_error(y_lat_val, y_lat_pred_val)
mae_lon = mean_absolute_error(y_lon_val, y_lon_pred_val)

print(f'Mean Absolute Error for Latitude: {mae_lat}')
print(f'Mean Absolute Error for Longitude: {mae_lon}')

building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20




Mean Absolute Error for Latitude: 0.0434668991990091
Mean Absolute Error for Longitude: 0.062209693802631784


In [94]:
# Load AIS historical data
training_data = pd.read_csv(filepath_train, sep ='|')  # Replace with your dataset
training_data['time'] = pd.to_datetime(training_data['time'])

# Predict future positions
def predict_future_position(id, vessel_id, time):
    # Fetch the latest known position of the vessel
    latest_data = training_data[training_data['vesselId'] == vessel_id].sort_values(by='time').iloc[-1]
    dest_lon, dest_lat, time_left, sched_data_available = get_next_dest_and_time_left(latest_data, prev_pos_avail=False)

    new_data = {
        'prev_lat': latest_data['latitude'],
        'prev_lon': latest_data['longitude'],
        'prev_speed': latest_data['sog'],
        'prev_course': (latest_data['cog'] / 180) - 1,
        'prev_rotation': latest_data['rot'],
        'turn_info_available': 1 if latest_data['rot'] != -128 else 0,
        'prev_heading' : (latest_data['heading'] / 180) - 1,

        # Convert the times to a datetime_obj
        'time_diff_seconds' : (datetime.strptime(time, '%Y-%m-%d %H:%M:%S') - latest_data['time']).total_seconds(),
        'dest_lon' : dest_lon,
        'dest_lat' : dest_lat,
        'time_left' : time_left,
        'sched_data_available' : sched_data_available,
        'should_be_moored' : should_be_moored(latest_data),
    }
    # why is it done with all this list stuff?
    return id, model_lat.predict([list(new_data.values())])[0], model_lon.predict([list(new_data.values())])[0]

# ['prev_lat', 'prev_lon', 'prev_speed', 'prev_course','prev_rotation', 'turn_info_available', 'prev_heading', 'time_diff_seconds', 'dest_lon', 'dest_lat', 'time_left', 'sched_data_available', 'should_be_moored']

In [96]:
from tqdm import tqdm

# Open the test file for reading and the prediction file for writing
with open('../datasets/ais_test.csv', 'r') as f_test, open('../predictions/predictions.csv', 'w') as f_pred:
    f_pred.write("ID,longitude_predicted,latitude_predicted\n")
    for line in tqdm(f_test.readlines()[1:]):
        id, vesselID, time, scaling_factor = line.split(',')
        id, pred_lat, pred_lon = predict_future_position(id, vesselID, time)
        f_pred.write(f"{id},{pred_lon},{pred_lat}\n")

  9%|▉         | 4629/51739 [05:01<51:04, 15.37it/s]  


KeyboardInterrupt: 