In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from datetime import timedelta, datetime, timezone
from tqdm import tqdm

In [3]:
def days_to_secs(days):
    return days * 24 * 3600

In [4]:
# Parameters 
timedelta_threshold_days = 100

In [5]:
sched_df = pd.read_csv('../datasets/schedules_to_may_2024.csv', sep='|')

In [6]:
train = pd.read_csv('../datasets/ais_train.csv', sep='|')

In [7]:
sched_df = pd.read_csv('../datasets/schedules_to_may_2024.csv', sep='|')
sched_df['arrivalDate'] = pd.to_datetime(sched_df['arrivalDate'])
sched_df['sailingDate'] = pd.to_datetime(sched_df['sailingDate'])
sched_df = sched_df.sort_values(by=['vesselId', 'arrivalDate'])

In [8]:
def get_next_dest_and_time_left(row):
    vesselId, timestamp = row['vesselId'], row['time']
    timestamp = timestamp.replace(year=timestamp.year - 1)
    dest_lon = None
    dest_lat = None
    time_left = None
    for idx, r in sched_df[sched_df['vesselId'] == vesselId].iterrows():
        print(f"r: {r}")
        if timestamp < r['arrivalDate'].replace(tzinfo=None):
            dest_lon = r['portLongitude']
            dest_lat = r['portLatitude']
            time_left = r['arrivalDate'].replace(tzinfo=None) - timestamp
            return dest_lon, dest_lat, time_left

def should_be_moored(row):
    vesselId, timestamp = row['vesselId'], row['time']
    for idx, r in sched_df[(sched_df['vesselId'] == vesselId) & (sched_df['arrivalDate'] < timestamp.replace(tzinfo=timezone.utc)) & (sched_df['arrivalDate'] > datetime(2023, 12, 1, 0, 0, tzinfo=timezone.utc))].iterrows():
        if timestamp > r['arrivalDate'].replace(tzinfo=None):
            if timestamp < r['sailingDate'].replace(tzinfo=None):
                return True 
        else:
            return False
    return False

In [9]:
filepath_train = '../datasets/ais_train.csv'
filepath_test = '../datasets/ais_test.csv'

# Load AIS historical data
train = pd.read_csv(filepath_train, sep ='|')  # Replace with your dataset
test = pd.read_csv(filepath_test, sep = ',')

# Preprocessing
train['time'] = pd.to_datetime(train['time'])
train.sort_values(by=['vesselId', 'time'], inplace=True)

test['time'] = pd.to_datetime(test['time'])
test.sort_values(by=['vesselId', 'time'], inplace=True)

# Feature Engineering
train['prev_lat'] = train.groupby('vesselId')['latitude'].shift(1)
train['prev_lon'] = train.groupby('vesselId')['longitude'].shift(1)
train['prev_speed'] = train.groupby('vesselId')['sog'].shift(1)
train['prev_course'] = (train.groupby('vesselId')['cog'].shift(1) / 180) - 1        # normalized
train['prev_rotation'] = train.groupby('vesselId')['rot'].shift(1) 
train['prev_heading'] = (train.groupby('vesselId')['heading'].shift(1)/ 180) - 1 
train['hour'] = train['time'].dt.hour
# Could change this to is_weekend
train['day_of_week'] = train['time'].dt.dayofweek
# Adding timedelta as a feature
train['time_diff'] = train['time'].diff()
train['time_diff_seconds'] = train['time_diff'].dt.total_seconds()

# # Add schedule related stuff
# train['should_be_moored'] = train.apply(should_be_moored, axis=1)
# train.to_csv('../datasets/ais_train_preprocessed.csv')

# --------------------------------- prev_rot-related stuff
# Replace special values with NaN
train['prev_rotation'] = train['prev_rotation'].replace({127: np.nan, -127: np.nan, -128: np.nan})
# Optional: Create a new column for turn information availability
train['turn_info_available'] = np.where(train['prev_rotation'] == -128, 0, 1)

# Create binary columns for turn direction and magnitude
train['turn_direction'] = np.where(train['prev_rotation'] > 0, 'right', 'left')
train['turn_magnitude'] = train['prev_rotation'].abs()

# Fill missing values (optional, using forward fill) Uses most recent non-null value from the row above.
train['prev_rotation'].fillna(method='ffill', inplace=True)


rows_w_too_large_timedelta_idx = train[train['time_diff_seconds'] > days_to_secs(timedelta_threshold_days)].index.to_list()
earliest_idx = train.groupby('vesselId')['time'].idxmin().tolist()
idx_to_drop = rows_w_too_large_timedelta_idx + earliest_idx
train = train.drop(idx_to_drop).reset_index(drop=True)

# REMOVE THIS 
print(f"Rows with too high timedeltas: {len(rows_w_too_large_timedelta_idx)}")

# Drop rows with missing values
train.dropna(inplace=True)

# Define features and target variables
X = train[['prev_lat', 'prev_lon', 'prev_speed', 'prev_course','prev_rotation', 'turn_info_available', 'prev_heading', 'time_diff_seconds']]
y_lat = train['latitude']
y_lon = train['longitude']
X_test = test

X_lat_train, X_lat_val, y_lat_train, y_lat_val = train_test_split(X, y_lat, test_size=0.1, random_state=42)
X_lon_train, X_lon_val, y_lon_train, y_lon_val = train_test_split(X, y_lon, test_size=0.1, random_state=42)

# Train the model
model_lat = RandomForestRegressor(n_estimators=20, verbose=3, random_state=42)
model_lat.fit(X_lat_train.values, y_lat_train.values)

model_lon = RandomForestRegressor(n_estimators=20, verbose=3, random_state=42)
model_lon.fit(X_lon_train.values, y_lon_train.values)

# Make predictions on the validation set
y_lat_pred_val = model_lat.predict(X_lat_val)
y_lon_pred_val = model_lon.predict(X_lon_val)

# Evaluate performance on the validation set
mae_lat = mean_absolute_error(y_lat_val, y_lat_pred_val)
mae_lon = mean_absolute_error(y_lon_val, y_lon_pred_val)

print(f'Mean Absolute Error for Latitude: {mae_lat}')
print(f'Mean Absolute Error for Longitude: {mae_lon}')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['prev_rotation'].fillna(method='ffill', inplace=True)
  train['prev_rotation'].fillna(method='ffill', inplace=True)


Rows with too high timedeltas: 0
building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20
building tree 1 of 20
building tree 2 of 20
building tree 3 of 20
building tree 4 of 20
building tree 5 of 20
building tree 6 of 20
building tree 7 of 20
building tree 8 of 20
building tree 9 of 20
building tree 10 of 20
building tree 11 of 20
building tree 12 of 20
building tree 13 of 20
building tree 14 of 20
building tree 15 of 20
building tree 16 of 20
building tree 17 of 20
building tree 18 of 20
building tree 19 of 20
building tree 20 of 20




Mean Absolute Error for Latitude: 0.043428654676335415
Mean Absolute Error for Longitude: 0.06294103189001747


In [12]:
# Load AIS historical data
training_data = pd.read_csv(filepath_train, sep ='|')  # Replace with your dataset

# Predict future positions
def predict_future_position(id, vessel_id, time):
    # Fetch the latest known position of the vessel
    latest_data = training_data[training_data['vesselId'] == vessel_id].sort_values(by='time').iloc[-1]

    new_data = {
        'prev_lat': latest_data['latitude'],
        'prev_lon': latest_data['longitude'],
        'prev_speed': latest_data['sog'],
        'prev_course': (latest_data['cog'] / 180) - 1,
        'prev_rotation': latest_data['rot'],
        'turn_info_available': 1 if latest_data['rot'] != -128 else 0,
        'prev_heading' : (latest_data['heading'] / 180) - 1,

        # Convert the times to a datetime_obj
        'time_diff_seconds' : (datetime.strptime(time, '%Y-%m-%d %H:%M:%S') - datetime.strptime(latest_data['time'], '%Y-%m-%d %H:%M:%S')).total_seconds(),
    }
    # why is it done with all this list stuff?
    return id, model_lat.predict([list(new_data.values())])[0], model_lon.predict([list(new_data.values())])[0]

In [13]:
# Open the test file for reading and the prediction file for writing
with open('../datasets/ais_test.csv', 'r') as f_test, open('../predictions/predictions.csv', 'w') as f_pred:
    f_pred.write("ID,longitude_predicted,latitude_predicted\n")
    for line in f_test.readlines()[1:]:
        id, vesselID, time, scaling_factor = line.split(',')
        id, pred_lat, pred_lon = predict_future_position(id, vesselID, time)
        f_pred.write(f"{id},{pred_lon},{pred_lat}\n")