In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import math

Converting the pipe-delimited source files into standard comma-separated CSV files

In [2]:
# The raw files use '|' as the field separator; each one is re-saved as a
# default comma-separated CSV under a "*_modified.csv" name.

# AIS training data (read without on_bad_lines='skip').
X_sample = pd.read_csv("data/ais_train.csv", sep='|')
X_sample.to_csv('data/ais_train_modified.csv', index=False)

# Auxiliary tables: on_bad_lines='skip' drops malformed lines instead of failing.
extra_vessels = pd.read_csv("data/vessels.csv", sep='|', on_bad_lines='skip')
extra_vessels.to_csv('data/vessels_modified.csv', index=False)

extra_ports = pd.read_csv("data/ports.csv", sep='|', on_bad_lines='skip')
extra_ports.to_csv('data/ports_modified.csv', index=False)

extra_schedules = pd.read_csv("data/schedules_to_may_2024.csv", sep='|', on_bad_lines='skip')
extra_schedules.to_csv('data/schedules_to_may_2024_modified.csv', index=False)

In [3]:
# AIS observations: the converted training set and the evaluation set.
X_original = pd.read_csv("data/ais_train_modified.csv")
X_evaluation = pd.read_csv("data/ais_test.csv")

# Auxiliary tables, re-read from the comma-separated copies.
extra_vessels = pd.read_csv("data/vessels_modified.csv")
extra_ports = pd.read_csv("data/ports_modified.csv")
extra_schedules = pd.read_csv("data/schedules_to_may_2024_modified.csv")

Replacing each row's current readings with the immediately preceding readings of the same vessel

In [4]:
def past_course(original):
    """Lag one vessel's track by one row.

    The frame's index is first moved into a column by ``reset_index()`` so the
    rows are labelled 0..n-1. Then, for every row:

    * ``prev_lat`` / ``prev_lon`` / ``time_2`` receive the previous row's
      ``latitude`` / ``longitude`` / ``time``; missing values (including the
      first row, which has no predecessor) are filled with the first row's
      own value.
    * ``cog``, ``sog``, ``rot``, ``heading`` and ``navstat`` are replaced by the
      previous row's values; the first row gets 0 for all five.

    Parameters
    ----------
    original : pandas.DataFrame
        Non-empty frame with the columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the lagged columns and the former index as a column.
    """
    track = original.reset_index()

    # New column -> column whose previous-row value it carries.
    carried = {'prev_lat': 'latitude', 'prev_lon': 'longitude', 'time_2': 'time'}
    for target, source in carried.items():
        values = track[source]
        track[target] = values.shift(1).fillna(values.iloc[0])

    # Motion readings are overwritten in place with the previous row's readings.
    motion = ['cog', 'sog', 'rot', 'heading', 'navstat']
    for name in motion:
        track[name] = track[name].shift(1)

    # The first row has no predecessor: use zeros rather than NaN.
    track.loc[0, motion] = [0, 0, 0, 0, 0]

    return track

def adapting_training_data(original):
    """Give every AIS row the previous observation of the same vessel.

    Rows are grouped by ``vesselId`` (in order of first appearance) and each
    group is passed through ``past_course``, which shifts the motion columns
    by one row and adds ``prev_lat`` / ``prev_lon`` / ``time_2``. The groups
    are then stitched back together in the caller's original row order.

    "Previous" means the preceding row of that vessel in ``original``; the
    rows are not re-sorted by time here.

    Parameters
    ----------
    original : pandas.DataFrame
        Frame with at least ``vesselId``, ``time``, ``latitude``,
        ``longitude``, ``cog``, ``sog``, ``rot``, ``heading`` and ``navstat``.
        It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same rows in the same order, with a fresh 0..n-1 index, ``time``
        parsed to datetimes, and ``prev_lat``, ``prev_lon``, ``time_2``
        appended after the original columns.
    """
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    # Create the derived columns up front so they sit after the original
    # columns; past_course overwrites their contents with the lagged values.
    new['prev_lat'] = original['latitude']
    new['prev_lon'] = original['longitude']
    new['time_2'] = new['time']
    # Keep the caller's row labels in an 'index' column so the original order
    # can be restored after the per-vessel processing.
    new = new.reset_index()

    # Nothing to lag: return the (empty) frame with the expected columns
    # instead of failing on the missing 'level_0' column below.
    if new.empty:
        return new.drop(['index'], axis=1)

    # One pass over the groups and a single concat. Growing a frame with
    # pd.concat inside the loop is quadratic, and seeding it with an empty
    # frame makes the result dtypes depend on deprecated pandas behaviour.
    # dropna=False keeps rows whose vesselId is missing as their own group so
    # the output always has as many rows as the input.
    pieces = [
        past_course(vessel_rows)
        for _, vessel_rows in new.groupby('vesselId', sort=False, dropna=False)
    ]
    final = pd.concat(pieces, ignore_index=True)

    final = final.sort_values(by='index')
    # 'level_0' is the per-group index column added by past_course's own
    # reset_index(); neither helper column belongs in the result.
    final = final.drop(['index', 'level_0'], axis=1)
    final = final.reset_index(drop=True)
    return final

def adapting_test_data (evaluation,training):
    """Attach each evaluation row to its vessel's last known state.

    The evaluation rows are given placeholder motion/position values, appended
    after the training rows and run through ``adapting_training_data`` so that
    the first evaluation row of every vessel inherits the readings of the row
    that precedes it for that vessel. Those inherited readings are then copied
    to all of that vessel's evaluation rows, and ``time_dif`` (hours between
    the row's ``time`` and the inherited ``time_2``) is added.

    Parameters
    ----------
    evaluation : pandas.DataFrame
        Rows to predict. Must contain ``ID``, ``scaling_factor``, ``vesselId``
        and ``time``; ``ID`` and ``scaling_factor`` are dropped.
    training : pandas.DataFrame
        Historical AIS rows in the layout expected by
        ``adapting_training_data``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation row, in the evaluation frame's order, with a
        fresh 0..n-1 index.

    Notes
    -----
    A vessel with no training rows has nothing to inherit: its evaluation rows
    carry the placeholder position and a zeroed course (see ``past_course``).
    """
    # Columns whose value is taken from the vessel's first evaluation row.
    carried_cols = ['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon']

    evalu = evaluation.copy()
    evalu['time'] = pd.to_datetime(evalu['time'])
    evalu['time_2'] = evalu['time']
    # Placeholders only: they let the evaluation rows be concatenated with the
    # training rows. The carried columns are overwritten further down.
    evalu['cog'] = 0.1
    evalu['sog'] = 0.1
    evalu['heading'] = 0.1
    evalu['navstat'] = 0.1
    evalu['etaRaw'] = evalu['time']
    evalu['latitude'] = 0.11111111
    evalu['longitude'] = 0.11111111
    evalu['portId'] ='61d371c43aeaecc07011a37f'

    train = training.copy()
    train['time'] = pd.to_datetime(train['time'])
    train['time_2'] = train['time']

    evalu = evalu.drop(['ID','scaling_factor'],axis=1)
    combined = pd.concat([train, evalu], ignore_index=True)
    lagged = adapting_training_data(combined)
    # adapting_training_data keeps row order, so the evaluation rows are the tail.
    evalu = lagged.iloc[len(train):]

    pieces = []
    for _, vessel_rows in evalu.groupby('vesselId', sort=False, dropna=False):
        vessel_rows = vessel_rows.copy()
        # Broadcast the first row's value column by column. Assigning a
        # label-indexed Series to a list of columns relies on positional
        # Series lookups, which pandas has deprecated.
        for col in carried_cols:
            vessel_rows[col] = vessel_rows[col].iloc[0]
        pieces.append(vessel_rows)

    # Single concat; the pieces keep their row labels from `lagged`, so sorting
    # by index restores the evaluation order.
    final = pd.concat(pieces) if pieces else evalu.copy()
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600
    final = final.sort_index().reset_index(drop=True)
    return final

Function to extrapolate the new position from the last known location, assuming constant speed and course (straight line)

In [58]:
def travel_predict(df):
    """Project each row's position forward from its previous fix.

    Uses the spherical destination-point formula: starting at
    (``prev_lat``, ``prev_lon``), travel along bearing ``cog`` for the distance
    covered at speed ``sog`` during ``time_dif``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``prev_lat`` and ``prev_lon`` (degrees), ``cog`` (degrees),
        ``sog`` (converted below with the knots -> m/s factor 0.514444) and
        ``time_dif`` (hours).

    Returns
    -------
    pandas.DataFrame
        Columns ``longitude_predicted`` and ``latitude_predicted`` in degrees,
        indexed like ``df``. Longitudes are wrapped into [-180, 180).
    """
    R = 6371  # sphere radius in km, matching the km distance computed below

    lat = df['prev_lat'] * (np.pi / 180)
    lon = df['prev_lon'] * (np.pi / 180)
    direction = df['cog'] * (np.pi / 180)

    speed = df['sog'] * 0.514444 * 3600       # knots -> metres per hour
    dist = (speed * df['time_dif']) / 1000    # kilometres travelled
    angular = dist / R                        # distance as an angle (radians)

    latf = np.arcsin(
        np.sin(lat) * np.cos(angular)
        + np.cos(lat) * np.sin(angular) * np.cos(direction)
    )
    # arctan2 keeps the quadrant information that arctan(y / x) discards: with
    # a negative denominator the plain arctan is off by 180 degrees, and a zero
    # denominator would divide by zero.
    lonf = lon + np.arctan2(
        np.sin(direction) * np.sin(angular) * np.cos(lat),
        np.cos(angular) - np.sin(lat) * np.sin(latf),
    )

    latf = latf * (180 / np.pi)
    lonf = lonf * (180 / np.pi)
    # Wrap longitudes that crossed the antimeridian back into [-180, 180).
    lonf = (lonf + 180) % 360 - 180

    result = pd.DataFrame({'longitude_predicted': lonf, 'latitude_predicted': latf})

    return result

New position calculation and dataframe definition

In [None]:
# Build the per-row inputs (last known state + elapsed hours) for the test set.
t = adapting_test_data(X_evaluation, X_original)

# Predict, then expose the row position as the 'ID' column.
result = (
    travel_predict(t)
    .reset_index()
    .rename(columns={'index': 'ID'})
)
result.to_csv('data/first_try.csv', index=False)