In [15]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

Converting the pipe-delimited raw datasets into comma-separated CSV files so they can be read with the pandas defaults

In [2]:
def _pipe_to_comma(source, target, **read_options):
    """Read a '|'-delimited file, save it comma-separated, and return the frame.

    Extra keyword arguments are forwarded to ``pd.read_csv``.
    """
    frame = pd.read_csv(source, delimiter='|', **read_options)
    frame.to_csv(target, index=False)
    return frame


# The AIS training file is read with default error handling; the three
# auxiliary files skip malformed lines instead of raising.
X_sample = _pipe_to_comma("data/ais_train.csv", 'data/ais_train_modified.csv')
extra_vessels = _pipe_to_comma(
    "data/vessels.csv", 'data/vessels_modified.csv', on_bad_lines='skip'
)
extra_ports = _pipe_to_comma(
    "data/ports.csv", 'data/ports_modified.csv', on_bad_lines='skip'
)
extra_schedules = _pipe_to_comma(
    "data/schedules_to_may_2024.csv",
    'data/schedules_to_may_2024_modified.csv',
    on_bad_lines='skip',
)

In [3]:
# Evaluation rows are read straight from the provided file; everything else
# comes from the comma-separated copies written by the conversion cell.
X_evaluation = pd.read_csv("data/ais_test.csv")
extra_ports, extra_vessels, extra_schedules = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/ports_modified.csv",
        "data/vessels_modified.csv",
        "data/schedules_to_may_2024_modified.csv",
    )
)
X_original = pd.read_csv("data/ais_train_modified.csv")

Replacing each row's current readings with the previous observation of the same vessel (lag features) for the train and test files

In [62]:
def past_course(original):
    """Replace a frame's readings with those of the preceding row.

    The callers in this notebook pass the rows of a single vessel.

    Parameters
    ----------
    original : pandas.DataFrame
        Must contain 'latitude', 'longitude', 'time', 'cog', 'sog', 'rot',
        'heading' and 'navstat'.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with a fresh 0..n-1 index.
        The old index is kept as a column ('index', or 'level_0' when an
        'index' column is already present). 'prev_lat', 'prev_lon' and
        'time_2' hold the previous row's position and time; the five motion
        columns are shifted down by one row. The first row has no
        predecessor, so it keeps its own position/time and gets 0 for the
        motion columns.
    """
    lagged = original.reset_index()

    # Previous position and timestamp; row 0 falls back to its own values.
    for target, source in (
        ('prev_lat', 'latitude'),
        ('prev_lon', 'longitude'),
        ('time_2', 'time'),
    ):
        current = lagged[source]
        lagged[target] = current.shift(1).fillna(current.iloc[0])

    # Motion readings are overwritten in place with the previous row's values.
    motion_cols = ['cog', 'sog', 'rot', 'heading', 'navstat']
    for name in motion_cols:
        lagged[name] = lagged[name].shift(1)
    # Row 0 would otherwise be NaN after the shift.
    lagged.loc[0, motion_cols] = [0] * len(motion_cols)

    return lagged

def adapting_training_data(original):
    """Build per-vessel lag features for an AIS frame.

    Every row ends up describing the *previous* observation of the same
    vessel (see ``past_course``) plus the elapsed time since it.

    Parameters
    ----------
    original : pandas.DataFrame
        Needs 'vesselId', 'time' (anything ``pd.to_datetime`` accepts),
        'latitude', 'longitude' and the motion columns ``past_course``
        shifts. Must not already contain columns named 'index' or 'level_0',
        which are used as temporary bookkeeping columns. Not modified.

    Returns
    -------
    pandas.DataFrame
        Same rows in the same order as ``original`` with a fresh 0..n-1
        index, 'time' parsed to datetimes, and the extra columns 'prev_lat',
        'prev_lon', 'time_2' and 'time_dif' (hours between 'time_2' and
        'time').

    Notes
    -----
    Rows are lagged in their incoming order within each vessel; this function
    does not sort by time.
    """
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    # Placeholders so the columns exist; past_course overwrites them.
    new['prev_lat'] = new['latitude']
    new['prev_lon'] = new['longitude']
    new['time_2'] = new['time']
    # Keep the incoming row labels in an 'index' column so the original row
    # order can be restored after the per-vessel pieces are stitched together.
    new = new.reset_index()

    # One groupby pass instead of one boolean scan per vessel. dropna=False
    # keeps rows whose vesselId is missing (they form their own group), so the
    # output always has exactly as many rows as the input.
    per_ship = [
        past_course(one_ship)
        for _, one_ship in new.groupby('vesselId', sort=False, dropna=False)
    ]

    # Concatenate once: growing a frame inside the loop re-copies everything
    # accumulated so far on every iteration.
    if per_ship:
        final = pd.concat(per_ship, ignore_index=True)
    else:
        # Empty input: nothing to lag, keep the (empty) prepared frame.
        final = new

    final = final.sort_values(by='index')
    # 'level_0' is added by past_course's own reset_index; it is absent only
    # when the input was empty.
    final = final.drop(columns=['index', 'level_0'], errors='ignore')
    final = final.reset_index(drop=True)
    final['time_dif'] = (final['time'] - final['time_2']).dt.total_seconds() / 3600
    return final

def adapting_test_data (evaluation,training):
    """Build model features for the evaluation rows from the training history.

    The evaluation rows are appended to the training rows and run through
    ``adapting_training_data`` so that the first evaluation row of each vessel
    picks up the last preceding row of that vessel. Those values are then
    copied to every evaluation row of the vessel.

    Parameters
    ----------
    evaluation : pandas.DataFrame
        Needs 'time', 'vesselId', 'ID' and 'scaling_factor' ('ID' and
        'scaling_factor' are dropped). Not modified.
    training : pandas.DataFrame
        Raw training frame as accepted by ``adapting_training_data``; must
        also contain 'etaRaw'. Not modified.

    Returns
    -------
    pandas.DataFrame
        One row per evaluation row, in the evaluation order, with a fresh
        0..n-1 index. 'time_2', 'cog', 'sog', 'rot', 'heading', 'navstat',
        'etaRaw', 'prev_lat' and 'prev_lon' are constant within a vessel;
        'time_dif' is the number of hours between 'time_2' and the row's own
        'time'. 'latitude' and 'longitude' only contain the 0.1 placeholder.
    """
    carried_cols = ['time_2', 'cog', 'sog', 'rot', 'heading', 'navstat',
                    'etaRaw', 'prev_lat', 'prev_lon']

    evalu = evaluation.copy()
    evalu['time'] = pd.to_datetime(evalu['time'])
    evalu['time_2'] = evalu['time']
    # Filler values so the evaluation rows have the training columns.
    # NOTE(review): for a vessel with no training rows, past_course falls back
    # to the row's own latitude/longitude, so this 0.1 filler ends up in
    # prev_lat/prev_lon - confirm that is acceptable.
    evalu[['cog', 'sog', 'heading', 'navstat', 'latitude', 'longitude']] = 0.1
    evalu['etaRaw'] = evalu['time']
    evalu['portId'] = '1'
    evalu = evalu.drop(['ID', 'scaling_factor'], axis=1)

    train = training.copy()
    train['time'] = pd.to_datetime(train['time'])
    train['time_2'] = train['time']

    combined = pd.concat([train, evalu], ignore_index=True)
    lagged = adapting_training_data(combined)
    # adapting_training_data preserves row order, so the evaluation rows are
    # exactly the tail after the training rows.
    evalu = lagged.iloc[len(train):]

    per_ship = []
    for _, ship_rows in evalu.groupby('vesselId', sort=False, dropna=False):
        # reset_index keeps each row's position in `lagged` as an 'index'
        # column, used below to restore the evaluation order.
        ship_rows = ship_rows.reset_index()
        first_row = ship_rows.iloc[0]
        # Assign scalars column by column so the broadcast is explicit and
        # does not depend on how pandas unpacks a Series across a
        # multi-column assignment.
        for col in carried_cols:
            ship_rows[col] = first_row[col]
        per_ship.append(ship_rows)

    # Concatenate once instead of growing the frame inside the loop.
    if per_ship:
        final = pd.concat(per_ship, ignore_index=True)
    else:
        # No evaluation rows: keep the empty tail, with the 'index' column
        # the steps below expect.
        final = evalu.reset_index()

    final['time_dif'] = (final['time'] - final['time_2']).dt.total_seconds() / 3600
    final = final.sort_values(by='index')
    final = final.drop(['index'], axis=1)
    final = final.reset_index(drop=True)
    return final

In [None]:
# Evaluation features are derived from the raw training history; the training
# features are then built from the same raw frame.
test = adapting_test_data(evaluation=X_evaluation, training=X_original)
train = adapting_training_data(original=X_original)

Model

In [38]:
# Columns that are not fed to the models, and the two regression targets.
non_feature_cols = ['time', 'etaRaw', 'vesselId', 'portId', 'time_2']
target_cols = ['latitude', 'longitude']

# Evaluation features: `test` itself is left untouched.
t = test.drop(columns=non_feature_cols + target_cols)

# NOTE: `train` is rebound below, so this cell can only be run once per
# freshly built `train` frame.
train = train.drop(columns=non_feature_cols)
train_lon = train[['longitude']]
train_lat = train[['latitude']]
train = train.drop(columns=target_cols)

In [52]:
# One independent forest per coordinate, both with the same settings and seed.
forest_lon = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    train, train_lon.to_numpy().ravel()
)
forest_lat = RandomForestRegressor(random_state=42, n_estimators=100).fit(
    train, train_lat.to_numpy().ravel()
)

# Predictions for the evaluation features, in the row order of `t`.
longitude_predicted = forest_lon.predict(t)
latitude_predicted = forest_lat.predict(t)

In [59]:
# The row position of each prediction becomes the 'ID' column of the output.
forest_prediction = (
    pd.DataFrame(
        {
            'longitude_predicted': longitude_predicted,
            'latitude_predicted': latitude_predicted,
        }
    )
    .reset_index()
    .rename(columns={'index': 'ID'})
)
forest_prediction.to_csv('data/forest_prediction.csv', index=False)