In [4]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from joblib import dump, load
import seaborn as sns
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

Adapting datasets to be interpreted

In [5]:
X_sample = pd.read_csv("data/ais_train.csv", delimiter='|')
X_sample.to_csv('data/ais_train_modified.csv', index=False)
extra_vessels = pd.read_csv("data/vessels.csv", on_bad_lines='skip', delimiter='|')
extra_vessels.to_csv('data/vessels_modified.csv', index=False)
extra_ports = pd.read_csv("data/ports.csv", on_bad_lines='skip', delimiter='|')
extra_ports.to_csv('data/ports_modified.csv', index=False)
extra_schedules = pd.read_csv("data/schedules_to_may_2024.csv", on_bad_lines='skip', delimiter='|')
extra_schedules.to_csv('data/schedules_to_may_2024_modified.csv', index=False)

In [6]:
X_evaluation = pd.read_csv("data/ais_test.csv",)
extra_ports = pd.read_csv("data/ports_modified.csv")
extra_vessels = pd.read_csv("data/vessels_modified.csv")
extra_schedules = pd.read_csv("data/schedules_to_may_2024_modified.csv")
X_original = pd.read_csv("data/ais_train_modified.csv")
X_original['etaRaw'] = pd.to_datetime('2024-' + X_original['etaRaw']+ ':00', format='%Y-%m-%d %H:%M:%S',errors='coerce')
X_original = X_original.dropna(subset=['etaRaw'])

Changing current data into previous data for the train file

In [7]:
def past_course(original):
    original=original.reset_index()

    original['prev_lat'] = original['latitude'].shift(1).fillna(original['latitude'].iloc[0])
    original['prev_lon'] = original['longitude'].shift(1).fillna(original['longitude'].iloc[0])
    original['time_2'] = original['time'].shift(1).fillna(original['time'].iloc[0])
    original['cog'] = original['cog'].shift(1)
    original['sog'] = original['sog'].shift(1)
    original['rot'] = original['rot'].shift(1)
    original['heading'] = original['heading'].shift(1)
    original['navstat'] = original['navstat'].shift(1)
    original.loc[0,['cog','sog','rot','heading','navstat']]=[0,0,0,0,0]

    return original

def two_days_past_course(original):

    original= original.reset_index(drop=True)

    original['prev_lat'] = original['latitude'].shift(100)
    original['prev_lon'] = original['longitude'].shift(100)
    original['time_2'] = original['time'].shift(100)
    original['cog'] = original['cog'].shift(100)
    original['sog'] = original['sog'].shift(100)
    original['rot'] = original['rot'].shift(100)
    original['heading'] = original['heading'].shift(100)
    original['navstat'] = original['navstat'].shift(100)
    original = original.dropna()

    return original

def three_days_past_course(original):

    original= original.reset_index(drop=True)

    original['prev_lat'] = original['latitude'].shift(150)
    original['prev_lon'] = original['longitude'].shift(150)
    original['time_2'] = original['time'].shift(150)
    original['cog'] = original['cog'].shift(150)
    original['sog'] = original['sog'].shift(150)
    original['rot'] = original['rot'].shift(150)
    original['heading'] = original['heading'].shift(150)
    original['navstat'] = original['navstat'].shift(150)
    original = original.dropna()

    return original

def add_data(original,train):
    ships = original['vesselId'].unique()
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new['time_2'] = new['time']
    ex = train.copy()
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = two_days_past_course(one_ship)
        ex = pd.concat([ex, new_filtered], ignore_index=True)
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = three_days_past_course(one_ship)
        ex = pd.concat([ex, new_filtered], ignore_index=True)
    ex['time_dif'] = (ex['time']-ex['time_2']).dt.total_seconds()/3600
    ex = ex[ex['time_dif'] <= 125]
    ex = ex.reset_index(drop=True)

    return ex

def adapting_training_data(original):
    ships = original['vesselId'].unique()
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new['time_2'] = new['time']
    new = new.reset_index()
    final = pd.DataFrame(columns=new.columns)
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = past_course(one_ship)
        final = pd.concat([final, new_filtered], ignore_index=True)
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    final = final.drop(['level_0'],axis=1)
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600

    return(final)

def adapting_test_data (evaluation,training):
    evalu=evaluation.copy()
    evalu['time'] = pd.to_datetime(evalu['time'])
    evalu['time_2'] = evalu['time']
    evalu[['cog','sog','heading','navstat','latitude','longitude']] = 0.1
    evalu['etaRaw'] = evalu['time']
    evalu['portId'] ='1'

    train=training.copy()
    train['time'] = pd.to_datetime(train['time'])
    train['time_2'] = train['time']
    evalu = evalu.drop(['ID','scaling_factor'],axis=1)

    final = pd.concat([train, evalu], ignore_index=True)
    x = adapting_training_data(final)
    ships = x['vesselId'].unique()
    last_one = pd.DataFrame(columns=x.columns)
    for c in ships:
        one_ship = x[x['vesselId'] == c].copy()
        one_ship=one_ship.reset_index()
        one_ship['etaRaw'] = one_ship['etaRaw'].shift(1)
        one_ship['portId'] = one_ship['portId'].shift(1)
        last_one = pd.concat([last_one, one_ship], ignore_index=True)
    last_one = last_one.sort_values(by='index')
    last_one = last_one.drop(['index'],axis=1)
    last_one = last_one.reset_index(drop=True)
    evalu = last_one.iloc[len(train):]
    ships = evalu['vesselId'].unique()
    final = final.iloc[0:0]
    for c in ships:
        filtered = evalu[evalu['vesselId'] == c].copy()
        filtered = filtered.reset_index()
        filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
        final = pd.concat([final,filtered], ignore_index=True)
    
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    return final

In [8]:
test =  adapting_test_data(X_evaluation,X_original)
train = adapting_training_data(X_original)
train = add_data(X_original, train)

  final = pd.concat([final, new_filtered], ignore_index=True)
  last_one = pd.concat([last_one, one_ship], ignore_index=True)
  last_one = pd.concat([last_one, one_ship], ignore_index=True)
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','pr

In [9]:
train['time_to_destiny'] = (train['etaRaw']-train['time']).dt.total_seconds()/3600
test['time_to_destiny'] = (test['etaRaw']-test['time']).dt.total_seconds()/3600
extra_ports.rename(columns={'latitude': 'port_lat','longitude': 'port_lon'}, inplace=True)
extra_ports = extra_ports.drop(['name','portLocation','UN_LOCODE','countryName','ISO'],axis=1)
train = pd.merge(train, extra_ports, on='portId', how='inner')
test = pd.merge(test, extra_ports, on='portId', how='inner')

In [10]:
test_full = test.copy()
test_full = test_full.drop(['cog','sog','rot','heading','navstat','time','etaRaw','vesselId','portId','time_2','latitude','longitude'],axis=1)
train_full = train.copy()
train_full = train.drop(['cog','sog','rot','heading','navstat','time','etaRaw','vesselId','portId','time_2'],axis=1)
train_lon = train_full.loc[:,['longitude']]
train_lat = train_full.loc[:,['latitude']]
train_full = train_full.drop(['latitude','longitude'],axis=1)

In [10]:
forest_lon = xgb.XGBRegressor(eval_metric='rmse')
forest_lon.fit(train_full, train_lon.values.ravel())
forest_lat = xgb.XGBRegressor(eval_metric='rmse')
forest_lat.fit(train_full, train_lat.values.ravel())
longitude_predicted = forest_lon.predict(test_full)
latitude_predicted = forest_lat.predict(test_full)


forest_prediction = pd.DataFrame({'longitude_predicted':longitude_predicted,'latitude_predicted':latitude_predicted})
forest_prediction = forest_prediction.reset_index()
forest_prediction.rename(columns={'index': 'ID'}, inplace=True)
forest_prediction.to_csv('data/xgb_prediction_more_dates.csv', index=False)

In [12]:
model = Sequential()
model.add(Dense(64, input_shape=(train_full.shape[1],), activation='relu'))  # Capa oculta de 64 neuronas
model.add(Dense(32, activation='relu'))  # Otra capa oculta de 32 neuronas
model.add(Dense(1, activation='linear'))  # Capa de salida para regresión

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
history = model.fit(train_full, train_lon, epochs=50, batch_size=32)


model2 = Sequential()
model2.add(Dense(64, input_shape=(train_full.shape[1],), activation='relu'))  # Capa oculta de 64 neuronas
model2.add(Dense(32, activation='relu'))  # Otra capa oculta de 32 neuronas
model2.add(Dense(1, activation='linear'))  # Capa de salida para regresión

model2.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
history = model2.fit(train_full, train_lat, epochs=50, batch_size=32)


longitude_predicted = model.predict(test_full)
latitude_predicted = model2.predict(test_full)

plt.plot(history.history['loss'], label='Pérdida de entrenamiento')
plt.plot(history.history['val_loss'], label='Pérdida de validación')
plt.xlabel('Épocas')
plt.ylabel('Pérdida')
plt.legend()
plt.show()

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m 74454/112064[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m23s[0m 625us/step - loss: 68.6084 - mae: 1.9238

KeyboardInterrupt: 

In [None]:
forest_prediction = pd.DataFrame({'longitude_predicted':longitude_predicted,'latitude_predicted':latitude_predicted})
forest_prediction = forest_prediction.reset_index()
forest_prediction.rename(columns={'index': 'ID'}, inplace=True)
forest_prediction.to_csv('data/neural_prediction_more_dates.csv', index=False)