In [28]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

Adapting datasets to be interpreted

In [29]:
X_original = pd.read_csv("data/ais_train.csv", delimiter='|')
X_original.to_csv('data/ais_train_modified.csv', index=False)
extra_ports = pd.read_csv("data/ports.csv", on_bad_lines='skip', delimiter='|')
extra_ports.to_csv('data/ports_modified.csv', index=False)

In [30]:
X_evaluation = pd.read_csv("data/ais_test.csv",)
X_original['etaRaw'] = pd.to_datetime('2024-' + X_original['etaRaw']+ ':00', format='%Y-%m-%d %H:%M:%S',errors='coerce')
X_original = X_original.dropna(subset=['etaRaw'])

Changing current data into previous data for the train file

In [31]:
def past_course(original):
    original=original.reset_index()

    original['prev_lat'] = original['latitude'].shift(1).fillna(original['latitude'].iloc[0])
    original['prev_lon'] = original['longitude'].shift(1).fillna(original['longitude'].iloc[0])
    original['time_2'] = original['time'].shift(1).fillna(original['time'].iloc[0])
    original['cog'] = original['cog'].shift(1)
    original['sog'] = original['sog'].shift(1)
    original['rot'] = original['rot'].shift(1)
    original['heading'] = original['heading'].shift(1)
    original['navstat'] = original['navstat'].shift(1)
    original.loc[0,['cog','sog','rot','heading','navstat']]=[0,0,0,0,0]

    return original

def two_days_past_course(original):

    original= original.reset_index(drop=True)

    original['prev_lat'] = original['latitude'].shift(100)
    original['prev_lon'] = original['longitude'].shift(100)
    original['time_2'] = original['time'].shift(100)
    original['cog'] = original['cog'].shift(100)
    original['sog'] = original['sog'].shift(100)
    original['rot'] = original['rot'].shift(100)
    original['heading'] = original['heading'].shift(100)
    original['navstat'] = original['navstat'].shift(100)
    original = original.dropna()

    return original

def three_days_past_course(original):

    original= original.reset_index(drop=True)

    original['prev_lat'] = original['latitude'].shift(150)
    original['prev_lon'] = original['longitude'].shift(150)
    original['time_2'] = original['time'].shift(150)
    original['cog'] = original['cog'].shift(150)
    original['sog'] = original['sog'].shift(150)
    original['rot'] = original['rot'].shift(150)
    original['heading'] = original['heading'].shift(150)
    original['navstat'] = original['navstat'].shift(150)
    original = original.dropna()

    return original

def add_data(original,train):
    ships = original['vesselId'].unique()
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new['time_2'] = new['time']
    ex = train.copy()
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = two_days_past_course(one_ship)
        ex = pd.concat([ex, new_filtered], ignore_index=True)
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = three_days_past_course(one_ship)
        ex = pd.concat([ex, new_filtered], ignore_index=True)
    ex['time_dif'] = (ex['time']-ex['time_2']).dt.total_seconds()/3600
    ex = ex[ex['time_dif'] <= 125]
    ex = ex.reset_index(drop=True)

    return ex

def adapting_training_data(original):
    ships = original['vesselId'].unique()
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new['time_2'] = new['time']
    new = new.reset_index()
    final = pd.DataFrame(columns=new.columns)
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = past_course(one_ship)
        final = pd.concat([final, new_filtered], ignore_index=True)
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    final = final.drop(['level_0'],axis=1)
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600

    return(final)

def adapting_test_data (evaluation,training):
    evalu=evaluation.copy()
    evalu['time'] = pd.to_datetime(evalu['time'])
    evalu['time_2'] = evalu['time']
    evalu[['cog','sog','heading','navstat','latitude','longitude']] = 0.1
    evalu['etaRaw'] = evalu['time']
    evalu['portId'] ='1'

    train=training.copy()
    train['time'] = pd.to_datetime(train['time'])
    train['time_2'] = train['time']
    evalu = evalu.drop(['ID','scaling_factor'],axis=1)

    final = pd.concat([train, evalu], ignore_index=True)
    x = adapting_training_data(final)
    ships = x['vesselId'].unique()
    last_one = pd.DataFrame(columns=x.columns)
    for c in ships:
        one_ship = x[x['vesselId'] == c].copy()
        one_ship=one_ship.reset_index()
        one_ship['etaRaw'] = one_ship['etaRaw'].shift(1)
        one_ship['portId'] = one_ship['portId'].shift(1)
        last_one = pd.concat([last_one, one_ship], ignore_index=True)
    last_one = last_one.sort_values(by='index')
    last_one = last_one.drop(['index'],axis=1)
    last_one = last_one.reset_index(drop=True)
    evalu = last_one.iloc[len(train):]
    ships = evalu['vesselId'].unique()
    final = final.iloc[0:0]
    for c in ships:
        filtered = evalu[evalu['vesselId'] == c].copy()
        filtered = filtered.reset_index()
        filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
        final = pd.concat([final,filtered], ignore_index=True)
    
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    return final

In [32]:
test =  adapting_test_data(X_evaluation,X_original)
train = adapting_training_data(X_original)
train = add_data(X_original, train)

  final = pd.concat([final, new_filtered], ignore_index=True)
  last_one = pd.concat([last_one, one_ship], ignore_index=True)
  last_one = pd.concat([last_one, one_ship], ignore_index=True)
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','pr

In [33]:
train['time_to_destiny'] = (train['etaRaw']-train['time']).dt.total_seconds()/3600
test['time_to_destiny'] = (test['etaRaw']-test['time']).dt.total_seconds()/3600
extra_ports.rename(columns={'latitude': 'port_lat','longitude': 'port_lon'}, inplace=True)
extra_ports = extra_ports.drop(['name','portLocation','UN_LOCODE','countryName','ISO'],axis=1)
train = pd.merge(train, extra_ports, on='portId', how='inner')
test = pd.merge(test, extra_ports, on='portId', how='inner')

In [34]:
def calcular_distancias_separadas(lat1, lon1, lat2, lon2):
    # Radio de la Tierra en kilómetros
    R = 6371.0
    
    # Convertir grados a radianes
    lat1_rad = np.radians(lat1)
    lon1_rad = np.radians(lon1)
    lat2_rad = np.radians(lat2)
    lon2_rad = np.radians(lon2)
    
    # Diferencias de latitud y longitud en radianes
    dlat_rad = lat2_rad - lat1_rad
    dlon_rad = lon2_rad - lon1_rad
    
    # Distancia en la dirección de la latitud (norte-sur)
    distancia_lat_km = R * dlat_rad
    
    # Distancia en la dirección de la longitud (este-oeste), ajustada por la latitud media
    latitud_media = (lat1_rad + lat2_rad) / 2
    distancia_lon_km = R * dlon_rad * np.cos(latitud_media)
    
    return distancia_lat_km, distancia_lon_km

# Aplicar la función y asignar los resultados a nuevas columnas en el DataFrame
train['dist_lat_km'], train['dist_lon_km'] = calcular_distancias_separadas(train['prev_lat'], train['prev_lon'], train['port_lat'], train['port_lon'])
test['dist_lat_km'], test['dist_lon_km'] = calcular_distancias_separadas(test['prev_lat'], test['prev_lon'], test['port_lat'], test['port_lon'])

In [35]:
test_full = test.copy()
test_full = test_full.drop(['cog','sog','rot','heading','navstat','time','etaRaw','vesselId','portId','time_2','latitude','longitude'],axis=1)
train_full = train.copy()
train_full = train.drop(['cog','sog','rot','heading','navstat','time','etaRaw','vesselId','portId','time_2'],axis=1)
train_lon = train_full.loc[:,['longitude']]
train_lat = train_full.loc[:,['latitude']]
train_full = train_full.drop(['latitude','longitude'],axis=1)

del train,test, extra_ports, X_evaluation, X_original

In [36]:
booster_lon = xgb.XGBRegressor(eval_metric='rmse', n_estimators= 300,learning_rate= 0.2,max_depth= 5,subsample= 1,colsample_bytree= 1,gamma= 0)
booster_lon.fit(train_full, train_lon.values.ravel())

booster_lat = xgb.XGBRegressor(eval_metric='rmse',n_estimators= 300,learning_rate= 0.2,max_depth= 5,subsample= 1,colsample_bytree= 0.8,gamma= 0.2)
booster_lat.fit(train_full, train_lat.values.ravel())

booster_longitude_predicted = booster_lon.predict(test_full)
booster_latitude_predicted = booster_lat.predict(test_full)

booster_prediction = pd.DataFrame({'longitude_predicted':booster_longitude_predicted,'latitude_predicted':booster_latitude_predicted})
del booster_lon, booster_longitude_predicted,booster_lat,booster_latitude_predicted

In [None]:
forest_lon = RandomForestRegressor(n_estimators=100, random_state=42)
forest_lon.fit(train_full, train_lon.values.ravel())
forest_longitude_predicted = forest_lon.predict(test_full)

forest_prediction = pd.DataFrame({'longitude_predicted':forest_longitude_predicted})
del forest_lon,forest_longitude_predicted

In [None]:
forest_lat = RandomForestRegressor(n_estimators=100, random_state=42)
forest_lat.fit(train_full, train_lat.values.ravel())
forest_latitude_predicted = forest_lat.predict(test_full)

forest_prediction['latitude_predicted'] = forest_latitude_predicted
del forest_lat,forest_latitude_predicted

In [None]:
x = booster_prediction.copy()
y = forest_prediction.copy()

mixed_prediction= x.copy()
mixed_prediction['latitude_predicted'] = (x['latitude_predicted']+y['latitude_predicted'])/2
mixed_prediction['longitude_predicted'] = (x['longitude_predicted']+y['longitude_predicted'])/2


mixed_prediction = mixed_prediction.reset_index()
mixed_prediction.rename(columns={'index': 'ID'}, inplace=True)
mixed_prediction.to_csv('data/mixed_prediction_mixed.csv', index=False)