In [3]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,r2_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from joblib import dump, load

Adapting datasets to be interpreted

In [4]:
X_sample = pd.read_csv("data/ais_train.csv", delimiter='|')
X_sample.to_csv('data/ais_train_modified.csv', index=False)
extra_vessels = pd.read_csv("data/vessels.csv", on_bad_lines='skip', delimiter='|')
extra_vessels.to_csv('data/vessels_modified.csv', index=False)
extra_ports = pd.read_csv("data/ports.csv", on_bad_lines='skip', delimiter='|')
extra_ports.to_csv('data/ports_modified.csv', index=False)
extra_schedules = pd.read_csv("data/schedules_to_may_2024.csv", on_bad_lines='skip', delimiter='|')
extra_schedules.to_csv('data/schedules_to_may_2024_modified.csv', index=False)

In [5]:
X_evaluation = pd.read_csv("data/ais_test.csv",)
extra_ports = pd.read_csv("data/ports_modified.csv")
extra_vessels = pd.read_csv("data/vessels_modified.csv")
extra_schedules = pd.read_csv("data/schedules_to_may_2024_modified.csv")
X_original = pd.read_csv("data/ais_train_modified.csv")

Changing current data into previous data for the train file

In [27]:
def past_course(original):
    original=original.reset_index()

    original['prev_lat'] = original['latitude'].shift(1).fillna(original['latitude'].iloc[0])
    original['prev_lon'] = original['longitude'].shift(1).fillna(original['longitude'].iloc[0])
    original['time_2'] = original['time'].shift(1).fillna(original['time'].iloc[0])
    original['cog'] = original['cog'].shift(1)
    original['sog'] = original['sog'].shift(1)
    original['rot'] = original['rot'].shift(1)
    original['heading'] = original['heading'].shift(1)
    original['navstat'] = original['navstat'].shift(1)
    original.loc[0,['cog','sog','rot','heading','navstat']]=[0,0,0,0,0]

    return original

def adapting_training_data(original):
    ships = original['vesselId'].unique()
    new = original.copy()
    new['time'] = pd.to_datetime(new['time'])
    new['prev_lat']=original['latitude']
    new['prev_lon']=original['longitude']
    new['time_2'] = new['time']
    new = new.reset_index()
    final = pd.DataFrame(columns=new.columns)
    for c in ships:
        one_ship = new[new['vesselId'] == c].copy()
        new_filtered = past_course(one_ship)
        final = pd.concat([final, new_filtered], ignore_index=True)
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    final = final.drop(['level_0'],axis=1)
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600
    return(final)

def adapting_test_data (evaluation,training):
    evalu=evaluation.copy()
    evalu['time'] = pd.to_datetime(evalu['time'])
    evalu['time_2'] = evalu['time']
    evalu[['cog','sog','heading','navstat','latitude','longitude']] = 0.1
    evalu['etaRaw'] = evalu['time']
    evalu['portId'] ='1'

    train=training.copy()
    train['time'] = pd.to_datetime(train['time'])
    train['time_2'] = train['time']
    evalu = evalu.drop(['ID','scaling_factor'],axis=1)

    final = pd.concat([train, evalu], ignore_index=True)
    x = adapting_training_data(final)
    ships = x['vesselId'].unique()
    last_one = pd.DataFrame(columns=x.columns)
    for c in ships:
        one_ship = x[x['vesselId'] == c].copy()
        one_ship=one_ship.reset_index()
        one_ship['etaRaw'] = one_ship['etaRaw'].shift(1)
        one_ship['portId'] = one_ship['portId'].shift(1)
        last_one = pd.concat([last_one, one_ship], ignore_index=True)
    last_one = last_one.sort_values(by='index')
    last_one = last_one.drop(['index'],axis=1)
    last_one = last_one.reset_index(drop=True)
    evalu = last_one.iloc[len(train):]
    ships = evalu['vesselId'].unique()
    final = final.iloc[0:0]
    for c in ships:
        filtered = evalu[evalu['vesselId'] == c].copy()
        filtered = filtered.reset_index()
        filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
        final = pd.concat([final,filtered], ignore_index=True)
    
    final['time_dif'] = (final['time']-final['time_2']).dt.total_seconds()/3600
    final = final.sort_values(by='index')
    final = final.drop(['index'],axis=1)
    final = final.reset_index(drop=True)
    return final

In [28]:
test =  adapting_test_data(X_evaluation,X_original)
train = adapting_training_data(X_original)

  final = pd.concat([final, new_filtered], ignore_index=True)
  last_one = pd.concat([last_one, one_ship], ignore_index=True)
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']] = filtered.iloc[0][['time_2','cog','sog','rot','heading','navstat','etaRaw','prev_lat','prev_lon','portId']]
  filtered[['time_2','cog','sog','

In [29]:
train['etaRaw'] = pd.to_datetime('2024-' + train['etaRaw']+ ':00', format='%Y-%m-%d %H:%M:%S')
test['etaRaw'] = pd.to_datetime('2024-' + test['etaRaw']+ ':00', format='%Y-%m-%d %H:%M:%S')
train['time_to_destiny'] = (train['etaRaw']-train['time']).dt.total_seconds()/3600
test['time_to_destiny'] = (test['etaRaw']-test['time']).dt.total_seconds()/3600
extra_ports.rename(columns={'latitude': 'port_lat','longitude': 'port_lon'}, inplace=True)
extra_ports = extra_ports.drop(['name','portLocation','UN_LOCODE','countryName','ISO'],axis=1)
train = pd.merge(train, extra_ports, on='portId', how='inner')
test = pd.merge(test, extra_ports, on='portId', how='inner')


ValueError: time data "2024-00-00 23:00:00" doesn't match format "%Y-%m-%d %H:%M:%S", at position 467. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [25]:
train['etaRaw'] = '2024-' + train['etaRaw']+ ':00:00'

In [26]:
train.head()

Unnamed: 0,time,cog,sog,rot,heading,navstat,etaRaw,latitude,longitude,vesselId,portId,prev_lat,prev_lon,time_2,time_dif
0,2024-01-01 00:00:25,0.0,0.0,0.0,0.0,0.0,2024-01-09 23:00:00:00,-34.7437,-57.8513,61e9f3a8b937134a3c4bfdf7,61d371c43aeaecc07011a37f,-34.7437,-57.8513,2024-01-01 00:00:25,0.0
1,2024-01-01 00:00:36,0.0,0.0,0.0,0.0,0.0,2024-12-29 20:00:00:00,8.8944,-79.47939,61e9f3d4b937134a3c4bff1f,634c4de270937fc01c3a7689,8.8944,-79.47939,2024-01-01 00:00:36,0.0
2,2024-01-01 00:01:45,0.0,0.0,0.0,0.0,0.0,2024-01-02 09:00:00:00,39.19065,-76.47567,61e9f436b937134a3c4c0131,61d3847bb7b7526e1adf3d19,39.19065,-76.47567,2024-01-01 00:01:45,0.0
3,2024-01-01 00:03:11,0.0,0.0,0.0,0.0,0.0,2024-12-31 20:00:00:00,-34.41189,151.02067,61e9f3b4b937134a3c4bfe77,61d36f770a1807568ff9a126,-34.41189,151.02067,2024-01-01 00:03:11,0.0
4,2024-01-01 00:03:51,0.0,0.0,0.0,0.0,0.0,2024-01-25 12:00:00:00,35.88379,-5.91636,61e9f41bb937134a3c4c0087,634c4de270937fc01c3a74f3,35.88379,-5.91636,2024-01-01 00:03:51,0.0


Changing the dates into a different columns

In [13]:
'''
X_original['time'] = pd.to_datetime(X_original['time'])
X_original['year'] = X_original['time'].dt.year
X_original['month'] = X_original['time'].dt.month
X_original['day'] = X_original['time'].dt.day
X_original['hour'] = X_original['time'].dt.hour
X_original['minute'] = X_original['time'].dt.minute
X_original['second'] = X_original['time'].dt.second

X_original['etaRaw'] = pd.to_datetime(X_original['etaRaw'], format='%m-%d %H:%M', errors='coerce')
X_original['eta_month'] = X_original['etaRaw'].dt.month
X_original['eta_day'] = X_original['etaRaw'].dt.day
X_original['eta_hour'] = X_original['etaRaw'].dt.hour
X_original['eta_minute'] = X_original['etaRaw'].dt.minute

X_original = X_original.drop(['time','etaRaw'], axis=1)
'''

Splitting the dataset into a train/test one and x-(all the info)  y-(the results longitud/latitud)

In [14]:
#Generating one test file
X_original = X_original.drop(['vesselId','portId'], axis=1)

x_test=X_original.sample(frac=0.2, random_state=42)
x_train=X_original.drop(x_test.index)

y_test_lon=x_test.loc[:,['longitude']]
y_test_lat=x_test.loc[:,['latitude']]
x_test=x_test.drop(['longitude','latitude'],axis=1)

y_train_lon=x_train.loc[:,['longitude']]
y_train_lat=x_train.loc[:,['latitude']]
x_train = x_train.drop(['longitude','latitude'],axis=1)

In [None]:
#verification (length must match)
print(f"x_test shape: {x_test.shape}")
print(f"y_test_lon shape: {y_test_lon.shape}")
print(f"y_test_lat shape: {y_test_lat.shape}")

In [15]:
#Generating three sets of training data to train 3 different models
#Split of the data in 3 equal slices
x_original_1 = X_original.sample(frac=1/3, random_state=42)
X_original_trans = X_original.drop(x_original_1.index)
x_original_2 = X_original_trans.sample(frac=0.5, random_state=42)
x_original_3 = X_original_trans.drop(x_original_2.index)

#Adapting the slices into x and y
#slice 1
y_test_1_lon = x_original_1.loc[:,['longitude']]
y_test_1_lat = x_original_1.loc[:,['latitude']]
x_test_1 = x_original_1.drop(['longitude','latitude'],axis=1)
x_train_1 = X_original.drop(x_original_1.index)
y_train_1_lon = x_train_1.loc[:,['longitude']]
y_train_1_lat = x_train_1.loc[:,['latitude']]
x_train_1 = x_train_1.drop(['longitude','latitude'],axis=1)
#slice 2
y_test_2_lon=x_original_2.loc[:,['longitude']]
y_test_2_lat=x_original_2.loc[:,['latitude']]
x_test_2 = x_original_2.drop(['longitude','latitude'],axis=1)
x_train_2 = X_original.drop(x_original_2.index)
y_train_2_lon = x_train_2.loc[:,['longitude']]
y_train_2_lat = x_train_2.loc[:,['latitude']]
x_train_2 = x_train_2.drop(['longitude','latitude'],axis=1)
#slice 3
y_test_3_lon=x_original_3.loc[:,['longitude']]
y_test_3_lat=x_original_3.loc[:,['latitude']]
x_test_3 = x_original_3.drop(['longitude','latitude'],axis=1)
x_train_3 = X_original.drop(x_original_3.index)
y_train_3_lon = x_train_3.loc[:,['longitude']]
y_train_3_lat = x_train_3.loc[:,['latitude']]
x_train_3 = x_train_3.drop(['longitude','latitude'],axis=1)

In [18]:
#verification (length must match)
print(f"X_original shape: {X_original.shape}")
print(f"x_train_1 shape: {x_train_1.shape}")
print(f"y_train_1_lon shape: {y_train_1_lon.shape}")
print(f"y_train_1_lat shape: {y_train_1_lat.shape}")
print(f"x_test_1 shape: {x_test_1.shape}")
print(f"y_test_1_lon shape: {y_test_1_lon.shape}")
print(f"y_test_1_lat shape: {y_test_1_lat.shape}")



X_original shape: (1522065, 17)
x_train_1 shape: (1014710, 15)
y_train_1_lon shape: (1014710, 1)
y_train_1_lat shape: (1014710, 1)
x_test_1 shape: (507355, 15)
y_test_1_lon shape: (507355, 1)
y_test_1_lat shape: (507355, 1)


Function to evaluate results

In [7]:
# define a utility function to print out the prediction performance
def evaluate_result(y_test, y_pred):
    x= mean_absolute_error(y_test, y_pred)
    y = r2_score(y_test, y_pred)
    return [x,y]

In [23]:
past_pos_lon = train_check.loc[:,['prev_lon']]
past_pos_lat = train_check.loc[:,['prev_lat']]
past_pos_lon =past_pos_lon.rename(columns={'prev_lon': 'longitude'})
past_pos_lat =past_pos_lat.rename(columns={'prev_lat': 'latitude'})
past_lon = evaluate_result(train_check_lon,past_pos_lon)
past_lat = evaluate_result(train_check_lat,past_pos_lat)

In [38]:
train = train.drop(['time','etaRaw','vesselId','portId','time_2'],axis=1)
train_train = train.sample(frac=0.8, random_state=42)
train_train_lon = train_train.loc[:,['longitude']]
train_train_lat = train_train.loc[:,['latitude']]
train_train = train_train.drop(['longitude','latitude'],axis=1)
train_check=train.drop(train_train.index)
train_check_lon = train_check.loc[:,['longitude']]
train_check_lat = train_check.loc[:,['latitude']]
train_check = train_check.drop(['longitude','latitude'],axis=1)

Model generation of random forest

In [57]:
clf_lat = RandomForestRegressor(n_estimators=100, random_state=42)
clf_lat.fit(train_train, train_train_lat.values.ravel())

In [42]:
clf = RandomForestRegressor(n_estimators=100, random_state=42)
clf.fit(train_train, train_train_lon.values.ravel())

In [39]:
model_lon = xgb.XGBRegressor(eval_metric='rmse')
model_lon.fit(train_train, train_train_lon)

In [40]:
predXGBRegressor = model_lon.predict(train_check)
booster = evaluate_result(train_check_lon,predXGBRegressor)

In [56]:
predforest = clf.predict(t)


In [44]:
print('Last position:',past_lon)
print("XGBregressor:",booster)
print("XGBregressor_opt:",booster_opt)
print("Forestregressor",forest)


Last position: [0.23379592586387568, 0.9941142269088236]
XGBregressor: [0.28474621131379013, 0.9980247020721436]
XGBregressor_opt: [0.5205514057584346, 0.9943536520004272]
Forestregressor [0.06910847985185167, 0.9993316919666602]


In [51]:
test.head()
t = test.copy()
t = t.drop(['time','etaRaw','vesselId','portId','time_2','latitude','longitude'],axis=1)
train_lon = train.loc[:,['longitude']]
train_lat = train.loc[:,['latitude']]
train = train.drop(['latitude','longitude'],axis=1)


In [52]:
forest_lon = RandomForestRegressor(n_estimators=100, random_state=42)
forest_lon.fit(train, train_lon.values.ravel())
forest_lat = RandomForestRegressor(n_estimators=100, random_state=42)
forest_lat.fit(train, train_lat.values.ravel())
longitude_predicted = forest_lon.predict(t)
latitude_predicted = forest_lat.predict(t)

In [None]:
t.head()

In [54]:
forest_prediction = pd.DataFrame({'longitude_predicted':longitude_predicted,'latitude_predicted':latitude_predicted})
forest_prediction = forest_prediction.reset_index()
forest_prediction.rename(columns={'index': 'ID'}, inplace=True)
forest_prediction.to_csv('data/forest_prediction.csv', index=False)

In [19]:
# Definir el modelo
model = xgb.XGBRegressor()

# Definir los hiperparámetros a ajustar
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search_longitude = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, verbose=1, n_jobs=-1)

# Ajustar el modelo a los datos de longitud
grid_search_longitude.fit(train_train, train_train_lon)

# Obtener los mejores parámetros
best_params_longitude = grid_search_longitude.best_params_
print(f"Mejores parámetros para longitud: {best_params_longitude}")

Fitting 3 folds for each of 108 candidates, totalling 324 fits
Mejores parámetros para longitud: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 1.0}


In [20]:
model2 = grid_search_longitude.best_estimator_
predXGBRegressor_opt = model2.predict(train_check)
booster_opt = evaluate_result(train_check_lon,predXGBRegressor_opt)


Hyperparameter tuning for random forest regressor

In [69]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# model definition
rf_regressor = RandomForestRegressor()

# Search settings
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)

# adjust the model
grid_search.fit(train_train, train_train_lon)

# see best parameters
print(grid_search.best_params_)
print("Mejor puntuación: ", grid_search.best_score_)

KeyboardInterrupt: 

Function to build the submission file

In [1]:
def result_merger(longitude,latitude):
    submission = pd.merge(longitude, latitude, left_index=True, right_index=True, how='inner')
    submission.insert(0, 'ID', range(len(df)))
    submission.to_csv('submission.csv', index=False)
    return submission