In [1]:
import pandas as pd
import numpy as np
import re
from datetime import timedelta
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
simplefilter(action="ignore", category=FutureWarning)
simplefilter(action="ignore", category=UserWarning)
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

import loadBar
from csv_parser import CSVParser
from globals import RESOURCE_FOLDER, STEPSIZES, OUTPUT_WINDOW, INPUT_WINDOW, OUTPUT_FORECAST, DELETEABLE_COLUMNS, ONE_HOT_COLUMNS
from markovSquares import apply_markov
from feature_engineer import FeatureEngineer
from exploring_data_functions import *

from searoutePointFinder import fill_with_proximity

In [2]:
# Create the project CSV parser for RESOURCE_FOLDER and fetch the training frame through it.
parser = CSVParser(RESOURCE_FOLDER)
total_df = parser.retrieve_training_data()

In [3]:
# Convert the 'time' column to pandas datetimes so it can be sorted and subtracted.
total_df["time"] = pd.to_datetime(total_df['time'])

In [4]:
def calculate_time_diffs_within_window(df, timewindow=timedelta(days=5), n=2, random_state=None):
    """Pair each observation with up to ``n`` later observations of the same vessel.

    Rows are processed per ``vesselId`` in chronological order. Every row is
    matched with rows of the same vessel that are strictly later but at most
    ``timewindow`` ahead; if more than ``n`` such rows exist, ``n`` of them are
    drawn at random. Each (current, future) pair becomes one output row holding
    the current row's columns plus the future position and the elapsed time.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``vesselId``, ``time`` (datetime), ``latitude`` and
        ``longitude``.
    timewindow : datetime.timedelta
        Maximum look-ahead between the current row and a candidate future row.
    n : int
        Maximum number of future rows paired with one current row.
    random_state : int, numpy random generator or None
        Forwarded to ``DataFrame.sample`` so the sampling can be reproduced.
        ``None`` keeps the previous non-deterministic behaviour.

    Returns
    -------
    result_df : pandas.DataFrame
        One row per pair: the original columns followed by ``next_latitude``,
        ``next_longitude``, ``diff_seconds``, ``diff_minutes``, ``diff_hours``
        and ``diff_days``. No ``time_diff`` column is kept, matching the other
        time-difference helpers in this notebook, so the training and test
        frames expose the same feature columns.
    last_rows : pandas.DataFrame
        The chronologically last observation of every vessel, index reset.
    """
    derived_columns = ['next_latitude', 'next_longitude', 'diff_seconds']
    records = []

    for _, vessel_df in df.groupby('vesselId'):
        vessel_df = vessel_df.sort_values(by='time')
        row_count = len(vessel_df)

        for i in range(row_count):
            loadBar.load_bar(row_count, i)
            current_row = vessel_df.iloc[i]
            current_time = current_row['time']

            # Build the mask on the sliced frame itself so it lines up with the
            # rows being filtered instead of relying on index realignment.
            later_rows = vessel_df.iloc[i + 1:]
            in_window = ((later_rows['time'] > current_time)
                         & (later_rows['time'] <= current_time + timewindow))
            future_rows = later_rows[in_window]

            if future_rows.empty:
                # A gap longer than the window only affects this row; rows after
                # the gap may still have valid futures, so keep scanning.
                continue
            if len(future_rows) > n:
                future_rows = future_rows.sample(n=n, random_state=random_state)

            for future_lat, future_lon, future_time in zip(future_rows['latitude'],
                                                           future_rows['longitude'],
                                                           future_rows['time']):
                record = current_row.to_dict()
                record['next_latitude'] = future_lat
                record['next_longitude'] = future_lon
                record['diff_seconds'] = (future_time - current_time).total_seconds()
                records.append(record)

    # Build the frame once from the collected records; concatenating inside the
    # loop is quadratic and concatenating a Series adds a column, not a row.
    column_order = list(df.columns) + [c for c in derived_columns if c not in df.columns]
    result_df = pd.DataFrame(records, columns=column_order).infer_objects()

    result_df['diff_seconds'] = result_df['diff_seconds'].astype(float)
    result_df['diff_minutes'] = result_df['diff_seconds'] / 60
    result_df['diff_hours'] = result_df['diff_seconds'] / 3600
    result_df['diff_days'] = result_df['diff_seconds'] / 86400
    result_df = result_df.reset_index(drop=True)

    # The last observation of a vessel never has a future row, so it is never a
    # "current" row above and needs no removal from result_df. Sort first so
    # tail(1) really is the latest observation even when df arrives unordered.
    last_rows = (df.sort_values(by=['vesselId', 'time'])
                   .groupby('vesselId')
                   .tail(1)
                   .reset_index(drop=True))

    return result_df, last_rows

In [5]:
def calculate_time_diffs_with_coords(df):
    """Attach the next position and the time gap to it for every row of a vessel.

    The frame is ordered by ``vesselId`` then ``time``. Every row receives the
    elapsed time to the following row of the same vessel (in seconds, minutes,
    hours and days) together with that row's latitude and longitude. The final
    row of each vessel has no successor; it is removed from the main frame and
    returned separately, without the derived columns.

    Returns a tuple ``(df, last_rows)``, both with a fresh 0..n-1 index.
    """
    ordered = df.sort_values(by=['vesselId', 'time']).reset_index(drop=True)

    # Taken before any derived column exists, so these rows keep the input schema.
    final_rows = ordered.groupby('vesselId').tail(1)
    print(final_rows.head())

    # diff() is NaT on the first row of each vessel, so after shifting up by one
    # the last row of a vessel gets NaT instead of a gap into the next vessel.
    gap = ordered.groupby('vesselId')['time'].diff().shift(-1)
    gap_seconds = gap.dt.total_seconds()

    ordered = ordered.assign(
        time_diff=gap,
        diff_seconds=gap_seconds,
        diff_minutes=gap_seconds / 60,
        diff_hours=gap_seconds / 3600,
        diff_days=gap_seconds / 86400,
        next_latitude=ordered.groupby('vesselId')['latitude'].shift(-1),
        next_longitude=ordered.groupby('vesselId')['longitude'].shift(-1),
    )

    # Drop the successor-less rows and the helper timedelta column.
    trimmed = (ordered.drop(final_rows.index)
                      .drop(columns=['time_diff'])
                      .reset_index(drop=True))

    return trimmed, final_rows.reset_index(drop=True)

In [None]:
# Build the training pairs and keep each vessel's last row separately.
# NOTE: this rebinds total_df, so re-running the cell alone passes the already
# transformed frame back into the function.
total_df, last_rows = calculate_time_diffs_within_window(total_df)

[==------------------] 14.08% complete

In [None]:
# Feature functions applied to the training frame here and to the test frame
# further down; the names in the trailing comment are currently not applied.
feature_engineering_functions = [categorize_navstat, numerize_UN_LOCODE, numerize_ISO] #, apply_markov, minutes_to_etaParsed, categorize_rot, type_dummies


# Run the selected functions through the project's FeatureEngineer and take the resulting frame back.
feature_engineer = FeatureEngineer(total_df)
feature_engineer.apply_features(feature_engineering_functions)
total_df = feature_engineer.get_dataframe()

# Move the timestamp into the index.
# NOTE(review): done in place; a second run of this cell would likely look for
# a 'time' column that is already in the index - confirm before re-running.
total_df.set_index("time", inplace=True)

In [None]:

def remove_non_numeric_columns(df):
    """Return a frame holding only the bool, float and int columns of ``df``."""
    kept_dtypes = ['bool', 'float', 'int']
    return df.select_dtypes(include=kept_dtypes)

# Keep only the bool/float/int columns of the training frame.
total_df = remove_non_numeric_columns(total_df)


In [None]:
# Preview the first rows of the numeric training frame.
print(total_df.head())

                       cog   sog  rot  heading  navstat  latitude  longitude  \
time                                                                           
2024-01-12 14:07:47  308.1  17.1   -6      316        0   7.50361   77.58340   
2024-01-12 14:31:00  307.6  17.3    5      313        0   7.57302   77.49505   
2024-01-12 14:57:23  306.8  16.9    5      312        0   7.65043   77.39404   
2024-01-12 15:18:48  307.9  16.9    6      313        0   7.71275   77.31394   
2024-01-12 15:39:47  307.0  16.3    7      313        0   7.77191   77.23585   

                     portLongitude  portLatitude   CEU  ...  rampCapacity  \
time                                                    ...                 
2024-01-12 14:07:47      80.341111     13.263333  6500  ...         150.0   
2024-01-12 14:31:00      72.885278     18.941944  6500  ...         150.0   
2024-01-12 14:57:23      72.885278     18.941944  6500  ...         150.0   
2024-01-12 15:18:48      72.885278     18.941944  6500

In [None]:
def train_test_split(df, perc1, perc2):
    """Split ``df`` by row position into train, validation and test parts.

    The target columns ``next_latitude`` and ``next_longitude`` are separated
    from the remaining feature columns. Rows before the ``perc1`` fraction form
    the training part, rows between the ``perc1`` and ``perc2`` fractions the
    validation part, and the remaining rows the test part. Row order is kept;
    nothing is shuffled.

    Returns ``X_train, y_train, X_val, y_val, X_test, y_test``.
    """
    target_columns = ['next_latitude', 'next_longitude'] #'next_latitude', 'next_longitude'
    targets = df[target_columns]
    features = df.drop(columns=target_columns)

    # Positional boundaries, rounded the same way for features and targets.
    row_total = features.shape[0]
    train_end = int(np.round(row_total * perc1))
    val_end = int(np.round(row_total * perc2))

    train_rows = slice(None, train_end)
    val_rows = slice(train_end, val_end)
    test_rows = slice(val_end, None)

    return (
        features.iloc[train_rows, :], targets.iloc[train_rows, :],
        features.iloc[val_rows, :], targets.iloc[val_rows, :],
        features.iloc[test_rows, :], targets.iloc[test_rows, :],
    )

In [None]:
# Positional split: first 85% of rows for training, next 14% for validation, last 1% held out.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(total_df, 0.85, 0.99)

In [None]:
# Remember the training feature columns; make_predictions reads this global to select the test features.
X_train_cols= X_train.columns

In [None]:
# Show the training features that are handed to XGBoost.
print(X_train)



# Wrap the splits in DMatrix objects; the two target columns are passed together as the label.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest_X = xgb.DMatrix(X_test)

params = {"objective": "reg:squarederror",
            "max_depth": 7,
            "booster": "gbtree",
            # XGBoost spells this parameter `tree_method`. The earlier hyphenated
            # key "tree-method" is not a recognised parameter, so the GPU
            # histogram builder was never actually selected. This setting needs
            # a CUDA-enabled XGBoost build; use "hist" on a CPU-only machine.
            "tree_method": "gpu_hist",
            "colsample_bynode": 0.4,
            "num_parallel_tree": 50,
            "subsample": 0.6,
            "seed": 42,
            "learning_rate": 1,
            #"n_estimators": 100,
            #"reg_alpha": 0.1,
            #"reg_lambda": 0.1,
            #"n_jobs": -1,
            "verbosity": 1
            }

# Before tuning: subsample 0.8, num_parallel_tree 100, colsample_bynode 0.5, max_depth 5.

# After the first round of tuning: subsample 0.6, num_parallel_tree 50, colsample_bynode 0.4, max_depth 7.

num_boost_round = 30

# Tried 50 rounds, but that stagnated quickly, so reduced to 30 again.


# Stop when the validation metric has not improved for this many consecutive rounds.
early_stopping_rounds = 3

print(dtrain)

model = xgb.train(params, dtrain, num_boost_round, evals=[(dval, "validation")], 
                  early_stopping_rounds=early_stopping_rounds, verbose_eval=True)


# Predictions for the held-out rows of the training data.
preds = model.predict(dtest_X)

                       cog   sog  rot  heading  navstat  latitude  longitude  \
time                                                                           
2024-01-12 14:07:47  308.1  17.1   -6      316        0   7.50361   77.58340   
2024-01-12 14:31:00  307.6  17.3    5      313        0   7.57302   77.49505   
2024-01-12 14:57:23  306.8  16.9    5      312        0   7.65043   77.39404   
2024-01-12 15:18:48  307.9  16.9    6      313        0   7.71275   77.31394   
2024-01-12 15:39:47  307.0  16.3    7      313        0   7.77191   77.23585   
...                    ...   ...  ...      ...      ...       ...        ...   
2024-03-17 20:51:50  308.0  19.7    0      308        0  41.46054   12.36343   
2024-03-17 21:08:09  325.3  19.1    0      325        0  41.52613   12.28797   
2024-03-17 21:35:38  311.3  19.6    0      312        0  41.63601   12.15595   
2024-03-17 21:55:50  312.9  19.6    0      314        0  41.70853   12.04483   
2024-03-17 22:17:02  312.3  19.8    0   

In [None]:
def calculate_time_diff_with_last_rows(test_df, last_rows):
    """Join each test row with its vessel's last known row and add the elapsed time.

    Parameters
    ----------
    test_df : pandas.DataFrame
        Needs the columns ``ID``, ``vesselId`` and ``time`` (datetime); only
        these three columns are carried over from it.
    last_rows : pandas.DataFrame
        One row per ``vesselId`` with a ``time`` column (datetime) plus the
        feature columns of that vessel's last observation.

    Returns
    -------
    pandas.DataFrame
        One row per test row, in ``test_df`` order: ``ID``, ``vesselId``,
        ``time``, the remaining columns of ``last_rows``, then ``diff_seconds``,
        ``diff_minutes``, ``diff_hours`` and ``diff_days`` (test time minus the
        vessel's last observed time). Test rows whose vessel is missing from
        ``last_rows`` are kept, with NaN in the joined and derived columns.
    """
    # Left join: every test ID must survive so the submission stays complete.
    # The default inner join silently dropped rows for vessels without a last row.
    merged_df = pd.merge(
        test_df[["ID", "vesselId", "time"]],
        last_rows,
        on='vesselId',
        how='left',
        suffixes=('', '_last')
    )
    print(merged_df.columns)

    # Seconds elapsed between the requested time and the vessel's last observation.
    elapsed_seconds = (merged_df['time'] - merged_df['time_last']).dt.total_seconds()

    # Same elapsed time expressed in each unit.
    merged_df['diff_seconds'] = elapsed_seconds
    merged_df['diff_minutes'] = elapsed_seconds / 60
    merged_df['diff_hours'] = elapsed_seconds / 3600
    merged_df['diff_days'] = elapsed_seconds / 86400

    # The last-observation timestamp is only needed for the subtraction above.
    merged_df = merged_df.drop(columns=['time_last'])

    return merged_df

In [None]:
# Fetch the test frame through the same parser instance used for the training data.
test_df = parser.retrieve_test_data()

In [None]:
# Preview the raw test rows.
print(test_df.head())

   ID                  vesselId                time  scaling_factor
0   0  61e9f3aeb937134a3c4bfe3d 2024-05-08 00:03:16             0.3
1   1  61e9f473b937134a3c4c02df 2024-05-08 00:06:17             0.3
2   2  61e9f469b937134a3c4c029b 2024-05-08 00:10:02             0.3
3   3  61e9f45bb937134a3c4c0221 2024-05-08 00:10:34             0.3
4   4  61e9f38eb937134a3c4bfd8d 2024-05-08 00:12:27             0.3


In [None]:
# Inspect the per-vessel last rows that the test rows are joined against.
print(last_rows)

                   time    cog   sog  rot  heading  navstat  latitude  \
0   2024-05-05 13:25:50   40.0   0.0    1      328        1  36.84686   
1   2024-05-07 23:51:29  291.0   0.3    0      275        2  48.53320   
2   2024-05-07 21:53:14  129.3  15.6    0      128        0  28.82071   
3   2024-05-05 04:33:43  303.7  10.0   -2      303        0  33.86918   
4   2024-05-07 23:42:38   38.9  15.5    3       40        0  33.18068   
..                  ...    ...   ...  ...      ...      ...       ...   
683 2024-05-07 20:04:21   45.8   0.0    0      150        5 -34.06421   
684 2024-05-07 23:59:04   27.6  16.0    0       28        0  54.71940   
685 2024-05-01 11:34:29  182.0  15.0    0      182        0  33.96570   
686 2024-05-07 23:54:24   51.0   0.0    0       51        5  51.35306   
687 2024-05-07 23:59:01   53.6  17.7   -1       51        0  59.89167   

     longitude                   vesselId                    portId  ...  \
0      5.84638   61e9f38eb937134a3c4bfd8b  61d3

In [None]:
# Join every test row with its vessel's last known row and add the elapsed-time columns.
# NOTE: rebinds test_df, so re-running this cell alone operates on the already merged frame.
test_df=calculate_time_diff_with_last_rows(test_df,last_rows)

Index(['ID', 'vesselId', 'time', 'time_last', 'cog', 'sog', 'rot', 'heading',
       'navstat', 'latitude', 'longitude', 'portId', 'etaParsed', 'UN_LOCODE',
       'ISO', 'portLongitude', 'portLatitude', 'shippingLineId', 'CEU', 'DWT',
       'GT', 'NT', 'vesselType', 'breadth', 'depth', 'draft', 'enginePower',
       'freshWater', 'fuel', 'homePort', 'length', 'maxHeight', 'maxSpeed',
       'maxWidth', 'rampCapacity', 'yearBuilt'],
      dtype='object')


In [None]:
# List the columns of the merged test frame.
print(test_df.columns)

Index(['ID', 'vesselId', 'time', 'cog', 'sog', 'rot', 'heading', 'navstat',
       'latitude', 'longitude', 'portId', 'etaParsed', 'UN_LOCODE', 'ISO',
       'portLongitude', 'portLatitude', 'shippingLineId', 'CEU', 'DWT', 'GT',
       'NT', 'vesselType', 'breadth', 'depth', 'draft', 'enginePower',
       'freshWater', 'fuel', 'homePort', 'length', 'maxHeight', 'maxSpeed',
       'maxWidth', 'rampCapacity', 'yearBuilt', 'diff_seconds', 'diff_minutes',
       'diff_hours', 'diff_days'],
      dtype='object')


In [None]:
# Apply the same feature functions to the test frame as were applied to the training frame.
feature_engineer = FeatureEngineer(test_df)
feature_engineer.apply_features(feature_engineering_functions)
test_df = feature_engineer.get_dataframe()

# Mirror the training preparation: timestamp as index, then keep only bool/float/int columns.
test_df.set_index("time", inplace=True)
test_df = remove_non_numeric_columns(test_df)



In [None]:
# Preview the prepared test features.
print(test_df.head())

                     ID    cog   sog  rot  heading  navstat  latitude  \
time                                                                    
2024-05-08 00:03:16   0  179.6   0.0    0      344        5  31.14647   
2024-05-08 00:06:17   1   24.7   0.0    0      214        5  14.81694   
2024-05-08 00:10:02   2    8.0  18.7    0        6        0  38.27895   
2024-05-08 00:10:34   3  321.3   0.1    0       70        1 -43.53785   
2024-05-08 00:12:27   4  291.0   0.3    0      275        2  48.53320   

                     longitude  portLongitude  portLatitude  ...  maxSpeed  \
time                                                         ...             
2024-05-08 00:03:16  -81.49789     -81.496667     31.140556  ...       NaN   
2024-05-08 00:06:17  120.29625     120.279444     14.808333  ...       NaN   
2024-05-08 00:10:02   10.78280      11.780833     42.098889  ...       NaN   
2024-05-08 00:10:34  172.83522     172.716111    -43.606111  ...      22.2   
2024-05-08 00:12:27 

In [None]:
def make_predictions(model, input_df, feature_columns=None):
    """Predict a position for every row of ``input_df`` and pair it with the row's ID.

    Parameters
    ----------
    model :
        Trained booster; its ``predict`` is called on an ``xgb.DMatrix`` and is
        expected to return two values per row (latitude, longitude).
    input_df : pandas.DataFrame
        Prepared test frame containing an ``ID`` column and the feature columns.
    feature_columns : sequence of str, optional
        Feature columns to pass to the model, in that order. Defaults to the
        notebook-level ``X_train_cols`` so existing calls behave as before;
        passing it explicitly removes the dependency on that global.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID``, ``latitude``, ``longitude`` with a fresh 0..n-1 index.
    """
    if feature_columns is None:
        feature_columns = X_train_cols

    # Drop 'ID' first so it can never be used as a feature, then select the
    # training columns in training order.
    features_df = input_df.drop(columns=['ID'])[list(feature_columns)]

    predictions = model.predict(xgb.DMatrix(features_df))

    # The first output column is the predicted latitude, the second the longitude.
    predictions_df = pd.DataFrame(predictions, columns=['latitude', 'longitude'])

    # Both parts carry a 0..n-1 index, so the column-wise concat lines up row by row.
    ids = input_df['ID'].reset_index(drop=True)
    result_df = pd.concat([ids, predictions_df], axis=1)

    return result_df

In [None]:
# Predict a latitude/longitude for every prepared test row.
result_df = make_predictions(model, test_df)

In [None]:
# Write the predictions to result_simple.csv inside RESOURCE_FOLDER, without the index column.
result_df.to_csv(RESOURCE_FOLDER+"/result_simple.csv", index=False)