In [1]:
%load_ext autoreload

In [None]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from warnings import simplefilter
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit

import loadBar
from csv_parser import CSVParser
from globals import RESOURCE_FOLDER, STEPSIZES, OUTPUT_WINDOW, INPUT_WINDOW, OUTPUT_FORECAST, DELETEABLE_COLUMNS, ONE_HOT_COLUMNS
from markovSquares import apply_markov
from feature_engineer import FeatureEngineer
from exploring_data_functions import *

from searoutePointFinder import fill_with_proximity


In [None]:
parser = CSVParser(RESOURCE_FOLDER)
index_data = parser.retrieve_training_data()



In [None]:

index_data.set_index("time", inplace=True)

In [None]:
def resampler(df, sorting_column, freq):
    unique_ids = df[sorting_column].unique()
    final_df = pd.DataFrame()
    partial_list = []

    for i in range(len(unique_ids)):
        loadBar.load_bar(len(unique_ids),i+1)
        resample_partial = df[df[sorting_column] == unique_ids[i]].resample(freq).last()

        resample_partial = fill_with_proximity(resample_partial)
        partial_list.append(resample_partial)

    for chunk in partial_list:
        final_df = pd.concat([final_df,chunk])
    
    return final_df

print(index_data)
resampled_data_20min = resampler(index_data, "vesselId", "20min")

resampled_data_20min.to_csv("../../Project materials(1)/data_resampled_20min.csv")

In [None]:
total_df = pd.read_csv("../../Project materials(1)/data_resampled_20min.csv")


# total_df = pd.read_csv(RESOURCE_FOLDER+"/resampled_data_h.csv")

total_df['etaParsed'] = pd.to_datetime(total_df['etaParsed'])
total_df["time"] = pd.to_datetime(total_df['time'])

start_date = pd.to_datetime('2024-01-01')

total_df["etaParsed"] = (total_df['etaParsed'] - start_date).dt.days

In [None]:

time_diffs = total_df["time"].diff()
time_interval = time_diffs.dropna().iloc[0]
time_interval = int(time_interval.total_seconds()/(60*20))



total_df.set_index("time", inplace=True)

In [None]:
total_df.to_csv(RESOURCE_FOLDER+"/data_resampled_20min_markov.csv")

In [None]:
print(total_df.columns)

In [8]:
total_df = pd.read_csv(RESOURCE_FOLDER+"/data_resampled_20min.csv")
total_df["time"] = pd.to_datetime(total_df['time'])

0         2024-01-01 00:00:00
1         2024-01-01 00:20:00
2         2024-01-01 00:40:00
3         2024-01-01 01:00:00
4         2024-01-01 01:20:00
                  ...        
5899716   2024-05-04 04:00:00
5899717   2024-05-04 04:20:00
5899718   2024-05-04 04:40:00
5899719   2024-05-04 05:00:00
5899720   2024-05-04 05:20:00
Name: time, Length: 5899721, dtype: datetime64[ns]
0         2024-01-01 00:00:00
1         2024-01-01 00:20:00
2         2024-01-01 00:40:00
3         2024-01-01 01:00:00
4         2024-01-01 01:20:00
                  ...        
5899716   2024-05-04 04:00:00
5899717   2024-05-04 04:20:00
5899718   2024-05-04 04:40:00
5899719   2024-05-04 05:00:00
5899720   2024-05-04 05:20:00
Name: time, Length: 5899721, dtype: datetime64[ns]


In [None]:
feature_engineering_functions = [categorize_navstat,categorize_rot,numerize_UN_LOCODE, numerize_ISO, days_to_etaParsed] #,apply_markov


feature_engineer = FeatureEngineer(total_df)
feature_engineer.apply_features(feature_engineering_functions)
total_df = feature_engineer.get_dataframe()

total_df.set_index("time", inplace=True)

TypeError: cannot subtract DatetimeArray from ndarray

In [None]:

one_hot_columns = ONE_HOT_COLUMNS
total_df = pd.get_dummies(total_df, columns=one_hot_columns, drop_first=False)

In [None]:
#Make time series into supervised problem

# 1 = 20 minutes
# 3 = 1 hour
# 18 = 6 hours
# 72 = 24 hours
# 144 = 2 days
# 216 = 3 days
# 288 = 4 days
# 360 = 5 days



def make_supervised(df, forecast_columns, sorting_column, input_window=1, output_window=1):
    """
    Converts a multivariate time series dataframe into a supervised learning problem.
    
    Parameters:
    df (pd.DataFrame): The original dataframe with time series data.
    forecast_columns (list): A list of column names to forecast.
    input_window (int): The number of past observations to use as features.
    output_window (int): The number of steps to forecast into the future.
    
    Returns:
    pd.DataFrame: A new dataframe with supervised learning format.
    """
    

    df_new = pd.DataFrame()
    #Put in a for loop here where you iterate over all IDs, to make sure things get correct
    unique_sorts = df[sorting_column].unique()

    other_cols = [col for col in df.columns if col not in forecast_columns]
    
    #Iterate through all IDs
    print("Creating supervised data")
    for i, sorts in enumerate(unique_sorts):
        loadBar.load_bar(len(unique_sorts),i+1)
        df_supervised = pd.DataFrame()
        sort_df = df[df[sorting_column] == sorts]

        #Iterate through all columns for input features
        for col in forecast_columns: 
            for i in range(input_window, 0, -1):
                df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
            
            df_supervised[f"{col}_t"] = sort_df[col]
            

        # Create columns for forecast (target) with forward shift
        for col in forecast_columns:
            for j in range(output_window, 0,-1):
                df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)

        df_supervised = df_supervised.dropna()

        df_supervised[other_cols] = sort_df[other_cols]
        
        df_new = pd.concat([df_new, df_supervised])
    
    return df_new

# total_df = pd.read_csv("../../build_resources/data_resampled_20min_markov.csv")



In [None]:
total_df = make_supervised(total_df, OUTPUT_FORECAST, "vesselId", input_window=INPUT_WINDOW, output_window=OUTPUT_WINDOW)

Creating supervised data


In [None]:
total_df = total_df.drop(DELETEABLE_COLUMNS, axis=1)

In [None]:


#Sorting columns
def sort_columns(df):
    selected_columns = df.filter(regex=r'_t$|_t\+|_t-')
    non_selected_columns = df.drop(selected_columns.columns, axis=1)
    # Extract suffixes and assign _t as _t0
    columns_with_suffix = []
    for col in selected_columns.columns:
        match = re.search(r"_t([+-]?\d*)$", col)
        # If there's no number after _t, treat it as _t0
        suffix = int(match.group(1)) if match.group(1) else 0
        columns_with_suffix.append((col, suffix))
    
    # Sort by suffix value (ascending)
    sorted_t_columns = [col for col, _ in sorted(columns_with_suffix, key=lambda x: x[1])]
    
    # Reorder dataframe columns
    return df[sorted_t_columns+non_selected_columns.columns.tolist()]

total_df = total_df.dropna()
print(len(total_df))
total_df = total_df.sort_index(ascending = True)
total_df=sort_columns(total_df)

print(total_df)



5896285
                     latitude_t-4  longitude_t-4  cog_t-4  sog_t-4  rot_t-4  \
time                                                                          
2024-01-01 01:20:00     -34.74370      -57.85130    284.0      0.7      0.0   
2024-01-01 01:20:00      44.40593        8.88505     11.5      0.0      0.0   
2024-01-01 01:20:00      52.32413        2.10535    320.0      1.2      0.0   
2024-01-01 01:20:00      33.63707     -118.23330    162.8      9.9      0.0   
2024-01-01 01:20:00      53.57537        8.56014    271.5      0.0      0.0   
...                           ...            ...      ...      ...      ...   
2024-05-07 23:20:00      43.58091       10.30400    352.5      0.0      0.0   
2024-05-07 23:20:00       1.17829      103.75309    224.6      0.2      0.0   
2024-05-07 23:20:00      59.57721       21.54090    296.3     14.7      3.0   
2024-05-07 23:20:00      10.63965      106.76226    113.6      0.0      0.0   
2024-05-07 23:20:00      36.86324     -122.4

In [None]:
print(total_df.head()) 

                     latitude_t-4  longitude_t-4  cog_t-4  sog_t-4  rot_t-4  \
time                                                                          
2024-01-01 01:20:00     -34.74370      -57.85130    284.0      0.7      0.0   
2024-01-01 01:20:00      44.40593        8.88505     11.5      0.0      0.0   
2024-01-01 01:20:00      52.32413        2.10535    320.0      1.2      0.0   
2024-01-01 01:20:00      33.63707     -118.23330    162.8      9.9      0.0   
2024-01-01 01:20:00      53.57537        8.56014    271.5      0.0      0.0   

                     heading_t-4  latitude_t-3  longitude_t-3  cog_t-3  \
time                                                                     
2024-01-01 01:20:00         88.0    -34.627229     -57.966135    284.0   
2024-01-01 01:20:00        352.0     44.405910       8.885030     11.5   
2024-01-01 01:20:00        134.0     52.325250       2.109120     75.0   
2024-01-01 01:20:00        164.0     33.582630    -118.211500    161.6   
20

In [None]:


def train_test_split(df, perc1, perc2, output_window):
    y_list = []
    for j in range(output_window):
        for col in OUTPUT_FORECAST:
            y_list.append(f"{col}_t+{j+1}")
    ys = df[y_list]
    Xs = df.drop(columns = y_list)

    X_train = Xs.iloc[:int(np.round(Xs.shape[0]*perc1)),:]
    y_train = ys.iloc[:int(np.round(Xs.shape[0]*perc1)),:]
    X_val = Xs.iloc[int(np.round(Xs.shape[0]*perc1)):int(np.round(Xs.shape[0]*perc2)),:]
    y_val = ys.iloc[int(np.round(Xs.shape[0]*perc1)):int(np.round(Xs.shape[0]*perc2)),:]
    X_test = Xs.iloc[int(np.round(Xs.shape[0]*perc2)):,:]
    y_test = ys.iloc[int(np.round(Xs.shape[0]*perc2)):,:]

    return X_train, y_train, X_val, y_val, X_test, y_test



X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(total_df, 0.75, 0.85, OUTPUT_WINDOW)



In [None]:
def evaluate(stepsize, preds, y_val):
    print("/"+"-"*50+"\\")
    print("Evaluating model with stepsize", stepsize)

    results = {
        "MAE": mean_absolute_error(y_val, preds),
        "MSE": np.square(np.subtract(y_val,preds)).mean(),
        "R2 Score": r2_score(y_val, preds),
        "RMSE": np.sqrt(np.square(np.subtract(y_val,preds)).mean())
    }

    for metric, value in results.items():
        print(f"{metric}: {value}")
    print("\\"+"-"*50+"/")



In [None]:
# #Tuning params
# # We need to use XGB

# # Define model with a high num_boost_round
# model = xgb.XGBRFRegressor(
#     objective="reg:squarederror",
#     tree_method="hist",  # or "hist" if not using GPU
#     n_jobs=-1
# )

# # Define parameter grid
# param_grid = {
#     "max_depth": [3, 5, 7],
#     "learning_rate": [1],
#     "subsample": [0.6, 0.8, 1.0],
#     "colsample_bynode": [0.4, 0.6, 0.8],
#     "num_parallel_tree": [50, 100, 200]
# }

# # Use TimeSeriesSplit for time series cross-validation
# tscv = TimeSeriesSplit(n_splits=10)

# # Set early stopping and validation set in fit parameters
# fit_params = {
#     "eval_set": [(X_val, y_val)],  # Validation set to monitor performance
#     "verbose": 1
# }

# # RandomizedSearchCV with early stopping
# random_search = RandomizedSearchCV(
#     estimator=model,
#     param_distributions=param_grid,
#     n_iter=20,  # Number of sampled parameter combinations
#     scoring="neg_mean_squared_error",
#     cv=tscv,
#     verbose=1,
#     random_state=42
# )

# # Fit with early stopping
# random_search.fit(X_train, y_train, **fit_params)

# # Output best parameters and number of boosting rounds
# best_params = random_search.best_params_
# best_num_boost_round = model.get_booster().best_iteration  # Retrieve best boosting rounds
# print("Best parameters:", best_params)
# print("Best num_boost_round:", best_num_boost_round)


Fitting 10 folds for each of 20 candidates, totalling 200 fits


In [None]:

print(X_train)



dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
dtest_X = xgb.DMatrix(X_test)

params = {"objective": "reg:squarederror",
            "max_depth": 5,
            "booster": "gbtree",
            "tree-method": "gpu_hist",
            "col_sample_bynode": 0.5,
            "num_parallel_tree": 100,
            "subsample": 0.8,
            "learning_rate": 1,
            #"n_estimators": 100,
            #"reg_alpha": 0.1,
            #"reg_lambda": 0.1,
            #"n_jobs": -1,
            "verbosity": 1
            }

num_boost_round = 5

early_stopping_rounds = 2

print(dtrain)

model = xgb.train(params, dtrain, num_boost_round, evals=[(dval, "validation")], early_stopping_rounds=early_stopping_rounds, verbose_eval=True)


preds = model.predict(dtest_X)



                     latitude_t-4  longitude_t-4  cog_t-4  sog_t-4  rot_t-4  \
time                                                                          
2024-01-01 01:20:00    -34.743700     -57.851300    284.0      0.7      0.0   
2024-01-01 01:20:00     44.405930       8.885050     11.5      0.0      0.0   
2024-01-01 01:20:00     52.324130       2.105350    320.0      1.2      0.0   
2024-01-01 01:20:00     33.637070    -118.233300    162.8      9.9      0.0   
2024-01-01 01:20:00     53.575370       8.560140    271.5      0.0      0.0   
...                           ...            ...      ...      ...      ...   
2024-04-04 10:20:00     37.981977       8.333008    276.0     16.6      0.0   
2024-04-04 10:20:00    -33.567415      16.853843    322.1     15.6      0.0   
2024-04-04 10:20:00     40.276139     -10.943409    217.7     14.4      0.0   
2024-04-04 10:20:00    -26.249571       4.188977    204.6     18.3      0.0   
2024-04-04 10:20:00    -23.957620     -46.300340    

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:etaParsed: object, UN_LOCODE: object, ISO: object

In [None]:
def closest_n_min_mark(timestamp, n=1):
    timestamp = pd.to_datetime(timestamp)
    minutes = timestamp.minute
    closest_mark = round(minutes / (n*20)) * n*20
    if closest_mark == 60:
        rounded_timestamp = timestamp.replace(minute=0, second=0, microsecond=0) + pd.Timedelta(hours=1)
    else:
        rounded_timestamp = timestamp.replace(minute=closest_mark, second=0, microsecond=0)
    
    return rounded_timestamp

In [None]:
def shift_to_back(process_df):      
    for _, col in enumerate(OUTPUT_FORECAST):

        max_suffix_neg = 0
        max_suffix_pos = 0
        
        # Identify existing suffixes in the process_df for the current column
        while f"{col}_t-{max_suffix_neg+1}" in process_df.columns:
            max_suffix_neg += 1
        while f"{col}_t+{max_suffix_pos+1}" in process_df.columns:
            max_suffix_pos += 1
        for shift in range(max_suffix_neg - 1, -max_suffix_pos, -1):  # Start from max_suffix-1 down to 0
            if shift == 0:
                # Set the new predicted value as the most recent
                process_df[f"{col}_t"] = process_df[f"{col}_t+1"]
            elif shift == 1:
                # Shift the column
                process_df[f"{col}_t-{shift}"] = process_df[f"{col}_t"]
            elif shift > 1:
                # Shift the column
                process_df[f"{col}_t-{shift}"] = process_df[f"{col}_t-{shift - 1}"]
            else:
                process_df[f"{col}_t+{-shift}"] = process_df[f"{col}_t+{-shift + 1}"]

        for shift in range(1, max_suffix_pos+1):
            process_df = process_df.drop(columns=[f"{col}_t+{shift}"])
    
    return process_df


In [None]:
def predict_far_future(model, features, test_df,  forecast_columns):
    
    X_test = features.copy().iloc[-1:]
    preds = pd.DataFrame(columns=["vesselId", "approximate_time"])
    
    # Determine the furthest time in 20-minute intervals
    furthest_time = closest_n_min_mark(test_df["time"].max())
    current_time = closest_n_min_mark(X_test.index.max())
    
    # Generate the future time steps at 20-minute intervals
    future_steps = pd.date_range(start=current_time, end=furthest_time, freq='20min')
    
    for future_time in future_steps:
        y_pred = model.predict(xgb.DMatrix(X_test))

        new_row = pd.DataFrame({
            "vesselId": [test_df["vesselId"].iloc[0]],
            "approximate_time": [future_time]
        })
        for idx, col in enumerate(forecast_columns):
            new_row[f"{col}"] = y_pred[0, idx]  # Use the predicted value
        selected_columns = X_test.filter(regex=r'_t$|_t\+|_t-')
        non_selected_columns = X_test.drop(selected_columns.columns, axis=1)
        new_row[non_selected_columns.columns] = X_test[non_selected_columns.columns].iloc[0]
        
        
        preds = pd.concat([preds, new_row], ignore_index=True)
        
        # Update X_test for the next iteration
        for idx, col in enumerate(forecast_columns):

            max_suffix = 0
            
            # Identify existing suffixes in the X_test for the current column
            while f"{col}_t-{max_suffix+1}" in X_test.columns:
                max_suffix += 1
            for shift in range(max_suffix - 1, -1, -1):  # Start from max_suffix-1 down to 0
                if shift == 0:
                    # Set the new predicted value as the most recent
                    X_test[f"{col}_t"] = y_pred[0, idx]
                elif shift == 1:
                    # Shift the column
                    X_test[f"{col}_t-{shift}"] = X_test[f"{col}_t"]
                else:
                    # Shift the column
                    X_test[f"{col}_t-{shift}"] = X_test[f"{col}_t-{shift - 1}"]
    
    return preds


csv_parser = CSVParser(folderpath="../../Project materials(1)")

test_df = csv_parser.retrieve_test_data()




In [None]:
def preprocess(latest_features, feature_engineering_functions):

    feature_engineer = FeatureEngineer(latest_features)
    feature_engineer.apply_features(feature_engineering_functions)
    latest_features = feature_engineer.get_dataframe()
    latest_features = pd.get_dummies(latest_features, columns=ONE_HOT_COLUMNS, drop_first=False)
    
    latest_features = make_supervised(latest_features, OUTPUT_FORECAST, "vesselId" , INPUT_WINDOW, OUTPUT_WINDOW)
    latest_features = shift_to_back(latest_features)
    latest_features = latest_features.dropna()
    latest_features = latest_features.drop(DELETEABLE_COLUMNS, axis=1)
    latest_features = sort_columns(latest_features)

    
    
    
    return latest_features

In [None]:
def predict_times(model,total_df,test_df):
    unique_sorts = test_df["vesselId"].unique()
    preds_df = pd.DataFrame()
    result = pd.DataFrame()

    for sorts in unique_sorts:
        latest_features=total_df[total_df["vesselId"] == sorts]
        test_by_vessel_df = test_df[test_df["vesselId"] == sorts]

        latest_features = preprocess(latest_features, feature_engineering_functions)

        preds = predict_far_future(model, latest_features, test_by_vessel_df, OUTPUT_FORECAST)
        preds_df = pd.concat([preds_df, preds])
    
    for test in test_df.iterrows():
        test=pd.Series(test[1])
        new_row = pd.DataFrame()
        new_row=preds_df[
            (preds_df["vesselId"] == test["vesselId"]) & 
            (preds_df["approximate_time"] == closest_n_min_mark(test["time"]))
            ][["latitude", "longitude"]]
        new_row["ID"] = test["ID"]
        new_row["time"] = test["time"]
        
        result = pd.concat([result, new_row])
    result["latitude_predicted"] = result["latitude"]
    result["longitude_predicted"] = result["longitude"]

    return result[["ID","longitude_predicted","latitude_predicted"]]

print(test_df)
total_df = pd.read_csv("../../Project materials(1)/data_resampled_20min.csv")
total_df["time"] = pd.to_datetime(total_df['time'])
total_df.set_index("time", inplace=True)
print(total_df.head())
result_df = predict_times(model, total_df, test_df)
print(result_df)





In [None]:
#turn results into a csv file
result_df.to_csv("../../Project materials(1)/results.csv", index=False)

In [None]:
_ = plot_importance(model, height=0.9)

### First model:
Included navstat and etaParsed

Timewindow: (3,2)

MAE: 0.8521843262281953 

MSE: longitude_t+1    21.225563

latitude_t+1      1.993130

longitude_t+2    38.471488

latitude_t+2      3.840146

dtype: float64

R2 Score: 0.9958523607729776

RMSE: longitude_t+1    4.607121

latitude_t+1     1.411783

longitude_t+2    6.202539

latitude_t+2     1.959629

dtype: float64


### Second model:

Added cog, rot and heading to target features.

Timewindow: (3,2)

MAE: 7.198335594601071
MSE: latitude_t+1        1.980426
longitude_t+1      21.577318
cog_t+1          1820.208937
rot_t+1            92.532501
heading_t+1      1172.604934
latitude_t+2        3.813640
longitude_t+2      39.218475
cog_t+2          2370.325440
rot_t+2           107.991661
heading_t+2      1769.347459
dtype: float64
R2 Score: 0.8826565909012996
RMSE: latitude_t+1      1.407276
longitude_t+1     4.645139
cog_t+1          42.663907
rot_t+1           9.619382
heading_t+1      34.243320
latitude_t+2      1.952854
longitude_t+2     6.262466
cog_t+2          48.685988
rot_t+2          10.391904
heading_t+2      42.063612
dtype: float64
