In [19]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score


In [10]:
# The resampled series are regular in time; before they can be reframed as a
# supervised problem the non-numeric fields have to be encoded.
# Only the hourly file is handled in this cell; the matching steps for the
# 20-minute file (resampled_data_20min.csv) are currently disabled.

pretty_h = pd.read_csv("../Project materials(1)/resampled_data_h.csv")

# Parse both timestamp columns into pandas datetimes.
pretty_h["etaParsed"] = pd.to_datetime(pretty_h["etaParsed"])
pretty_h["time"] = pd.to_datetime(pretty_h["time"])

# Express the ETA as a number of whole days elapsed since a fixed reference date.
start_date = pd.to_datetime('2024-01-01')
pretty_h["etaParsed"] = pretty_h["etaParsed"].sub(start_date).dt.days

# Turn navstat into a categorical whose categories are the observed values,
# in order of first appearance.
navstat_unique = pretty_h["navstat"].unique()
pretty_h["navstat"] = pd.Categorical(
    pretty_h["navstat"],
    categories=navstat_unique,
    ordered=True,
)

# One-hot encode navstat (the first category is dropped and acts as the
# baseline), then store the indicator columns as integers.
pretty_h = pd.get_dummies(pretty_h, columns=["navstat"], drop_first=True)
navstat_cols = list(pretty_h.columns[pretty_h.columns.str.startswith("navstat")])
pretty_h[navstat_cols] = pretty_h[navstat_cols].astype(int)

# Confirm that "time" was parsed to timestamps before it becomes the index.
print(type(pretty_h["time"].iat[1]))
pretty_h = pretty_h.set_index("time")



<class 'pandas._libs.tslibs.timestamps.Timestamp'>


In [11]:
# Sanity check: element type of the encoded ETA column
# (numpy.float64 in the recorded run).
print(type(pretty_h["etaParsed"].iat[1]))

<class 'numpy.float64'>


In [12]:
#Make time series into supervised problem
def make_supervised(df, forecast_columns, sorting_column, input_window=1, output_window=1):
    """
    Converts a multivariate time series dataframe into a supervised learning problem.

    The frame is split into one sub-series per distinct value of
    ``sorting_column`` and every sub-series is shifted on its own, so lagged
    inputs and future targets are never taken from a different series.
    Shifting is positional: rows are assumed to already be in chronological
    order (and evenly spaced) within each sub-series.

    Parameters:
    df (pd.DataFrame): The original dataframe with time series data.
    forecast_columns (list): A list of column names to forecast.
    sorting_column (str): Column identifying which series a row belongs to.
        Rows whose value in this column is missing are not part of any series
        and therefore never appear in the result.
    input_window (int): The number of past observations to use as features.
    output_window (int): The number of steps to forecast into the future.

    Returns:
    pd.DataFrame: A new dataframe with supervised learning format. For every
    feature column ``c`` it holds ``c_t-<input_window>`` ... ``c_t-1`` and
    ``c_t``; for every forecast column ``f`` it holds
    ``f_t+<output_window>`` ... ``f_t+1``. Rows containing any NaN (window
    edges as well as missing source values) are dropped. An empty dataframe
    is returned when there is nothing to build.
    """
    # Identifier-like columns that are never turned into features.
    forbidden_cols = ["vesselId", "UN_LOCODE", "ISO", "portId"]
    feature_cols = [col for col in df.columns if col not in forbidden_cols]

    per_series_frames = []

    # One pass over the data; sort=False keeps the series in order of first
    # appearance and groupby keeps the original row order inside each series.
    for _, sort_df in df.groupby(sorting_column, sort=False):
        # Column name -> shifted Series. A dict keeps insertion order and lets
        # a repeated name overwrite the earlier entry instead of duplicating it.
        shifted = {}

        # Input features: oldest lag first, then the current observation.
        for col in feature_cols:
            for i in range(input_window, 0, -1):
                shifted[f"{col}_t-{i}"] = sort_df[col].shift(i)
            shifted[f"{col}_t"] = sort_df[col]

        # Forecast targets: forward shifts, furthest horizon first.
        for col in forecast_columns:
            for j in range(output_window, 0, -1):
                shifted[f"{col}_t+{j}"] = sort_df[col].shift(-j)

        if shifted:
            # Assemble all columns at once instead of inserting them one by one.
            per_series_frames.append(
                pd.concat(list(shifted.values()), axis=1, keys=list(shifted.keys()))
            )

    if not per_series_frames:
        # pd.concat refuses an empty list; mirror the "nothing to do" result.
        return pd.DataFrame()

    # Single concatenation of all series, then remove the rows with NaN
    # values caused by the shifting process.
    return pd.concat(per_series_frames).dropna()

# Build the supervised table one vessel at a time: three lagged steps as
# inputs and the next two latitude/longitude values as targets.
supervised_h = make_supervised(
    pretty_h,
    forecast_columns=["latitude", "longitude"],
    sorting_column="vesselId",
    input_window=3,
    output_window=2,
)

supervised_h.columns

Index(['cog_t-3', 'cog_t-2', 'cog_t-1', 'cog_t', 'sog_t-3', 'sog_t-2',
       'sog_t-1', 'sog_t', 'rot_t-3', 'rot_t-2', 'rot_t-1', 'rot_t',
       'heading_t-3', 'heading_t-2', 'heading_t-1', 'heading_t',
       'latitude_t-3', 'latitude_t-2', 'latitude_t-1', 'latitude_t',
       'longitude_t-3', 'longitude_t-2', 'longitude_t-1', 'longitude_t',
       'etaParsed_t-3', 'etaParsed_t-2', 'etaParsed_t-1', 'etaParsed_t',
       'portLongitude_t-3', 'portLongitude_t-2', 'portLongitude_t-1',
       'portLongitude_t', 'portLatitude_t-3', 'portLatitude_t-2',
       'portLatitude_t-1', 'portLatitude_t', 'navstat_5.0_t-3',
       'navstat_5.0_t-2', 'navstat_5.0_t-1', 'navstat_5.0_t',
       'navstat_1.0_t-3', 'navstat_1.0_t-2', 'navstat_1.0_t-1',
       'navstat_1.0_t', 'navstat_8.0_t-3', 'navstat_8.0_t-2',
       'navstat_8.0_t-1', 'navstat_8.0_t', 'navstat_2.0_t-3',
       'navstat_2.0_t-2', 'navstat_2.0_t-1', 'navstat_2.0_t',
       'navstat_3.0_t-3', 'navstat_3.0_t-2', 'navstat_3.0_t-1',
    

In [13]:
# Order the rows by their time index (ascending is the sort_index default).
supervised_h = supervised_h.sort_index()

#Sorting columns
def sort_columns(df):
    """
    Reorder the columns of a supervised-learning frame by time offset.

    Column names are expected to end in ``_t-<k>`` (lag), ``_t`` (current
    step, treated as offset 0) or ``_t+<k>`` (future step). Columns are
    arranged from the most negative to the most positive offset; columns that
    share an offset keep their original relative order. Columns without such
    a suffix are kept and placed first, in their original order.

    Parameters:
    df (pd.DataFrame): Frame whose columns should be reordered.

    Returns:
    pd.DataFrame: The same data with the columns reordered.
    """
    # Digits are required after the optional sign, so a dangling "_t-" is
    # treated as "no suffix" rather than fed to int().
    suffix_pattern = re.compile(r"_t([+-]?\d+)?$")

    def sort_key(col):
        match = suffix_pattern.search(str(col))
        if match is None:
            # No time suffix at all: sorts ahead of every offset column.
            return (0, 0)
        # A bare "_t" has no captured number and counts as offset 0.
        return (1, int(match.group(1) or 0))

    # sorted() is stable, which preserves the feature order within an offset.
    return df[sorted(df.columns, key=sort_key)]

# Arrange the columns from the oldest lag to the furthest forecast step.
supervised_h = supervised_h.pipe(sort_columns)


In [14]:
# Show the final column order followed by the (rows, columns) shape.
print(supervised_h.columns, supervised_h.shape, sep="\n")

def train_test_split(df, perc1, perc2, output_window, targets=("longitude", "latitude")):
    """
    Split a supervised-learning frame into train / validation / test parts by row position.

    The rows are cut in the order in which they appear in ``df`` (no
    shuffling): the first ``perc1`` share becomes the training set, the rows
    between ``perc1`` and ``perc2`` the validation set and the remainder the
    test set.

    Parameters:
    df (pd.DataFrame): Frame holding both feature and target columns.
    perc1 (float): Fraction of rows (0..1) at which the training set ends.
    perc2 (float): Fraction of rows (0..1) at which the validation set ends;
        must not be smaller than ``perc1``.
    output_window (int): Number of forecast steps; the target columns are
        ``<target>_t+1`` ... ``<target>_t+<output_window>``.
    targets (sequence of str): Base names of the target columns.

    Returns:
    tuple: ``(X_train, y_train, X_val, y_val, X_test, y_test)``.

    Raises:
    ValueError: If the fractions are not ordered as 0 <= perc1 <= perc2 <= 1
        (otherwise the training and test rows could overlap).
    KeyError: If an expected target column is missing from ``df``.
    """
    if not 0 <= perc1 <= perc2 <= 1:
        raise ValueError(
            f"expected 0 <= perc1 <= perc2 <= 1, got perc1={perc1}, perc2={perc2}"
        )

    # Step-major order: every target for t+1, then every target for t+2, ...
    y_list = [
        f"{target}_t+{step}"
        for step in range(1, output_window + 1)
        for target in targets
    ]
    ys = df[y_list]
    Xs = df.drop(columns=y_list)

    # Compute both cut points once so features and targets are sliced identically.
    n_rows = Xs.shape[0]
    train_end = int(np.round(n_rows * perc1))
    val_end = int(np.round(n_rows * perc2))

    X_train, y_train = Xs.iloc[:train_end, :], ys.iloc[:train_end, :]
    X_val, y_val = Xs.iloc[train_end:val_end, :], ys.iloc[train_end:val_end, :]
    X_test, y_test = Xs.iloc[val_end:, :], ys.iloc[val_end:, :]

    return X_train, y_train, X_val, y_val, X_test, y_test

# Row-order split: first 75% train, next 10% validation, last 15% test,
# with targets for two forecast steps.
X_train, y_train, X_val, y_val, X_test, y_test = train_test_split(
    supervised_h,
    perc1=0.75,
    perc2=0.85,
    output_window=2,
)

Index(['cog_t-3', 'sog_t-3', 'rot_t-3', 'heading_t-3', 'latitude_t-3',
       'longitude_t-3', 'etaParsed_t-3', 'portLongitude_t-3',
       'portLatitude_t-3', 'navstat_5.0_t-3', 'navstat_1.0_t-3',
       'navstat_8.0_t-3', 'navstat_2.0_t-3', 'navstat_3.0_t-3',
       'navstat_15.0_t-3', 'navstat_4.0_t-3', 'navstat_14.0_t-3',
       'navstat_11.0_t-3', 'navstat_12.0_t-3', 'navstat_6.0_t-3',
       'navstat_7.0_t-3', 'navstat_13.0_t-3', 'navstat_9.0_t-3', 'cog_t-2',
       'sog_t-2', 'rot_t-2', 'heading_t-2', 'latitude_t-2', 'longitude_t-2',
       'etaParsed_t-2', 'portLongitude_t-2', 'portLatitude_t-2',
       'navstat_5.0_t-2', 'navstat_1.0_t-2', 'navstat_8.0_t-2',
       'navstat_2.0_t-2', 'navstat_3.0_t-2', 'navstat_15.0_t-2',
       'navstat_4.0_t-2', 'navstat_14.0_t-2', 'navstat_11.0_t-2',
       'navstat_12.0_t-2', 'navstat_6.0_t-2', 'navstat_7.0_t-2',
       'navstat_13.0_t-2', 'navstat_9.0_t-2', 'cog_t-1', 'sog_t-1', 'rot_t-1',
       'heading_t-1', 'latitude_t-1', 'longitude_

In [15]:
# Sanity check: element type of a lagged navstat indicator column
# (numpy.float64 in the recorded run).
print(type(X_train["navstat_7.0_t-3"].iat[1]))

<class 'numpy.float64'>


In [16]:
# Fit XGBoost's random-forest regressor with its default settings on the
# training split, then predict every target column for the validation split.
model = xgb.XGBRFRegressor().fit(X_train, y_train)
preds = model.predict(X_val)

In [None]:
# Squared validation errors, computed once and reused for MSE and RMSE.
squared_error = np.square(np.subtract(y_val, preds))
mse = squared_error.mean()

# NOTE(review): in the recorded output MAE and R2 are single numbers, while
# MSE and RMSE are one value per target column (a pandas Series).
results = {}
results["MAE"] = mean_absolute_error(y_val, preds)
results["MSE"] = mse
results["R2 Score"] = r2_score(y_val, preds)
results["RMSE"] = np.sqrt(mse)

# Print each metric on its own line.
for metric, value in results.items():
    print(f"{metric}: {value}")

MAE: 0.8465450034427957
MSE: longitude_t+1    21.327158
latitude_t+1      1.989872
longitude_t+2    38.661797
latitude_t+2      3.836467
dtype: float64
R2 Score: 0.9958446838366094
RMSE: longitude_t+1    4.618134
latitude_t+1     1.410628
longitude_t+2    6.217861
latitude_t+2     1.958690
dtype: float64
