In [3]:
%load_ext autoreload
%autoreload 2
import tensorflow as tf
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import datetime
import os
from feature_engineering.tensor_features import develop_features, floating_conv
from sklearn.preprocessing import StandardScaler
from data_handler import LocalToLargeDataLoader
from path_finder import path_sorter

In [5]:
data_loader = LocalToLargeDataLoader(print_progress=True)
parsed_data = data_loader.load_raw_data(path="../../resources")

Retrieving training data...


In [6]:
index_data = parsed_data.copy()
index_data.set_index("time", inplace=True)


In [7]:
import loadBar


def fill_with_proximity(df):
    # Forward fill first
    df_ffill = df.ffill()
    
    # Backward fill next
    df_bfill = df.bfill()
    # Create a mask for original missing values
    mask = df.isna()

    # Create a new DataFrame to hold the results
    filled_df = df.copy()

    # Iterate over each missing value
    for i in range(len(df)):
        if mask.iloc[i, 0]:  # Check if the value is missing
            # Get the current index (timestamp)
            current_time = df.index[i]

            # Get the last known value index (timestamp)
            last_known_index = df_ffill.first_valid_index()
            if last_known_index is not None:
                distance_to_ffill = (current_time - last_known_index).total_seconds()  # distance to last known

            # Get the next known value index (timestamp)
            next_known_index = df_bfill.first_valid_index()
            if next_known_index is not None:
                distance_to_bfill = (next_known_index - current_time).total_seconds()  # distance to next known

            # Fill based on proximity
            if (last_known_index is not None and distance_to_ffill < distance_to_bfill) or next_known_index is None:
                filled_df.iloc[i] = df_ffill.iloc[i]
            else:
                filled_df.iloc[i] = df_bfill.iloc[i]

    return filled_df



def resampler(df, sorting_column, freq):
    unique_ids = df[sorting_column].unique()
    final_df = pd.DataFrame()
    partial_list = []

    for i in range(len(unique_ids)):
        loadBar.load_bar(len(unique_ids),i+1)
        resample_partial = df[df[sorting_column] == unique_ids[i]].resample(freq).last()
        resample_partial = fill_with_proximity(resample_partial)
        partial_list.append(resample_partial)

    for chunk in partial_list:
        final_df = pd.concat([final_df,chunk])
    
    return final_df

resampled_data_h = resampler(index_data, "vesselId", "h")
resampled_data_20min = resampler(index_data, "vesselId", "20min")

resampled_data_h.to_csv('resampled_data_h.csv')
resampled_data_20min.to_csv('resampled_data_20min.csv')


[--------------------] 1.01% complete

KeyboardInterrupt: 

In [8]:
#We have two resampled and therefore regular datasets, now we need to make our time series into a supervised problem.
#First I need to change navstat into a categorical feature:


# Define categories based on ranges or discrete values
pretty_20m = pd.read_csv("../../resources/resampled_data_20min.csv")

pretty_h = pd.read_csv("../../resources/resampled_data_h.csv")

navstat_unique = pretty_h["navstat"].unique()

pretty_20m["navstat"] = pd.Categorical(pretty_20m["navstat"], categories=navstat_unique, ordered=True)

pretty_h["navstat"] = pd.Categorical(pretty_h["navstat"], categories=navstat_unique, ordered=True)


# Let's make dummys 

pretty_h = pd.get_dummies(pretty_h, columns=["navstat"], drop_first=True)
pretty_20m = pd.get_dummies(pretty_20m, columns = ["navstat"], drop_first=True)

pretty_20m.set_index("time", inplace=True)
pretty_h.set_index("time", inplace=True)


In [9]:
print(pretty_h.head())
print(pretty_h.shape)

                       cog   sog  rot  heading  latitude  longitude  \
time                                                                  
2024-01-01 00:00:00  284.0   0.7  0.0     88.0 -34.74370   -57.8513   
2024-01-01 01:00:00   88.2  14.3  0.0     86.0 -35.16805   -56.5319   
2024-01-01 02:00:00   88.2  14.3  0.0     86.0 -35.16805   -56.5319   
2024-01-01 03:00:00   88.2  14.3  0.0     86.0 -35.16805   -56.5319   
2024-01-01 04:00:00   88.2  14.3  0.0     86.0 -35.16805   -56.5319   

                                     vesselId                    portId  \
time                                                                      
2024-01-01 00:00:00  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
2024-01-01 01:00:00  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
2024-01-01 02:00:00  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
2024-01-01 03:00:00  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
2024-01-01 04:00:00  61e9f3a8b937134a3c4bfdf7  61d37

In [11]:
#Make time series into supervised problem
def make_supervised(df, forecast_columns, sorting_column, input_window=1, output_window=1):
    """
    Converts a multivariate time series dataframe into a supervised learning problem.
    
    Parameters:
    df (pd.DataFrame): The original dataframe with time series data.
    forecast_columns (list): A list of column names to forecast.
    input_window (int): The number of past observations to use as features.
    output_window (int): The number of steps to forecast into the future.
    
    Returns:
    pd.DataFrame: A new dataframe with supervised learning format.
    """
    # Create a dataframe to hold the transformed features
    df_supervised = pd.DataFrame()

    #Put in a for loop here where you iterate over all IDs, to make sure things get correct
    unique_sorts = df[sorting_column].unique()
    
    #Iterate through all IDs
    for sorts in unique_sorts:
        sort_df = df[df[sorting_column] == sorts]

        #Iterate through all columns for input features
        for col in sort_df.columns: 
            for i in range(input_window, 0, -1):
                df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
            df_supervised[f"{col}_t"] = sort_df[col]

    # Create columns for forecast (target) with forward shift
        for col in forecast_columns:
            for j in range(output_window, 0, -1):
                df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)
        

    # Remove rows with NaN values caused by the shifting process
    df_supervised.dropna(inplace=True)
    
    return df_supervised

supervised_h = make_supervised(pretty_h, ["latitude", "longitude"],"vesselId" , 3, 2)

supervised_h.columns

  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t"] = sort_df[col]
  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t-{i}"] = sort_df[col].shift(i)
  df_supervised[f"{col}_t"] = sort_df[col]
  df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)
  df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)
  df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)
  df_supervised[f"{col}_t+{j}"] = sort_df[col].shift(-j)


KeyboardInterrupt: 

In [None]:
print([col for col in supervised_h.columns])

NameError: name 'supervised_h' is not defined