In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.interpolate import interp1d
from tqdm import tqdm
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.filterwarnings('ignore')

In [29]:
df = pd.read_csv('/content/modified_ais_data.csv',nrows=30000)
df.head()

Unnamed: 0,BaseDateTime,MMSI,IMO,LAT,LON,SOG,COG
0,2022-03-31T00:00:01,671226100,IMO9221322,25.77626,-80.2032,3.2,143.7
1,2022-03-31T00:00:06,367452810,IMO9602344,29.32824,-94.77391,2.6,319.2
2,2022-03-31T00:00:09,366919770,IMO9253583,48.74428,-122.49504,0.0,210.3
3,2022-03-31T00:00:00,311000966,IMO8916607,18.55833,-66.4791,17.5,274.5
4,2022-03-31T00:00:01,219028420,IMO9411305,28.66703,-93.59339,9.5,129.4


In [30]:
df.shape

(30000, 7)

In [31]:
import geopandas as gpd
import pandas as pd
import numpy as np
from tqdm import tqdm

def sliding_window_segmentation(data, window_size, step_size, mmsi):
    segments = []
    for start in range(0, len(data) - window_size + 1, step_size):
        window = data.iloc[start:start + window_size].copy()
        window['MMSI'] = mmsi
        segments.append(window)
    return segments

def lat_lon_range(df_file):
    min_lat, max_lat = df_file['LAT'].min(), df_file['LAT'].max()
    min_lon, max_lon = df_file['LON'].min(), df_file['LON'].max()
    min_lat, max_lat, min_lon, max_lon = (
        int(np.floor(min_lat)),
        int(np.ceil(max_lat)),
        int(np.floor(min_lon)),
        int(np.ceil(max_lon)),
    )
    return min_lat, max_lat, min_lon, max_lon

def prepare_ais_data_without_interpolation(
    data, bounding_box, window_size, step_size
):
    min_lat, max_lat, min_lon, max_lon = bounding_box

    filtered_data = data[
        (data['LAT'] >= min_lat) & (data['LAT'] <= max_lat) &
        (data['LON'] >= min_lon) & (data['LON'] <= max_lon)
    ]

    grouped = filtered_data.groupby('MMSI')
    segments = []
    segment_id = 0

    for mmsi, group in tqdm(grouped, desc="Processing MMSI", unit="MMSI"):
        group = group.sort_values(by='BaseDateTime')
        group['BaseDateTime'] = pd.to_datetime(group['BaseDateTime'])

        # Directly use the existing data without interpolation
        resampled_traj = group[['BaseDateTime', 'LAT', 'LON', 'SOG', 'COG']].copy()

        trajectory_segments = sliding_window_segmentation(resampled_traj, window_size, step_size, mmsi)

        for segment in trajectory_segments:
            segment['SegmentID'] = segment_id
            segments.append(segment)
            segment_id += 1

    combined_df = pd.concat(segments, ignore_index=True)
    return combined_df

# Example usage
if __name__ == "__main__":

    # Define bounding box from the dataset
    min_lat, max_lat, min_lon, max_lon = lat_lon_range(df)
    bounding_box = (min_lat, max_lat, min_lon, max_lon)

    window_size = 10
    step_size = 5

    segmented_trajectories_df = prepare_ais_data_without_interpolation(
        df, bounding_box, window_size, step_size
    )

    segmented_trajectories_df.to_csv("segmented_trajectories.csv", index=False)


Processing MMSI: 100%|██████████| 2089/2089 [00:08<00:00, 247.52MMSI/s]


In [24]:
segmented_trajectories_df.head()

Unnamed: 0,BaseDateTime,LAT,LON,SOG,COG,MMSI,SegmentID
0,2022-03-31 00:02:32,27.35372,-94.62546,0.4,228.6,111,0
1,2022-03-31 00:05:35,27.35372,-94.6255,0.6,219.8,111,0
2,2022-03-31 00:08:34,27.35377,-94.62556,0.2,221.7,111,0
3,2022-03-31 00:11:31,27.3538,-94.62557,0.3,105.0,111,0
4,2022-03-31 00:14:33,27.35365,-94.62542,0.3,173.4,111,0


In [32]:
segmented_trajectories_df.shape

(33190, 7)

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from numba import jit

# Add temporal features
def add_temporal_features(data, time_col):
    data[time_col] = pd.to_datetime(data[time_col])
    data['Hour'] = data[time_col].dt.hour
    data['Minute'] = data[time_col].dt.minute
    data['Second'] = data[time_col].dt.second
    data['TimeDelta'] = data[time_col].diff().dt.total_seconds().fillna(0)
    return data

# Filter segments with insufficient rows
def filter_short_segments(data, window_size, prediction_horizon, group_col='SegmentID'):
    min_required_rows = window_size + prediction_horizon
    filtered_data = data.groupby(group_col).filter(lambda x: len(x) >= min_required_rows)
    print(f"Filtered data contains {len(filtered_data)} rows after removing short segments.")
    return filtered_data

# Numba-optimized sequence preparation (works with NumPy arrays)
@jit(nopython=True, parallel=True)
def prepare_sequences_numba(data_array, group_indices, window_size, prediction_horizon):
    sequences = []
    for start_idx, end_idx in group_indices:
        num_rows = end_idx - start_idx
        if num_rows < window_size + prediction_horizon:
            continue
        for start in range(num_rows - window_size - prediction_horizon + 1):
            x_window = data_array[start_idx + start:start_idx + start + window_size]
            y_window = data_array[start_idx + start + window_size:start_idx + start + window_size + prediction_horizon]
            sequences.append((x_window, y_window))
    return sequences

# Sequence preparation (non-Numba, handles dictionary creation)
def prepare_sequences(data, window_size, prediction_horizon, group_col='SegmentID'):
    unique_segments = data[group_col].unique()
    data_array = data[['LAT', 'LON', 'SOG', 'Hour', 'Minute', 'Second', 'TimeDelta']].to_numpy()
    group_indices = []
    for segment in unique_segments:
        segment_data = np.where(data[group_col] == segment)[0]
        group_indices.append((segment_data[0], segment_data[-1] + 1))

    sequences = prepare_sequences_numba(data_array, group_indices, window_size, prediction_horizon)

    # Return raw sequences (without dictionary creation)
    return sequences

# Numba-optimized formatting (works with NumPy arrays)
@jit(nopython=True, parallel=True)
def format_sequences_numba(sequences, input_features_idx, output_features_idx, predicted_seq_len):
    num_sequences = len(sequences)
    num_input_features = len(input_features_idx)
    num_output_features = len(output_features_idx)

    encoder_inputs = np.zeros((num_sequences, len(sequences[0][0]), num_input_features))
    decoder_inputs = np.zeros((num_sequences, predicted_seq_len, num_input_features))
    targets = np.zeros((num_sequences, predicted_seq_len, num_output_features))

    for i in range(num_sequences):
        x_data, y_data = sequences[i]

        # Manually index into x_data and y_data using loops
        for j in range(num_input_features):
            encoder_inputs[i, :, j] = x_data[:, input_features_idx[j]]

        for j in range(num_input_features):
            decoder_inputs[i, :-1, j] = y_data[:-1, input_features_idx[j]]

        for j in range(num_output_features):
            targets[i, :, j] = y_data[:, output_features_idx[j]]

    return encoder_inputs, decoder_inputs, targets

# Sequence formatting (non-Numba, handles feature indexing and dictionary creation)
def format_sequences(sequences, input_features, output_features, predicted_seq_len):
    input_features_idx = [input_features.index(feat) for feat in input_features]
    output_features_idx = [input_features.index(feat) for feat in output_features]

    # Format sequences using Numba
    encoder_inputs, decoder_inputs, targets = format_sequences_numba(
        sequences, input_features_idx, output_features_idx, predicted_seq_len
    )

    # Return the arrays directly instead of creating a dictionary
    return encoder_inputs, decoder_inputs, targets

# Split data into train/val/test
def split_train_val_test(data, test_size=0.2, val_size=0.2):
    train_data, temp_data = train_test_split(data, test_size=test_size + val_size, random_state=42, shuffle=True)
    val_data, test_data = train_test_split(temp_data, test_size=test_size / (test_size + val_size), random_state=42, shuffle=True)
    return train_data, val_data, test_data

# Main pipeline for preparing data (no scaling)
def prepare_data_pipeline_no_scaling(data, window_size, prediction_horizon, test_size=0.2, val_size=0.32):
    """
    Prepares the data pipeline. Adds features, filters data, and prepares sequences.
    """
    # Add temporal features
    print("Adding temporal features...")
    data = add_temporal_features(data, time_col='BaseDateTime')

    # Filter segments
    print("Filter segments...")
    data = filter_short_segments(data, window_size, prediction_horizon, group_col='SegmentID')

    # Prepare sequences
    print("Prepare sequences...")
    sequences = prepare_sequences(data, window_size, prediction_horizon, group_col='SegmentID')

    # Split sequences into train, validation, and test
    print("Split sequences into train, validation, and test....")
    train_val_sequences, test_sequences = train_test_split(sequences, test_size=test_size, random_state=42)
    train_sequences, val_sequences = train_test_split(train_val_sequences, test_size=val_size, random_state=42)

    return train_sequences, val_sequences, test_sequences

# Parameters
window_size = 5
prediction_horizon = 2
test_size = 0.2
val_size = 0.32

# Define input and output features
input_features = ['LAT', 'LON', 'SOG', 'Hour', 'Minute', 'Second', 'TimeDelta']
output_features = ['LAT', 'LON', 'SOG']

# Sample dataframe (replace this with your actual dataframe)
# segmented_trajectories_df = pd.read_csv('your_data.csv')  # Replace with your actual data

# Prepare data pipeline
train_sequences, val_sequences, test_sequences = prepare_data_pipeline_no_scaling(
    segmented_trajectories_df, window_size, prediction_horizon, test_size, val_size
)

# Format data
train_enc_inputs, train_dec_inputs, train_targets = format_sequences(train_sequences, input_features, output_features, prediction_horizon)
val_enc_inputs, val_dec_inputs, val_targets = format_sequences(val_sequences, input_features, output_features, prediction_horizon)
test_enc_inputs, test_dec_inputs, test_targets = format_sequences(test_sequences, input_features, output_features, prediction_horizon)

# Print results
print("Training data shapes:")
print(f"Encoder inputs: {train_enc_inputs.shape}, Decoder inputs: {train_dec_inputs.shape}, Targets: {train_targets.shape}")


Adding temporal features...
Filter segments...
Filtered data contains 33190 rows after removing short segments.
Prepare sequences...
Split sequences into train, validation, and test....
Training data shapes:
Encoder inputs: (7221, 5, 7), Decoder inputs: (7221, 2, 7), Targets: (7221, 2, 3)


In [41]:
import tensorflow as tf
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Concatenate, Dropout, MultiHeadAttention,
    Add, LayerNormalization, RepeatVector, Lambda, Conv1D, Bidirectional
)
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

def create_advanced_probabilistic_seq2seq(input_seq_len, output_seq_len, input_dim, output_dim, latent_dim):
    """
    Enhanced Probabilistic Seq2Seq Model for Time Series Forecasting with improvements for handling time series data.
    """
    # Encoder
    encoder_inputs = Input(shape=(input_seq_len, input_dim), name="encoder_inputs")

    # Temporal Convolutional Layers for better feature extraction in time series data
    encoder_conv = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', name="encoder_conv")(encoder_inputs)
    encoder_conv = Dropout(0.3, name="encoder_conv_dropout")(encoder_conv)

    # Bidirectional LSTM for Encoder to capture patterns from both directions
    encoder_lstm = Bidirectional(LSTM(latent_dim, return_sequences=True, return_state=True, name="encoder_lstm"))
    encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_lstm(encoder_conv)  # Adjusted unpacking

    # Combine forward and backward states to form initial state for decoder
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])

    # Multi-Head Attention for Encoder
    multi_head_attention = MultiHeadAttention(num_heads=4, key_dim=latent_dim, name="multi_head_attention")
    encoder_attention = multi_head_attention(encoder_outputs, encoder_outputs)  # Query and value are the same
    encoder_attention = LayerNormalization(name="encoder_attention_norm")(encoder_attention)

    # Decoder
    decoder_inputs = Input(shape=(output_seq_len, input_dim), name="decoder_inputs")

    # Convolutional Layer in Decoder
    decoder_conv = Conv1D(filters=64, kernel_size=3, activation='relu', padding='same', name="decoder_conv")(decoder_inputs)
    decoder_conv = Dropout(0.3, name="decoder_conv_dropout")(decoder_conv)

    # Decoder LSTM, using encoder states for initialization
    decoder_lstm = LSTM(latent_dim * 2, return_sequences=True, return_state=True, name="decoder_lstm")
    decoder_outputs, _, _ = decoder_lstm(decoder_conv, initial_state=[state_h, state_c])

    # Ensure that the attention mechanism has compatible shapes (query and value)
    attention_context = multi_head_attention(decoder_outputs, encoder_attention)  # Using encoder attention in decoder
    attention_context = LayerNormalization(name="decoder_attention_norm")(attention_context)
    attention_context = Lambda(lambda x: tf.reduce_mean(x, axis=1), name="summarize_attention")(attention_context)

    # Ensure Attention Context and Decoder Outputs have compatible dimensions
    attention_context_repeated = RepeatVector(output_seq_len)(attention_context)  # Repeat to match output_seq_len
    attention_context_repeated = tf.keras.layers.Reshape((output_seq_len, latent_dim * 2))(attention_context_repeated)

    # Merge Attention Context with Decoder Outputs
    merged_context = Concatenate(axis=-1)([decoder_outputs, attention_context_repeated])

    # Dropout and Output Layers
    dropout = Dropout(0.4, name="decoder_dropout")(merged_context)
    output_mean = Dense(output_dim, activation="linear", kernel_regularizer=tf.keras.regularizers.l2(1e-4),
                        name="mean_output")(dropout)
    output_std = Dense(output_dim, activation="softplus", kernel_regularizer=tf.keras.regularizers.l2(1e-4),
                       name="std_output")(dropout)

    # Combine Outputs
    outputs = Concatenate(name="probabilistic_outputs")([output_mean, output_std])

    # Define the Model
    model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=outputs, name="enhanced_probabilistic_seq2seq_model")

    # Custom Loss Function (Negative Log-Likelihood)
    def nll_loss(y_true, y_pred):
        mean = y_pred[..., :output_dim]
        std = y_pred[..., output_dim:]
        dist = tf.compat.v1.distributions.Normal(loc=mean, scale=std)
        return -tf.reduce_mean(dist.log_prob(y_true))

    # Probabilistic Evaluation Metric: CRPS
    def crps_metric(y_true, y_pred):
        mean = y_pred[..., :output_dim]
        std = y_pred[..., output_dim:]
        return tf.reduce_mean((y_true - mean) ** 2 / (2 * std ** 2) + tf.math.log(std))

    # Adjusted R² Metric
    def adjusted_r2_score(y_true, y_pred):
        mean = y_pred[..., :output_dim]
        ss_res = tf.reduce_sum(tf.square(y_true - mean), axis=-1)
        ss_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true, axis=-1, keepdims=True)), axis=-1)
        r2 = 1 - (ss_res / (ss_tot + tf.keras.backend.epsilon()))

        # Number of samples (batch size)
        n = tf.cast(tf.shape(y_true)[0], tf.float32)

        # Number of predictors (features)
        p = tf.cast(output_dim, tf.float32)

        # Adjusted R²
        adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1 + tf.keras.backend.epsilon())
        return adjusted_r2

    # Optimizer
    optimizer = Adam(learning_rate=0.01)

    # Compile the Model
    model.compile(optimizer=optimizer, loss=nll_loss, metrics=[crps_metric, adjusted_r2_score])

    return model



In [42]:
import tensorflow as tf
import numpy as np
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

# Hyperparameters
input_seq_len = 5  # Input sequence length
output_seq_len = 2  # Output sequence length
latent_dim = 64  # Number of LSTM units
input_dim = len(input_features)  # Number of input features
output_dim = len(output_features)  # Number of output features

# Create the model
model = create_advanced_probabilistic_seq2seq(input_seq_len, output_seq_len, input_dim, output_dim, latent_dim)

# Callbacks
lr_scheduler = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, verbose=1)
early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True, verbose=1)

# Train the model
print("Training the model...")
history = model.fit(
    [train_enc_inputs, train_dec_inputs],
    train_targets,
    validation_data=([val_enc_inputs, val_dec_inputs], val_targets),
    batch_size=32,
    epochs=45,
    callbacks=[lr_scheduler, early_stopping],
    verbose=1
)

# Save the Model
model.save_weights("prob_attention_lstm.weights.h5")
print("Model saved successfully.")


Training the model...
Epoch 1/45


Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.
Instructions for updating:
The TensorFlow Distributions library has moved to TensorFlow Probability (https://github.com/tensorflow/probability). You should update all references to use `tfp.distributions` instead of `tf.distributions`.


[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 82ms/step - adjusted_r2_score: -0.1139 - crps_metric: 2259.0117 - loss: 2259.9370 - val_adjusted_r2_score: 0.1035 - val_crps_metric: 7.1561 - val_loss: 8.0752 - learning_rate: 0.0100
Epoch 2/45
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 53ms/step - adjusted_r2_score: 0.1479 - crps_metric: 7.0060 - loss: 7.9282 - val_adjusted_r2_score: 0.3127 - val_crps_metric: 5.2667 - val_loss: 6.1894 - learning_rate: 0.0100
Epoch 3/45
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 51ms/step - adjusted_r2_score: 0.3536 - crps_metric: 5.2871 - loss: 6.2116 - val_adjusted_r2_score: 0.5429 - val_crps_metric: 4.2581 - val_loss: 5.1842 - learning_rate: 0.0100
Epoch 4/45
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 51ms/step - adjusted_r2_score: 0.5784 - crps_metric: 4.2999 - loss: 5.2272 - val_adjusted_r2_score: 0.7662 - val_crps_metric: 3.4614 - val_loss: 4.3908 - learning_r