# Data Preprocessing Pipeline

This notebook demonstrates the complete data preprocessing pipeline for AIS (Automatic Identification System) vessel-tracking data.

## Contents
1. Data Loading and Validation
2. Data Cleaning
3. Geospatial Feature Engineering
4. Temporal Feature Engineering
5. Data Aggregation and Time Series Creation
6. Feature Selection and Normalization
7. Dataset Preparation for Modeling

In [None]:
import pickle
import sys
import warnings
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

warnings.filterwarnings('ignore')

# Import our custom modules
sys.path.append('../src')
from data.loader import AISDataLoader
from data.preprocessing import AISDataPreprocessor
from features.geo_features import GeoFeatureEngineer
from features.time_features import TimeFeatureEngineer
from visualization.plots import setup_plot_style

# Apply the project's shared plot style first, then set the default figure size
# so the size set here is the one in effect afterwards.
setup_plot_style()
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Data Loading and Validation

In [None]:
# Build the four pipeline objects that the rest of the notebook relies on:
# the data loader, the cleaner/aggregator, and the two feature engineers.
DATA_ROOT = '../data'
H3_RESOLUTION = 8  # resolution handed to GeoFeatureEngineer

data_loader = AISDataLoader(DATA_ROOT)
preprocessor = AISDataPreprocessor()
geo_engineer = GeoFeatureEngineer(h3_resolution=H3_RESOLUTION)
time_engineer = TimeFeatureEngineer()

print("Preprocessing pipeline initialized successfully!")

In [None]:
# Load raw data (replace with your actual data file)
# df_raw = data_loader.load_raw_data('../data/raw/your_ais_data.csv')

# Demo fallback: build a synthetic AIS-like table. Positions, speeds and courses
# are drawn independently per row, so rows of one vessel do not form a track.
# The global seed makes the draws reproducible; the values depend on the ORDER
# of the np.random calls below, so keep that order stable.
np.random.seed(42)
n_samples = 50000
n_vessels = 100
vessel_ids = ['V{:03d}'.format(number) for number in range(1, n_vessels + 1)]

# One timestamp per row, 5 minutes apart (no randomness involved).
timestamps = pd.date_range('2024-01-01', periods=n_samples, freq='5min')

# Random draws, in the same sequence as the columns appear in the frame.
sampled_vessels = np.random.choice(vessel_ids, n_samples)
latitudes = np.random.uniform(58, 62, n_samples)    # Norwegian waters
longitudes = np.random.uniform(4, 12, n_samples)
speeds = np.random.exponential(8, n_samples)        # Speed in knots
courses = np.random.uniform(0, 360, n_samples)      # Course in degrees

df_raw = pd.DataFrame({
    'timestamp': timestamps,
    'vessel_id': sampled_vessels,
    'lat': latitudes,
    'lon': longitudes,
    'speed': speeds,
    'course': courses,
})

# Inject data quality issues for demonstration.
# Missing values: 2% of the rows, first half lose 'speed', second half lose 'course'.
n_missing = int(0.02 * len(df_raw))
missing_idx = np.random.choice(df_raw.index, size=n_missing, replace=False)
half = len(missing_idx) // 2
speed_gap_rows, course_gap_rows = missing_idx[:half], missing_idx[half:]
df_raw.loc[speed_gap_rows, 'speed'] = np.nan
df_raw.loc[course_gap_rows, 'course'] = np.nan

# Invalid coordinates: 50 rows with an impossible latitude, 50 with an impossible longitude.
invalid_idx = np.random.choice(df_raw.index, size=100, replace=False)
bad_lat_rows, bad_lon_rows = invalid_idx[:50], invalid_idx[50:]
df_raw.loc[bad_lat_rows, 'lat'] = 95    # Invalid latitude
df_raw.loc[bad_lon_rows, 'lon'] = 185   # Invalid longitude

print(f"Raw dataset shape: {df_raw.shape}")
print(f"Date range: {df_raw['timestamp'].min()} to {df_raw['timestamp'].max()}")
print(f"Number of vessels: {df_raw['vessel_id'].nunique()}")

df_raw.head()

In [None]:
# Run the preprocessor's validation on the raw frame and list whatever it reports.
validation_issues = preprocessor.validate_ais_data(df_raw)

print("Data Validation Results:")
if not validation_issues:
    print("  ✅ No validation issues found!")
else:
    for issue in validation_issues:
        print(f"  ⚠️  {issue}")

## 2. Data Cleaning

In [None]:
# Clean the raw frame with outlier removal enabled and a speed threshold of 50.0,
# then report how many records were dropped.
print("Cleaning data...")
cleaning_options = {'remove_outliers': True, 'speed_threshold': 50.0}
df_clean = preprocessor.clean_ais_data(df_raw, **cleaning_options)

print(f"\nData cleaning results:")
print(f"  Original records: {len(df_raw):,}")
print(f"  Clean records: {len(df_clean):,}")
removed_count = len(df_raw) - len(df_clean)
removed_pct = removed_count / len(df_raw) * 100
print(f"  Removed: {removed_count:,} ({removed_pct:.1f}%)")

In [None]:
# Fill the gaps left in 'speed' and 'course' using the preprocessor's
# 'interpolate' strategy, then list any columns that still contain nulls.
print("Handling missing values...")
columns_to_fill = ['speed', 'course']
df_clean = preprocessor.handle_missing_values(
    df_clean, strategy='interpolate', columns=columns_to_fill
)

# Per-column null counts; only columns with at least one null are printed.
missing_after = df_clean.isnull().sum()
still_missing = missing_after > 0
print(f"\nMissing values after cleaning:")
print(missing_after[still_missing])

## 3. Geospatial Feature Engineering

In [None]:
# Add H3 cell features and then the cell-centre coordinates to the cleaned frame.
print("Creating H3 geospatial features...")
df_with_cells = geo_engineer.create_h3_cells(df_clean)
df_geo = geo_engineer.create_h3_center_coordinates(df_with_cells)

# Preview the raw position next to the H3 columns shown below.
h3_preview_cols = ['lat', 'lon', 'h3_cell', 'h3_center_lat', 'h3_center_lon']
print(f"Sample H3 cells:")
print(df_geo[h3_preview_cols].head())

In [None]:
# Apply the distance, speed and bearing feature builders in sequence, each
# grouped by vessel and each receiving the previous builder's output.
print("Calculating distance and speed features...")
per_vessel_builders = (
    geo_engineer.calculate_distance_features,
    geo_engineer.create_speed_features,
    geo_engineer.create_bearing_features,
)
for build_features in per_vessel_builders:
    df_geo = build_features(df_geo, group_col='vessel_id')

print(f"Geospatial features created:")
geo_cols = ['distance_from_previous', 'cumulative_distance', 'calculated_speed', 'bearing']
geo_summary = df_geo[geo_cols].describe()
print(geo_summary)

In [None]:
# Bar chart of the 20 most frequently observed H3 cells, labelled by rank
# rather than by cell id.
h3_counts = df_geo['h3_cell'].value_counts().head(20)
bar_positions = range(len(h3_counts))
rank_labels = [f'Cell {i+1}' for i in bar_positions]

fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(bar_positions, h3_counts.values)
ax.set_title('Top 20 H3 Cells by Observation Count')
ax.set_xlabel('H3 Cell Rank')
ax.set_ylabel('Number of Observations')
ax.set_xticks(bar_positions)
ax.set_xticklabels(rank_labels, rotation=45)
fig.tight_layout()
plt.show()

print(f"Total unique H3 cells: {df_geo['h3_cell'].nunique()}")

## 4. Temporal Feature Engineering

In [None]:
# Derive basic, cyclical and seasonal time features from the 'timestamp' column,
# each step building on the previous step's output.
print("Creating temporal features...")
time_col = 'timestamp'
df_basic_time = time_engineer.create_basic_time_features(df_geo, time_col)
df_cyclical_time = time_engineer.create_cyclical_features(df_basic_time, time_col)
df_temporal = time_engineer.create_seasonal_features(df_cyclical_time, time_col)

print("Temporal features created:")
temporal_cols = ['hour', 'day_of_week', 'month', 'hour_sin', 'hour_cos', 'is_weekend', 'season']
temporal_preview = df_temporal[temporal_cols].head()
print(temporal_preview)

In [None]:
# Add per-vessel lag and rolling-window features for 'speed', which serves as
# the example target variable.
print("Creating lag and rolling features...")

lag_steps = [1, 2, 3, 6, 12]  # 1, 2, 3, 6, 12 time steps back
df_temporal = time_engineer.create_lag_features(
    df_temporal, target_col='speed', group_cols=['vessel_id'], lags=lag_steps
)

rolling_windows = [3, 6, 12, 24]
rolling_stats = ['mean', 'std', 'max']
df_temporal = time_engineer.create_rolling_features(
    df_temporal,
    target_col='speed',
    group_cols=['vessel_id'],
    windows=rolling_windows,
    features=rolling_stats,
)

print("Lag and rolling features created.")
# Count every column whose name mentions 'lag' or 'rolling'.
lag_cols = [
    col for col in df_temporal.columns
    if any(marker in col for marker in ('lag', 'rolling'))
]
print(f"Number of lag/rolling features: {len(lag_cols)}")

## 5. Data Aggregation and Time Series Creation

In [None]:
# Add the integer 'time_idx' column used as the forecasting time axis.
# NOTE(review): if create_time_index forwards `freq` to pandas, newer pandas
# releases deprecate the uppercase 'H' alias in favour of 'h' — confirm.
print("Creating time index...")
df_indexed = preprocessor.create_time_index(df_temporal, freq='H')  # Hourly aggregation

time_steps = df_indexed['time_idx']
print(f"Time index range: {time_steps.min()} to {time_steps.max()}")
print(f"Number of unique time steps: {time_steps.nunique()}")

In [None]:
# Aggregate by H3 cell and time step, averaging both speed columns.
print("Aggregating to H3 grid...")
grid_value_cols = ['speed', 'calculated_speed']
df_grid = preprocessor.aggregate_to_grid(
    df_indexed,
    h3_col='h3_cell',
    time_col='time_idx',
    value_cols=grid_value_cols,
    agg_functions={col: 'mean' for col in grid_value_cols},
)

print(f"Grid aggregation results:")
print(f"  Shape: {df_grid.shape}")
print(f"  Unique H3 cells: {df_grid['h3_cell'].nunique()}")
print(f"  Time steps: {df_grid['time_idx'].nunique()}")

df_grid.head()

In [None]:
# Side-by-side histograms of the aggregated grid: vessel counts on the left,
# average speeds (nulls dropped) on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

histogram_panels = [
    (df_grid['vessel_count'],
     'Distribution of Vessel Counts per Grid Cell',
     'Vessel Count'),
    (df_grid['speed'].dropna(),
     'Distribution of Average Speeds per Grid Cell',
     'Average Speed (knots)'),
]
for panel_ax, (values, title, xlabel) in zip(axes, histogram_panels):
    panel_ax.hist(values, bins=30, alpha=0.7, edgecolor='black')
    panel_ax.set_title(title)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 6. Feature Selection and Normalization

In [None]:
# Prepare final dataset for modeling: pick the candidate feature columns and
# keep the ones that are actually present in the aggregated grid.
print("Preparing final dataset...")

# Candidate features for modeling
feature_columns = [
    'h3_cell', 'time_idx', 'vessel_count', 'speed',
    'hour', 'day_of_week', 'month', 'is_weekend',
    'hour_sin', 'hour_cos', 'month_sin', 'month_cos'
]

# Add up to five speed lag features, if any are present in df_grid
lag_features = [col for col in df_grid.columns if 'speed_lag' in col]
feature_columns.extend(lag_features[:5])  # Use first 5 lag features

# Keep only the columns df_grid really has, and say which requested ones were
# skipped: a silent drop here hides features lost before this step and only
# surfaces later, when a downstream cell accesses the missing column.
available_features = [col for col in feature_columns if col in df_grid.columns]
missing_features = [col for col in feature_columns if col not in df_grid.columns]
if missing_features:
    print(f"  ⚠️  Requested features not found in df_grid (skipped): {missing_features}")

df_model = df_grid[available_features].copy()

print(f"Selected features: {available_features}")
print(f"Final dataset shape: {df_model.shape}")

In [None]:
# Report the remaining nulls, drop rows with no target value, then median-fill
# whatever is still missing in the numeric columns.
print("Final missing value check:")
missing_final = df_model.isnull().sum()
has_missing = missing_final > 0
print(missing_final[has_missing])

# Rows without a 'speed' value cannot be used as training targets.
df_model = df_model.dropna(subset=['speed'])

# Median-fill each numeric column that still contains at least one null.
numeric_cols = df_model.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    column = df_model[col]
    if column.isnull().any():
        df_model[col] = column.fillna(column.median())

print(f"Final dataset shape after cleaning: {df_model.shape}")

In [None]:
# Normalize 'vessel_count', 'speed' and every column whose name contains 'lag',
# using the preprocessor's 'minmax' method. The returned parameters are kept so
# the transformation can be inverted later.
# NOTE(review): the parameters are derived from the whole of df_model here —
# confirm that is intended if a train/test split happens downstream.
print("Normalizing features...")
lag_like_cols = [name for name in df_model.columns if 'lag' in name]
requested_cols = ['vessel_count', 'speed'] + lag_like_cols
normalize_cols = [name for name in requested_cols if name in df_model.columns]

df_normalized, norm_params = preprocessor.normalize_features(
    df_model, columns=normalize_cols, method='minmax'
)

print(f"Normalized columns: {list(norm_params.keys())}")
print("Normalization parameters saved for inverse transformation.")

## 7. Dataset Preparation for Modeling

In [None]:
# Shape the normalized frame into the layout used for time series modeling:
# a group identifier, a target column named 'value', and integer-typed
# time/calendar columns.
df_final = df_normalized.copy()
df_final['GroupIDS'] = df_final['h3_cell']  # Use H3 cell as group identifier

# Copy the target into the standard 'value' column ('speed' itself is kept)
df_final['value'] = df_final['speed']

# Ensure proper data types.
# 'time_idx' is required (the summary below reads it). The calendar columns are
# optional: the feature selection step keeps only columns that exist in the
# aggregated grid, so cast them only when present instead of raising KeyError.
df_final['time_idx'] = df_final['time_idx'].astype(int)
optional_int_columns = ['hour', 'day_of_week', 'month', 'is_weekend']
skipped_int_columns = []
for col in optional_int_columns:
    if col in df_final.columns:
        df_final[col] = df_final[col].astype(int)
    else:
        skipped_int_columns.append(col)
if skipped_int_columns:
    print(f"  ⚠️  Columns not present, integer cast skipped: {skipped_int_columns}")

print(f"Final dataset for modeling:")
print(f"  Shape: {df_final.shape}")
print(f"  Columns: {list(df_final.columns)}")
print(f"  Time range: {df_final['time_idx'].min()} to {df_final['time_idx'].max()}")
print(f"  Number of groups: {df_final['GroupIDS'].nunique()}")

df_final.head()

In [None]:
# Last sanity pass over the modeling frame: total nulls, duplicate rows,
# column dtypes and summary statistics.
print("Final data quality assessment:")
total_missing = df_final.isnull().sum().sum()
print(f"  Missing values: {total_missing}")
duplicate_rows = df_final.duplicated().sum()
print(f"  Duplicate rows: {duplicate_rows}")
print(f"  Data types:")
column_dtypes = df_final.dtypes
print(column_dtypes)

# Basic statistics
print("\nFinal dataset statistics:")
summary_stats = df_final.describe()
print(summary_stats)

In [None]:
# Persist the modeling frame through the data loader and pickle the
# normalization parameters next to it so predictions can be un-scaled later.
print("Saving processed data...")
data_loader.save_processed_data(df_final, 'processed_ais_data')

# Save normalization parameters.
# The target directory may not exist yet (e.g. on a fresh checkout), which
# would make open() fail, so create it first; exist_ok keeps re-runs harmless.
norm_params_path = Path('../data/processed/normalization_params.pkl')
norm_params_path.parent.mkdir(parents=True, exist_ok=True)
# Pickle files can execute code when loaded: only load this file from a trusted location.
with norm_params_path.open('wb') as f:
    pickle.dump(norm_params, f)

print("✅ Preprocessing complete!")
print(f"   Processed data saved as: processed_ais_data.pkl")
print(f"   Normalization parameters saved as: {norm_params_path.name}")
print(f"   Ready for model training with {len(df_final):,} samples")

## Summary

The preprocessing pipeline successfully:

1. ✅ **Loaded and validated** raw AIS data
2. ✅ **Cleaned data** by removing invalid records and outliers
3. ✅ **Created geospatial features** using H3 hexagonal grids
4. ✅ **Engineered temporal features** including cyclical encoding
5. ✅ **Aggregated data** to regular time intervals and spatial grids
6. ✅ **Normalized features** for model training
7. ✅ **Prepared final dataset** in the format required for forecasting models

The processed data is now ready for training deep learning forecasting models such as the Temporal Fusion Transformer (TFT) and N-BEATS.

**Next Steps:**
- Train forecasting models using the prepared dataset
- Evaluate model performance and tune hyperparameters
- Deploy the best-performing model for production use