# AIS Data Exploration

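This notebook contains exploratory data analysis for AIS (Automatic Identification System) vessel-tracking data. By default it runs on synthetic demo data generated in Section 1; load a real AIS file there to analyze actual vessel messages.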

## Contents
1. Data Loading and Overview
2. Statistical Analysis
3. Geospatial Analysis
4. Temporal Patterns
5. Vessel Behavior Analysis
6. Data Quality Assessment

In [None]:
# Analysis / plotting stack used throughout the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
import h3
from datetime import datetime
import warnings
# Silences every warning category for the rest of the kernel session. This
# keeps cell output tidy but also hides deprecation notices raised by the
# libraries imported above.
warnings.filterwarnings('ignore')

# Import our custom modules
import sys
# The path is resolved relative to the kernel's current working directory,
# so the project imports below only work when '../src' exists from there.
sys.path.append('../src')
from data.loader import AISDataLoader
from data.preprocessing import AISDataPreprocessor
from visualization.plots import setup_plot_style

# Set up plotting style
# NOTE(review): plt.style.use() runs after setup_plot_style(); if both touch
# the same rcParams the later call presumably wins — confirm the intended order.
setup_plot_style()
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')

## 1. Data Loading and Overview

In [None]:
# Create the loader rooted at the project data directory
data_loader = AISDataLoader('../data')


def _show_listing(heading, fetch_entries):
    """Print `heading`, then one bullet line per entry returned by `fetch_entries()`.

    The heading is printed before the entries are fetched. Returns whatever
    `fetch_entries()` returned so the caller can keep the listing.
    """
    print(heading)
    entries = fetch_entries()
    for entry in entries:
        print(f"  - {entry}")
    return entries


# Raw files first, then processed files
raw_files = _show_listing("Available raw data files:", data_loader.list_raw_files)
processed_files = _show_listing(
    "\nAvailable processed data files:", data_loader.list_processed_files
)

In [None]:
# Load sample data (replace with your actual data file)
# df = data_loader.load_raw_data('../data/raw/sample_ais_data.csv')
# Demo fallback: build a synthetic AIS-like table so the notebook runs
# without any data file on disk.
np.random.seed(42)
n_samples = 10000
vessel_ids = ['V%03d' % number for number in range(1, 51)]  # V001 .. V050

# One observation every 10 minutes starting 2024-01-01
sample_timestamps = pd.date_range('2024-01-01', periods=n_samples, freq='10min')

# The draws below consume the seeded global RNG in this exact order;
# reordering them would change the generated values.
sample_vessels = np.random.choice(vessel_ids, n_samples)
sample_lat = np.random.uniform(58, 62, n_samples)      # Norwegian coast area
sample_lon = np.random.uniform(4, 12, n_samples)
sample_speed = np.random.exponential(8, n_samples)     # Speed in knots
sample_course = np.random.uniform(0, 360, n_samples)   # Course in degrees

df = pd.DataFrame({
    'timestamp': sample_timestamps,
    'vessel_id': sample_vessels,
    'lat': sample_lat,
    'lon': sample_lon,
    'speed': sample_speed,
    'course': sample_course,
})

first_seen = df['timestamp'].min()
last_seen = df['timestamp'].max()
print(f"Dataset shape: {df.shape}")
print(f"Date range: {first_seen} to {last_seen}")
print(f"Number of unique vessels: {df['vessel_id'].nunique()}")

df.head()

## 2. Statistical Analysis

In [None]:
# Basic statistics: column dtypes / non-null counts, then numeric summary
print("Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
df.info()
print("\nStatistical Summary:")
print(df.describe())

In [None]:
# Distribution plots: one histogram per numeric column on a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (column, bin count, panel title, x-axis label), listed in row-major panel order
histogram_specs = [
    ('speed', 50, 'Speed Distribution', 'Speed (knots)'),
    ('course', 36, 'Course Distribution', 'Course (degrees)'),
    ('lat', 50, 'Latitude Distribution', 'Latitude'),
    ('lon', 50, 'Longitude Distribution', 'Longitude'),
]

# axes.flat walks the grid row by row, matching the order of the specs above
for panel, (column, n_bins, panel_title, x_label) in zip(axes.flat, histogram_specs):
    panel.hist(df[column], bins=n_bins, alpha=0.7, edgecolor='black')
    panel.set_title(panel_title)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

## 3. Geospatial Analysis

In [None]:
# Create a sample map with vessel positions
# Cap the sample at the number of available rows so the cell also works on
# datasets with fewer than 1000 observations, and pin the seed so the same
# points are drawn regardless of how much RNG state earlier cells consumed.
sample_positions = df.sample(n=min(1000, len(df)), random_state=42)

# Create base map centered on the data
center_lat = df['lat'].mean()
center_lon = df['lon'].mean()

m = folium.Map(location=[center_lat, center_lon], zoom_start=8)

# Add vessel positions
# itertuples avoids building a Series per row (as iterrows does); only the
# columns used by the marker are selected.
marker_columns = sample_positions[['vessel_id', 'lat', 'lon', 'speed']]
for position in marker_columns.itertuples(index=False):
    folium.CircleMarker(
        location=[position.lat, position.lon],
        radius=3,
        popup=f"Vessel: {position.vessel_id}<br>Speed: {position.speed:.1f} knots",
        color='blue',
        fill=True,
        # folium's documented path options are snake_case (fill_color,
        # fill_opacity); the camelCase spellings are not among them, so the
        # requested 0.6 opacity may not have been applied.
        fill_color='blue',
        fill_opacity=0.6
    ).add_to(m)

# Display map (last expression of the cell renders the interactive map)
m

In [None]:
# Heatmap of vessel positions: hexagonal binning of (lon, lat) observations
fig, ax = plt.subplots(figsize=(12, 8))

# mincnt=1 leaves hexagons without any observation blank
density = ax.hexbin(df['lon'], df['lat'], gridsize=30, cmap='YlOrRd', mincnt=1)
fig.colorbar(density, ax=ax, label='Number of Observations')

ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Vessel Position Density')
plt.show()

## 4. Temporal Patterns

In [None]:
# Add temporal features (written onto df as new columns)
timestamp_parts = df['timestamp'].dt
df['hour'] = timestamp_parts.hour
df['day_of_week'] = timestamp_parts.day_name()
df['month'] = timestamp_parts.month

# 2x2 grid: hourly counts | hourly mean speed / weekday counts | daily series
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_hour_count, ax_hour_speed), (ax_weekday, ax_daily) = axes

# Messages per hour
hourly_counts = df.groupby('hour').size()
ax_hour_count.bar(hourly_counts.index, hourly_counts.values)
ax_hour_count.set(
    title='AIS Messages by Hour of Day',
    xlabel='Hour',
    ylabel='Number of Messages',
)

# Average speed by hour
hourly_speed = df.groupby('hour')['speed'].mean()
ax_hour_speed.plot(hourly_speed.index, hourly_speed.values, marker='o')
ax_hour_speed.set(
    title='Average Speed by Hour of Day',
    xlabel='Hour',
    ylabel='Average Speed (knots)',
)

# Messages by day of week, forced into calendar order (groupby sorts the
# day names alphabetically)
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
daily_counts = df.groupby('day_of_week').size().reindex(day_order)
weekday_slots = range(len(day_order))
ax_weekday.bar(weekday_slots, daily_counts.values)
ax_weekday.set(
    title='AIS Messages by Day of Week',
    xlabel='Day of Week',
    ylabel='Number of Messages',
)
ax_weekday.set_xticks(weekday_slots)
ax_weekday.set_xticklabels([name[:3] for name in day_order])

# Time series of daily message counts
daily_ts = df.set_index('timestamp').resample('D').size()
ax_daily.plot(daily_ts.index, daily_ts.values)
ax_daily.set(
    title='Daily Message Count Over Time',
    xlabel='Date',
    ylabel='Messages per Day',
)
ax_daily.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 5. Vessel Behavior Analysis

In [None]:
# Vessel activity statistics: one row per vessel. Named aggregation yields
# the flat "<column>_<statistic>" column names directly.
vessel_stats = (
    df.groupby('vessel_id')
    .agg(
        timestamp_count=('timestamp', 'count'),
        timestamp_min=('timestamp', 'min'),
        timestamp_max=('timestamp', 'max'),
        speed_mean=('speed', 'mean'),
        speed_std=('speed', 'std'),
        speed_max=('speed', 'max'),
        lat_min=('lat', 'min'),
        lat_max=('lat', 'max'),
        lon_min=('lon', 'min'),
        lon_max=('lon', 'max'),
    )
    .round(2)
)

# Rank vessels by how many messages they sent
most_active = vessel_stats.sort_values('timestamp_count', ascending=False).head(10)
print("Top 10 most active vessels:")
print(most_active)

In [None]:
# Speed analysis by vessel
# Relies on `vessel_stats` (with its 'timestamp_count' column) built in the
# vessel activity statistics cell.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Distribution of average speeds
avg_speeds = df.groupby('vessel_id')['speed'].mean()
axes[0].hist(avg_speeds, bins=20, alpha=0.7, edgecolor='black')
axes[0].set_title('Distribution of Average Vessel Speeds')
axes[0].set_xlabel('Average Speed (knots)')
axes[0].set_ylabel('Number of Vessels')

# Box plot of speeds for top 10 active vessels
top_vessels = vessel_stats.sort_values('timestamp_count', ascending=False).head(10).index
speed_data = [df[df['vessel_id'] == vessel]['speed'].values for vessel in top_vessels]

# Tick labels are set on the axis rather than through boxplot(labels=...):
# that keyword was deprecated in Matplotlib 3.9 in favour of tick_labels,
# whereas set_xticklabels works on both older and newer releases.
# str() keeps the 6-character truncation working for non-string IDs
# (e.g. integer identifiers in real AIS data).
axes[1].boxplot(speed_data)
axes[1].set_xticklabels([str(vessel)[:6] for vessel in top_vessels])
axes[1].set_title('Speed Distribution for Top 10 Active Vessels')
axes[1].set_xlabel('Vessel ID')
axes[1].set_ylabel('Speed (knots)')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 6. Data Quality Assessment

In [None]:
# Data quality assessment: run the project validator on the working frame
preprocessor = AISDataPreprocessor()
quality_issues = preprocessor.validate_ais_data(df)

print("Data Quality Issues:")
if not quality_issues:
    # Validator reported nothing
    print("  No major data quality issues found!")
else:
    # One bullet per reported issue
    for problem in quality_issues:
        print(f"  - {problem}")

In [None]:
# Missing value analysis: per-column null count and share of rows
missing_data = df.isnull().sum()
missing_pct = missing_data.div(len(df)).mul(100)

missing_df = pd.concat(
    [missing_data.rename('Missing Count'), missing_pct.rename('Missing Percentage')],
    axis=1,
)

# Columns that contain at least one null
has_missing = missing_data > 0

print("Missing Data Summary:")
print(missing_df[has_missing])

# Visualize missing data (only when there is something to show)
if not has_missing.any():
    print("No missing data found!")
else:
    missing_fig, missing_ax = plt.subplots(figsize=(10, 6))
    missing_pct[has_missing].plot(kind='bar', ax=missing_ax)
    missing_ax.set_title('Missing Data by Column')
    missing_ax.set_ylabel('Percentage Missing')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

## Summary and Next Steps

Based on this exploratory analysis:

1. **Data Overview**: [Add your observations about the dataset]
2. **Temporal Patterns**: [Add insights about time-based patterns]
3. **Spatial Distribution**: [Add insights about vessel locations]
4. **Vessel Behavior**: [Add insights about individual vessel patterns]
5. **Data Quality**: [Add assessment of data quality issues]

**Recommendations for preprocessing:**
- [Add specific preprocessing steps needed]
- [Add feature engineering suggestions]
- [Add data cleaning recommendations]