In [None]:
"""
Jupyter notebook for exploratory data analysis
Note: This is a Python script representation of a Jupyter notebook
"""

# %% [markdown]
# # Exploratory Data Analysis - Vehicle Trip Anomaly Detection
# 
# This notebook explores the Porto taxi trajectory dataset and performs initial analysis.

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import sys
import json

# Add project root to path.
# The root is taken to be the parent of the kernel's current working directory,
# so this cell expects the notebook to be launched from a folder directly
# below the project root.
project_root = Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from config.settings import RAW_DATA_FILE, OUTPUTS_DIR
from src.data_ingestion import DataIngestion
from src.data_preprocessing import DataPreprocessor

# Set plotting style.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*";
# earlier releases only know the un-prefixed name. Pick whichever one this
# installation actually provides instead of failing on older environments.
_style_name = 'seaborn-v0_8-darkgrid'
if _style_name not in plt.style.available:
    _style_name = 'seaborn-darkgrid'
plt.style.use(_style_name)
# Palette is applied after the style sheet, same order as before.
sns.set_palette("husl")

# %%
# Load the raw dataset via the project's ingestion helper, then print its
# shape and column names and render the first rows.
ingestor = DataIngestion()
raw_data = ingestor.load_data()

column_names = [*raw_data.columns]
print(f"Dataset shape: {raw_data.shape}")
print(f"\nColumns: {column_names}")
print("\nFirst few rows:")
display(raw_data.head())

# %%
# Data validation: run the ingestor's validator and list every report entry
# except 'missing_values', which is skipped here.
is_valid, validation_report = ingestor.validate_data()
print(f"Data valid: {is_valid}")
print("\nValidation report:")
for key, value in validation_report.items():
    if key == 'missing_values':
        continue
    print(f"  {key}: {value}")

# %%
# Summary statistics as reported by the ingestion helper: taxi count,
# trajectory count and the min/max of the date range.
summary = ingestor.get_summary_statistics()
print("Data Summary:")
print(f"  Taxi count: {summary['taxi_count']}")
print(f"  Trajectory count: {summary['trajectory_count']}")
date_range = summary['date_range']
print(f"  Date range: {date_range['min']} to {date_range['max']}")

# %%
# Data cleaning: pass the raw frame through the project's preprocessor and
# report how many rows the cleaning step dropped.
preprocessor = DataPreprocessor()
cleaned_data = preprocessor.clean_data(raw_data)

removed_count = len(raw_data) - len(cleaned_data)
print(f"Cleaned data shape: {cleaned_data.shape}")
print(f"\nRemoved {removed_count} records")

# %%
# Basic statistics of cleaned data: distinct taxi / trajectory counts followed
# by mean, standard deviation, minimum and maximum of the trip distance.
print("Basic statistics of cleaned data:")
print(f"Unique taxis: {cleaned_data['taxi_id'].nunique()}")
print(f"Unique trajectories: {cleaned_data['trajectory_id'].nunique()}")
print("\nDistance statistics (km):")
distance_km = cleaned_data['distance_km']
distance_reducers = (
    ("Mean", distance_km.mean),
    ("Std", distance_km.std),
    ("Min", distance_km.min),
    ("Max", distance_km.max),
)
for label, reducer in distance_reducers:
    print(f"  {label}: {reducer():.2f}")

# %%
# Visualize distance distribution: histogram (left) and horizontal box plot
# (right) of the trip distance.
# Missing distances are dropped once up front so that both panels are drawn
# from exactly the same sample (previously only the box plot excluded NaNs).
trip_distances = cleaned_data['distance_km'].dropna()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_box = axes

# Histogram
ax_hist.hist(trip_distances, bins=50, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Distance (km)')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Trip Distances')
ax_hist.grid(True, alpha=0.3)

# Box plot
ax_box.boxplot(trip_distances, vert=False)
ax_box.set_xlabel('Distance (km)')
ax_box.set_title('Box Plot of Trip Distances')

plt.tight_layout()
plt.show()

# %%
# Time analysis: trip counts and mean trip distance per hour of day.
#
# Timestamps are parsed once and both derived columns come from that single
# result. `assign` returns a new frame rather than writing columns into the
# existing object, which avoids pandas' SettingWithCopyWarning should the
# cleaning step have returned a slice, and keeps this cell safe to re-run.
# NOTE(review): pd.to_datetime reads plain integers as nanoseconds since the
# epoch; if 'timestamp' holds Unix seconds at this point, pass unit='s' —
# confirm against the preprocessor's output.
trip_times = pd.to_datetime(cleaned_data['timestamp'])
cleaned_data = cleaned_data.assign(
    hour=trip_times.dt.hour,
    day=trip_times.dt.day,
)

# Trips by hour
trips_by_hour = cleaned_data['hour'].value_counts().sort_index()

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

axes[0].bar(trips_by_hour.index, trips_by_hour.values)
axes[0].set_xlabel('Hour of Day')
axes[0].set_ylabel('Number of Trips')
axes[0].set_title('Trips by Hour of Day')
axes[0].set_xticks(range(0, 24, 2))
axes[0].grid(True, alpha=0.3, axis='y')

# Distance by hour
distance_by_hour = cleaned_data.groupby('hour')['distance_km'].mean()

axes[1].plot(distance_by_hour.index, distance_by_hour.values, marker='o', linewidth=2)
axes[1].set_xlabel('Hour of Day')
axes[1].set_ylabel('Average Distance (km)')
axes[1].set_title('Average Trip Distance by Hour')
axes[1].set_xticks(range(0, 24, 2))
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# %%
# Geographic analysis: side-by-side longitude/latitude scatter plots of the
# source points (left, default colour) and target points (right, orange).
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# (column prefix, panel title, extra scatter keyword arguments) per panel
scatter_panels = (
    ('source', 'Source Points Distribution', {}),
    ('target', 'Target Points Distribution', {'color': 'orange'}),
)

for ax, (prefix, panel_title, extra_style) in zip(axes, scatter_panels):
    ax.scatter(
        cleaned_data[f'{prefix}_lon'],
        cleaned_data[f'{prefix}_lat'],
        alpha=0.3,
        s=10,
        **extra_style,
    )
    ax.set_xlabel('Longitude')
    ax.set_ylabel('Latitude')
    ax.set_title(panel_title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# %%
# Save cleaned data for later use.
# Neither DataFrame.to_csv nor open() creates missing parent folders, so make
# sure the output directory exists before writing anything into it.
Path(OUTPUTS_DIR).mkdir(parents=True, exist_ok=True)

output_path = OUTPUTS_DIR / "eda_cleaned_data.csv"
cleaned_data.to_csv(output_path, index=False)
print(f"Saved cleaned data to {output_path}")

# Generate EDA report: shapes before/after cleaning, entity counts, distance
# statistics and the timestamp range. Numeric values are cast to built-in
# int/float so json can serialise them regardless of the pandas/numpy scalar
# type returned.
distance_km = cleaned_data['distance_km']
eda_report = {
    'original_shape': raw_data.shape,
    'cleaned_shape': cleaned_data.shape,
    'records_removed': len(raw_data) - len(cleaned_data),
    'taxi_count': int(cleaned_data['taxi_id'].nunique()),
    'trajectory_count': int(cleaned_data['trajectory_id'].nunique()),
    'distance_stats': {
        'mean': float(distance_km.mean()),
        'std': float(distance_km.std()),
        'min': float(distance_km.min()),
        'max': float(distance_km.max()),
    },
    'time_range': {
        'start': str(cleaned_data['timestamp'].min()),
        'end': str(cleaned_data['timestamp'].max()),
    }
}

report_path = OUTPUTS_DIR / "eda_report.json"
# Explicit encoding keeps the report identical across platforms whose default
# text encoding differs.
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(eda_report, f, indent=2)

print(f"EDA report saved to {report_path}")