In [None]:
# ==========================================================
# SMART FISHER LANKA - COMPLETE DATASET GENERATOR
# For Fishing Trip Base Cost Prediction (Fuel, Ice, Water)
# WITH CEYPETCO FUEL PRICE EXTRACTION
# ==========================================================

!pip install -q pandas numpy scikit-learn lxml html5lib tqdm

import numpy as np
import pandas as pd
import random
import math
import warnings
from datetime import datetime
from math import radians, sin, cos, sqrt, atan2
warnings.filterwarnings("ignore")

# ------------------- 0. CEYPETCO FUEL PRICE DATA EXTRACTION -------------------
def _build_fallback_fuel_data(seed=42):
    """Build the synthetic monthly fuel price table used when scraping fails.

    One row is produced for the 15th of every month from January 2015 through
    June 2025. Prices grow 8% per year from fixed 2015 base values, get a small
    month-of-year adjustment and ±2% noise; every 2025 row is then overwritten
    with fixed prices (320 / 310 / 180).

    The noise comes from a private, fixed-seed generator so that offline runs
    produce the same table every time and the global NumPy random state is
    left untouched.

    Args:
        seed: Seed for the private random generator.

    Returns:
        DataFrame with columns Date, Petrol92, Diesel_AD, Kerosene_LK,
        sorted newest-first with a fresh 0..n-1 index.
    """
    price_cols = ["Petrol92", "Diesel_AD", "Kerosene_LK"]
    rng = np.random.RandomState(seed)
    base_year = 2015
    rows = []

    for year in range(base_year, 2026):
        final_month = 6 if year == 2025 else 12  # Only up to mid-2025
        year_factor = 1 + (year - base_year) * 0.08  # 8% annual increase
        for month in range(1, final_month + 1):
            month_factor = 1 + 0.01 * (month - 6) / 6  # Seasonal variation

            base_p = 120 * year_factor * month_factor
            base_d = 100 * year_factor * month_factor
            base_k = 60 * year_factor * month_factor

            rows.append({
                "Date": datetime(year, month, 15),
                "Petrol92": round(base_p * rng.uniform(0.98, 1.02), 1),
                "Diesel_AD": round(base_d * rng.uniform(0.98, 1.02), 1),
                "Kerosene_LK": round(base_k * rng.uniform(0.98, 1.02), 1),
            })

    df_fallback = pd.DataFrame(rows, columns=["Date"] + price_cols)

    # Pin every 2025 row to the fixed "current" prices
    is_2025 = df_fallback["Date"] >= pd.Timestamp("2025-01-01")
    df_fallback.loc[is_2025, price_cols] = [320.0, 310.0, 180.0]

    return df_fallback.sort_values("Date", ascending=False).reset_index(drop=True)


def get_ceypetco_fuel_data():
    """Return a fuel price history table, scraped from Ceypetco when possible.

    The first HTML table on the Ceypetco historical-prices page is read and its
    "LP 92", "LAD" and "LK" columns are renamed to Petrol92, Diesel_AD and
    Kerosene_LK. Rows whose date (expected as dd.mm.YYYY) or any of the three
    prices cannot be parsed are dropped.

    If the download or parsing fails for any reason, or no usable rows remain,
    a deterministic synthetic table is returned instead (best-effort: the
    failure is reported on stdout, never raised).

    Returns:
        DataFrame containing at least Date, Petrol92, Diesel_AD and
        Kerosene_LK, sorted newest-first with a fresh 0..n-1 index.
    """
    price_cols = ["Petrol92", "Diesel_AD", "Kerosene_LK"]
    try:
        print("Fetching fuel prices from Ceypetco website...")
        df = pd.read_html("https://ceypetco.gov.lk/historical-prices/", header=0)[0]
        df.columns = df.columns.str.strip()
        df = df.rename(columns={"LP 92": "Petrol92", "LAD": "Diesel_AD", "LK": "Kerosene_LK"})
        df["Date"] = pd.to_datetime(df["Date"], format="%d.%m.%Y", errors="coerce")
        for col in price_cols:
            df[col] = pd.to_numeric(df[col], errors="coerce")
        df = df.dropna(subset=["Date"] + price_cols)
        df = df.sort_values("Date", ascending=False)

        if not df.empty:
            print(f"✓ Successfully fetched {len(df)} fuel price records")
            return df.reset_index(drop=True)
        # The page loaded but nothing survived cleaning: say so instead of
        # silently switching data sources.
        print("⚠ Could not fetch from Ceypetco: no usable price rows in the table")
    except Exception as e:  # deliberate best-effort: any failure falls back
        print(f"⚠ Could not fetch from Ceypetco: {e}")

    print("Using fallback historical data...")
    return _build_fallback_fuel_data()

# Load the fuel price table once, when this cell/module runs.
# get_fuel_price_for_date() reads this module-level frame and relies on it
# being sorted newest-first, which is how get_ceypetco_fuel_data() returns it.
df_fuel = get_ceypetco_fuel_data()

def get_fuel_price_for_date(target_date):
    """Look up the petrol, diesel and kerosene prices in effect on a date.

    Args:
        target_date: A datetime-like value, or a string that
            ``pd.to_datetime`` can parse.

    Returns:
        Tuple ``(petrol_price, diesel_price, kerosene_price)`` of floats taken
        from the newest ``df_fuel`` row dated on or before ``target_date``.
        When no row is that old, the first row of ``df_fuel`` (the latest
        available price) is used instead.
    """
    when = pd.to_datetime(target_date) if isinstance(target_date, str) else target_date

    # df_fuel is ordered newest-first, so the first surviving row is the most
    # recent price on or before the requested date.
    on_or_before = df_fuel[df_fuel["Date"] <= when]
    source = on_or_before if len(on_or_before) > 0 else df_fuel
    row = source.iloc[0]

    return float(row["Petrol92"]), float(row["Diesel_AD"]), float(row["Kerosene_LK"])

# Current prices (for reference only): looked up for the moment this cell runs
# (naive local time from datetime.now()) and echoed to stdout.
current_petrol, current_diesel, current_kerosene = get_fuel_price_for_date(datetime.now())
print(f"\nCurrent Fuel Prices (LKR/Liter):")
print(f"  Petrol (92 Octane): {current_petrol}")
print(f"  Diesel: {current_diesel}")
print(f"  Kerosene: {current_kerosene}")

# ------------------- 1. COST CONSTANTS -------------------
# Unit prices used by generate_trip_record() to turn quantities into LKR costs.
ICE_COST_PER_KG = 12.0  # LKR per kg (realistic ice cost)
# Charged on all trips except multi-day IMUL/MDBT trips, which use the tank rate.
WATER_COST_PER_LITER = 5.0  # LKR per liter for drinking water
TANK_WATER_COST = 0.0   # LKR per liter for tank water (free)

# ------------------- 2. SRI LANKAN PORTS -------------------
# Candidate departure ports. Each entry carries a display name, latitude and
# longitude in decimal degrees, and a region label. generate_trip_record()
# picks one entry uniformly at random per trip.
# NOTE(review): several coordinates look mismatched with their port names --
# the "Galle" entry (lon 81.1185) lies east of the "Hambantota" entry
# (lon 80.5549), and the "Beruwala" entry (lat 6.0535, region Western) lies
# south of "Galle" (lat 6.1241). The values may have been shifted between
# rows; verify against a gazetteer before relying on them.
PORTS = [
    {"name": "Negombo", "lat": 7.2090, "lon": 79.8350, "region": "Western"},
    {"name": "Kalpitiya", "lat": 8.5560, "lon": 79.8000, "region": "North Western"},
    {"name": "Colombo", "lat": 6.9271, "lon": 79.8612, "region": "Western"},
    {"name": "Galle", "lat": 6.1241, "lon": 81.1185, "region": "Southern"},
    {"name": "Beruwala", "lat": 6.0535, "lon": 80.2209, "region": "Western"},
    {"name": "Hambantota", "lat": 5.9549, "lon": 80.5549, "region": "Southern"},
    {"name": "Trincomalee", "lat": 8.5500, "lon": 81.2330, "region": "Eastern"},
    {"name": "Kankesanthurai", "lat": 9.6615, "lon": 80.0104, "region": "Northern"}
]

# ------------------- 3. BOAT TYPES (REALISTIC SRI LANKAN) -------------------
# Per-boat-class configuration, keyed by a short class code. Key order matters:
# it defines the order of boat_keys/boat_probs used for weighted sampling.
#
# Fields of every entry:
#   name                 - human-readable class name (copied into each record)
#   hp_range             - (min, max) engine horsepower, both inclusive
#   fuel_type            - "diesel", "petrol", "kerosene" or "none"
#   fuel_rate_lph_per_hp - liters of fuel per hour per HP
#   speed_knots          - (min, max) speed; only the mean of the two is used
#   ice_capacity_kg      - upper bound applied when computing ice needs
#   water_capacity_L     - upper bound applied when computing water needs
#   crew_range           - (min, max) crew size, both inclusive
#   trip_days_range      - (min, max) trip length in days, both inclusive
#   distance_range_km    - (min, max); fishing distance is sampled between
#                          0.3 * min and 0.8 * max
#   probability          - relative sampling weight (normalized after the table)
BOAT_TYPES = {
    # Industrial Multi-day (Large deep sea vessels)
    "IMUL": {
        "name": "Industrial Multi-day",
        "hp_range": (200, 500),
        "fuel_type": "diesel",
        "fuel_rate_lph_per_hp": 0.08,  # liters/hour per HP
        "speed_knots": (8, 12),
        "ice_capacity_kg": 10000,
        "water_capacity_L": 30000,
        "crew_range": (12, 20),
        "trip_days_range": (10, 45),
        "distance_range_km": (200, 1000),
        "probability": 0.15
    },
    
    # Motorized Day Boats (Medium vessels)
    "MDBT": {
        "name": "Motorized Day Boat",
        "hp_range": (60, 150),
        "fuel_type": "diesel",
        "fuel_rate_lph_per_hp": 0.10,
        "speed_knots": (10, 16),
        "ice_capacity_kg": 2000,
        "water_capacity_L": 1000,
        "crew_range": (6, 10),
        "trip_days_range": (1, 5),
        "distance_range_km": (50, 200),
        "probability": 0.25
    },
    
    # Outboard Fiberglass (Common small boats - petrol)
    "OBFR": {
        "name": "Outboard Fiberglass",
        "hp_range": (25, 90),
        "fuel_type": "petrol",
        "fuel_rate_lph_per_hp": 0.15,
        "speed_knots": (15, 25),
        "ice_capacity_kg": 500,
        "water_capacity_L": 200,
        "crew_range": (3, 6),
        "trip_days_range": (1, 2),
        "distance_range_km": (20, 120),
        "probability": 0.40
    },
    
    # Traditional Kerosene Boats
    "TKBO": {
        "name": "Traditional Kerosene Boat",
        "hp_range": (10, 30),
        "fuel_type": "kerosene",
        "fuel_rate_lph_per_hp": 0.20,
        "speed_knots": (6, 10),
        "ice_capacity_kg": 300,
        "water_capacity_L": 100,
        "crew_range": (2, 4),
        "trip_days_range": (1, 1),
        "distance_range_km": (10, 60),
        "probability": 0.15
    },
    
    # Non-motorized Traditional (no engine: zero HP, zero fuel rate)
    "NMTR": {
        "name": "Non-motorized Traditional",
        "hp_range": (0, 0),
        "fuel_type": "none",
        "fuel_rate_lph_per_hp": 0.0,
        "speed_knots": (2, 4),
        "ice_capacity_kg": 50,
        "water_capacity_L": 30,
        "crew_range": (1, 2),
        "trip_days_range": (1, 1),
        "distance_range_km": (2, 15),
        "probability": 0.05
    }
}

# Parallel sequences for weighted sampling with np.random.choice: the class
# codes in table order, and their weights rescaled so they sum to 1.
boat_keys = list(BOAT_TYPES)
boat_probs = [BOAT_TYPES[code]["probability"] for code in boat_keys]
boat_probs = np.array(boat_probs) / sum(boat_probs)

# ------------------- 4. HELPER FUNCTIONS -------------------
def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in km between two points.

    All four arguments are in decimal degrees. The haversine formula is
    evaluated on a sphere of radius 6371 km.
    """
    earth_radius_km = 6371.0

    phi1, lam1, phi2, lam2 = map(radians, (lat1, lon1, lat2, lon2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points
    hav = sin(delta_phi/2)**2 + cos(phi1) * cos(phi2) * sin(delta_lam/2)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1-hav))

    return earth_radius_km * central_angle

def generate_fishing_location(departure_lat, departure_lon, boat_type):
    """Pick a random fishing spot reachable by the given boat class.

    A distance is drawn uniformly between 0.3x the class's minimum and 0.8x
    its maximum ``distance_range_km``, a bearing is drawn uniformly over the
    full circle, and the destination is projected from the departure point on
    a sphere of radius 6371 km. The result is then clamped to the box
    lat 5.9..9.8, lon 79.5..81.9.

    NOTE: the returned distance is the sampled one. When the clamp moves the
    point, the distance no longer matches the returned coordinates.

    Args:
        departure_lat: Departure latitude in decimal degrees.
        departure_lon: Departure longitude in decimal degrees.
        boat_type: Key into ``BOAT_TYPES``.

    Returns:
        Tuple ``(fishing_lat, fishing_lon, distance_km)``.
    """
    shortest_km, longest_km = BOAT_TYPES[boat_type]["distance_range_km"]

    # Two draws, in this order: how far, then in which direction
    distance_km = np.random.uniform(shortest_km * 0.3, longest_km * 0.8)
    bearing = np.random.uniform(0, 2 * np.pi)

    # Angular distance travelled along the sphere
    angular = distance_km / 6371.0

    phi1 = radians(departure_lat)
    lam1 = radians(departure_lon)
    sin_phi1, cos_phi1 = math.sin(phi1), math.cos(phi1)
    sin_ang, cos_ang = math.sin(angular), math.cos(angular)

    # Destination point given start, bearing and angular distance
    phi2 = math.asin(sin_phi1 * cos_ang + cos_phi1 * sin_ang * math.cos(bearing))
    lam2 = lam1 + math.atan2(math.sin(bearing) * sin_ang * cos_phi1,
                             cos_ang - sin_phi1 * math.sin(phi2))

    # Back to degrees, then clamp into the bounding box
    fishing_lat = max(5.9, min(9.8, math.degrees(phi2)))
    fishing_lon = max(79.5, min(81.9, math.degrees(lam2)))

    return fishing_lat, fishing_lon, distance_km

def calculate_fuel_consumption(boat_type, engine_hp, distance_km, trip_days, weather_factor=1.0):
    """Estimate fuel burned and hours spent at sea for one trip.

    Hours are the round-trip transit time at the class's mean speed plus a
    random 6-10 fishing hours per day. Fuel is rate x HP x hours x weather,
    with the per-HP rate scaled by 0.9 for diesel, 1.0 for petrol and 1.1
    otherwise. IMUL/MDBT trips longer than one day add 5-15% auxiliary use,
    and every engine result gets a final ±8% variation.

    Args:
        boat_type: Key into ``BOAT_TYPES``.
        engine_hp: Engine horsepower.
        distance_km: One-way distance to the fishing spot.
        trip_days: Trip length in days.
        weather_factor: Multiplier applied to fuel use.

    Returns:
        Tuple ``(fuel_liters, total_hours)`` rounded to 2 and 1 decimals.
        Boats with fuel type "none" always report 0.0 liters.
    """
    config = BOAT_TYPES[boat_type]
    fuel_type = config["fuel_type"]

    # Mean speed in km/h (1 knot = 1.852 km/h)
    cruise_kmh = np.mean(config["speed_knots"]) * 1.852

    # Out and back, plus time spent fishing
    round_trip_hours = (distance_km * 2) / cruise_kmh
    hours_fishing = np.random.uniform(6, 10) * trip_days
    total_hours = round_trip_hours + hours_fishing

    # Guard: nothing to burn without an engine
    if fuel_type == "none":
        return 0.0, round(total_hours, 1)

    # Engine-type adjustment of the per-HP rate (anything else -> kerosene)
    rate_multiplier = {"diesel": 0.9, "petrol": 1.0}.get(fuel_type, 1.1)
    hourly_rate_per_hp = config["fuel_rate_lph_per_hp"] * rate_multiplier

    fuel_liters = hourly_rate_per_hp * engine_hp * total_hours * weather_factor

    # Auxiliary power draw on the larger boats when out overnight
    if boat_type in ["IMUL", "MDBT"] and trip_days > 1:
        fuel_liters += fuel_liters * np.random.uniform(0.05, 0.15)

    # Trip-to-trip variation (±8%)
    fuel_liters *= np.random.uniform(0.92, 1.08)

    return round(max(0, fuel_liters), 2), round(total_hours, 1)

def calculate_ice_requirement(boat_type, trip_days, crew_size, distance_km):
    """Estimate the ice to load for a trip, in kg.

    The base need is a per-crew-per-day allowance (10 kg IMUL, 7 kg MDBT,
    5 kg OBFR, 4 kg TKBO, 2 kg otherwise) times crew and days, scaled up by
    30% for every 500 km of distance. It is limited to the boat's
    ``ice_capacity_kg``, raised to at least 20 kg, and given ±10% random
    variation.

    The capacity limit is applied again after the random variation: without
    that second clamp a load already at capacity could come out up to 10%
    above what the boat can physically carry.

    Args:
        boat_type: Key into ``BOAT_TYPES``.
        trip_days: Trip length in days.
        crew_size: Number of crew.
        distance_km: One-way distance to the fishing spot.

    Returns:
        Ice in kg, rounded to 2 decimals, never above the boat's capacity.
    """
    capacity_kg = float(BOAT_TYPES[boat_type]["ice_capacity_kg"])

    # Base ice per crew per day (deep-sea classes need more)
    ice_per_crew_day = {
        "IMUL": 10.0,
        "MDBT": 7.0,
        "OBFR": 5.0,
        "TKBO": 4.0,
    }.get(boat_type, 2.0)  # NMTR and anything else

    # Distance factor (longer trips need more ice)
    distance_factor = 1.0 + (distance_km / 500) * 0.3

    needed_ice = ice_per_crew_day * crew_size * trip_days * distance_factor

    # Ensure within capacity, then enforce the minimum load
    needed_ice = min(needed_ice, capacity_kg)
    needed_ice = max(20.0, needed_ice)

    # Add random variation (±10%)
    needed_ice *= np.random.uniform(0.9, 1.1)

    # Re-apply the capacity limit so the variation cannot overshoot it
    needed_ice = min(needed_ice, capacity_kg)

    return round(needed_ice, 2)

def calculate_water_requirement(boat_type, trip_days, crew_size):
    """Estimate the fresh water to load for a trip, in liters.

    IMUL and MDBT boats are budgeted 15 L per person per day (drinking,
    cooking, cleaning); all other classes 3 L (drinking only). The total is
    limited to the boat's ``water_capacity_L``, raised to at least 10 L, and
    given ±15% random variation.

    The capacity limit is applied again after the random variation: without
    that second clamp a load already at capacity could come out up to 15%
    above what the boat can physically carry.

    Args:
        boat_type: Key into ``BOAT_TYPES``.
        trip_days: Trip length in days.
        crew_size: Number of crew.

    Returns:
        Water in liters, rounded to 2 decimals, never above the boat's
        capacity.
    """
    capacity_l = float(BOAT_TYPES[boat_type]["water_capacity_L"])

    # Water consumption per person per day (liters)
    if boat_type in ("IMUL", "MDBT"):
        # Drinking: 3L, Cooking: 5L, Cleaning: 7L = 15L total
        water_per_person_day = 15.0
    else:
        # Only drinking water: 3L
        water_per_person_day = 3.0

    needed_water = water_per_person_day * crew_size * trip_days

    # Ensure within capacity, then enforce the minimum load
    needed_water = min(needed_water, capacity_l)
    needed_water = max(10.0, needed_water)

    # Add random variation (±15%)
    needed_water *= np.random.uniform(0.85, 1.15)

    # Re-apply the capacity limit so the variation cannot overshoot it
    needed_water = min(needed_water, capacity_l)

    return round(needed_water, 2)

# ------------------- 5. GENERATE SINGLE TRIP RECORD -------------------
def generate_trip_record(trip_id, year=2024):
    """Simulate one fishing trip and return it as a flat record.

    A boat class is drawn by weight, then engine, crew, duration, port,
    fishing spot, month and weather are randomized. Fuel, ice and water
    quantities are computed from those and priced using the fuel table for
    a random day (1-28) of the chosen month.

    The order of random draws is significant: it determines the records
    produced under a fixed seed.

    Args:
        trip_id: Identifier stored in the record.
        year: Calendar year of the trip.

    Returns:
        Dict of features, quantities, costs, ratios and flags for the trip.
    """
    # --- Boat class and its randomized specification ---
    boat_type = np.random.choice(boat_keys, p=boat_probs)
    config = BOAT_TYPES[boat_type]
    fuel_kind = config["fuel_type"]

    hp_low, hp_high = config["hp_range"]
    crew_low, crew_high = config["crew_range"]
    days_low, days_high = config["trip_days_range"]
    engine_hp = np.random.randint(hp_low, hp_high + 1)
    crew_size = np.random.randint(crew_low, crew_high + 1)
    trip_days = np.random.randint(days_low, days_high + 1)

    # --- Departure port and fishing ground ---
    port = random.choice(PORTS)
    fishing_lat, fishing_lon, distance_km = generate_fishing_location(
        port["lat"], port["lon"], boat_type
    )

    # --- Month and sea conditions ---
    month = np.random.randint(1, 13)
    wind_kph = np.random.uniform(5, 30)          # km/h
    wave_height_m = np.random.uniform(0.3, 2.5)  # meters

    # Fuel penalty band by wind: calm / moderate / windy
    if wind_kph < 10:
        factor_band = (0.9, 1.0)
    elif wind_kph < 20:
        factor_band = (1.0, 1.15)
    else:
        factor_band = (1.15, 1.3)
    weather_factor = np.random.uniform(*factor_band)

    # --- Quantities ---
    fuel_liters, total_hours = calculate_fuel_consumption(
        boat_type, engine_hp, distance_km, trip_days, weather_factor
    )
    ice_kg = calculate_ice_requirement(boat_type, trip_days, crew_size, distance_km)
    water_liters = calculate_water_requirement(boat_type, trip_days, crew_size)

    # --- Prices on the trip date ---
    trip_date = datetime(year, month, np.random.randint(1, 29))
    petrol_price, diesel_price, kerosene_price = get_fuel_price_for_date(trip_date)

    # --- Costs ---
    # Boats without a recognised fuel get a zero price and therefore zero cost
    price_by_fuel = {
        "diesel": diesel_price,
        "petrol": petrol_price,
        "kerosene": kerosene_price,
    }
    fuel_price = price_by_fuel.get(fuel_kind, 0.0)
    fuel_cost = fuel_liters * fuel_price

    ice_cost = ice_kg * ICE_COST_PER_KG

    # Tank water (free) on multi-day IMUL/MDBT trips, purchased water otherwise
    uses_tank_water = boat_type in ("IMUL", "MDBT") and trip_days > 1
    water_unit_cost = TANK_WATER_COST if uses_tank_water else WATER_COST_PER_LITER
    water_cost = water_liters * water_unit_cost

    total_base_cost = fuel_cost + ice_cost + water_cost

    # --- Small helpers for the derived columns ---
    mean_knots = np.mean(config["speed_knots"])
    crew_days = crew_size * trip_days

    def cost_share(part):
        return round(part / total_base_cost, 3) if total_base_cost > 0 else 0

    def per_crew_day(quantity):
        return round(quantity / crew_days, 2) if crew_days > 0 else 0

    return {
        # Trip ID and Date
        "trip_id": trip_id,
        "trip_year": year,
        "trip_month": month,
        "trip_date": trip_date.strftime("%Y-%m-%d"),
        "season": "Northeast_Monsoon" if month in (11, 12, 1, 2, 3, 4) else "Southwest_Monsoon",

        # Boat Specifications (FEATURES)
        "boat_type": boat_type,
        "boat_name": config["name"],
        "engine_hp": engine_hp,
        "fuel_type": fuel_kind,
        "crew_size": crew_size,
        "ice_capacity_kg": config["ice_capacity_kg"],
        "water_capacity_L": config["water_capacity_L"],
        "avg_speed_knots": mean_knots,
        "avg_speed_kmh": mean_knots * 1.852,

        # Trip Parameters (FEATURES)
        "trip_days": trip_days,
        "trip_duration_category": "Single_Day" if trip_days == 1 else "Multi_Day",

        # Location Data (FEATURES)
        "port_name": port["name"],
        "region": port["region"],
        "departure_lat": port["lat"],
        "departure_lon": port["lon"],
        "fishing_lat": round(fishing_lat, 4),
        "fishing_lon": round(fishing_lon, 4),
        "distance_km": round(distance_km, 2),

        # Environmental Factors (FEATURES)
        "wind_kph": round(wind_kph, 1),
        "wave_height_m": round(wave_height_m, 2),
        "weather_factor": round(weather_factor, 2),

        # Calculated Metrics (FEATURES)
        "total_hours": total_hours,
        "fuel_per_hour": round(fuel_liters / total_hours, 2) if total_hours > 0 else 0,
        "fuel_per_km": round(fuel_liters / distance_km, 2) if distance_km > 0 else 0,

        # Resource Quantities (TARGETS for ML)
        "fuel_liters": round(fuel_liters, 2),
        "ice_kg": round(ice_kg, 2),
        "water_liters": round(water_liters, 2),

        # Cost Breakdown (TARGETS for ML)
        "fuel_price_per_liter": round(fuel_price, 2),
        "fuel_cost_lkr": round(fuel_cost, 2),
        "ice_cost_lkr": round(ice_cost, 2),
        "water_cost_lkr": round(water_cost, 2),
        "total_base_cost_lkr": round(total_base_cost, 2),

        # Cost Ratios (for analysis)
        "fuel_cost_share": cost_share(fuel_cost),
        "ice_cost_share": cost_share(ice_cost),
        "water_cost_share": cost_share(water_cost),

        # Derived Features
        "ice_per_crew_day": per_crew_day(ice_kg),
        "water_per_crew_day": per_crew_day(water_liters),

        # Flags
        "is_multi_day": 1 if trip_days > 1 else 0,
        "has_engine": 1 if fuel_kind != "none" else 0,
        "is_deep_sea": 1 if distance_km > 100 else 0,
    }

# ------------------- 6. GENERATE COMPLETE DATASET -------------------
def _print_dataset_overview(df):
    """Print boat-type mix and duration, distance and cost summaries for *df*."""
    print("\n" + "="*50)
    print("DATASET OVERVIEW")
    print("="*50)

    print(f"\nBoat Type Distribution:")
    for boat, count in df["boat_type"].value_counts().items():
        percentage = (count / len(df)) * 100
        print(f"  {boat}: {count} trips ({percentage:.1f}%)")

    print(f"\nTrip Duration:")
    print(f"  Average: {df['trip_days'].mean():.1f} days")
    print(f"  Range: {df['trip_days'].min()} to {df['trip_days'].max()} days")

    print(f"\nDistance Statistics:")
    print(f"  Average: {df['distance_km'].mean():.1f} km")
    print(f"  Min: {df['distance_km'].min():.1f} km")
    print(f"  Max: {df['distance_km'].max():.1f} km")

    print(f"\nCost Statistics (LKR):")
    cost_columns = (
        ("Fuel Cost:", "fuel_cost_lkr"),
        ("Ice Cost:", "ice_cost_lkr"),
        ("Water Cost:", "water_cost_lkr"),
        ("Total Cost:", "total_base_cost_lkr"),
    )
    for label, column in cost_columns:
        values = df[column]
        # Labels are padded to a common width so the Avg= columns line up
        print(f"  {label:<12}Avg={values.mean():,.0f}, Min={values.min():,.0f}, Max={values.max():,.0f}")


def generate_complete_dataset(num_trips=10000, years=None, test_split=0.2):
    """Generate the synthetic trip dataset, split it and write it to CSV.

    Both the NumPy and the stdlib random generators are seeded with 42 before
    any trip is generated. Trips are generated year by year; when ``num_trips``
    is not divisible by the number of years, the leftover trips go to the
    earliest years so the total is exactly ``num_trips``.

    The train/test split is positional, not shuffled: the test set is the
    last ``int(len(df) * test_split)`` rows, which come from the final
    year(s). With ``test_split=0`` the test set is empty and the training
    set holds every row.

    Files written to the current directory: smart_fisher_full_dataset.csv,
    smart_fisher_train.csv, smart_fisher_test.csv and smart_fisher_sample.csv
    (up to 1000 randomly sampled rows).

    Args:
        num_trips: Total number of trips to generate (at least 1).
        years: Iterable of years to spread the trips over. Defaults to
            2022 through 2025.
        test_split: Fraction of rows, between 0 and 1, held out as the
            test set.

    Returns:
        Tuple ``(df, train_df, test_df)`` of DataFrames.

    Raises:
        ValueError: If ``num_trips`` is below 1, ``years`` is empty, or
            ``test_split`` is outside 0..1.
    """
    # Default built per call rather than as a shared mutable default argument
    years = [2022, 2023, 2024, 2025] if years is None else list(years)
    if not years:
        raise ValueError("years must contain at least one year")
    if num_trips < 1:
        raise ValueError("num_trips must be at least 1")
    if not 0 <= test_split <= 1:
        raise ValueError("test_split must be between 0 and 1")

    print("="*70)
    print("SMART FISHER LANKA - DATASET GENERATION")
    print("="*70)

    np.random.seed(42)
    random.seed(42)

    # Spread the trips over the years; the first `leftover` years get one extra
    per_year, leftover = divmod(num_trips, len(years))

    all_trips = []
    trip_counter = 0
    for position, year in enumerate(years):
        trips_this_year = per_year + (1 if position < leftover else 0)
        print(f"\nGenerating {trips_this_year} trips for {year}...")

        for i in range(trips_this_year):
            all_trips.append(generate_trip_record(trip_counter, year))
            trip_counter += 1

            if (i + 1) % 1000 == 0:
                print(f"  Generated {i + 1} trips...")

    df = pd.DataFrame(all_trips)

    print(f"\n✓ Dataset generated with {len(df)} total records")
    print(f"  Columns: {len(df.columns)}")
    print(f"  Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

    _print_dataset_overview(df)

    # Split by position. Slicing from an explicit index (rather than with a
    # negative offset) keeps a test size of 0 from emptying the training set.
    test_size = int(len(df) * test_split)
    split_at = len(df) - test_size
    train_df = df.iloc[:split_at]
    test_df = df.iloc[split_at:]

    print(f"\n" + "="*50)
    print("DATASET SPLIT")
    print("="*50)
    print(f"  Training set: {len(train_df)} records ({len(train_df)/len(df)*100:.1f}%)")
    print(f"  Test set:     {len(test_df)} records ({len(test_df)/len(df)*100:.1f}%)")

    print(f"\n" + "="*50)
    print("SAVING DATASETS")
    print("="*50)

    df.to_csv("smart_fisher_full_dataset.csv", index=False)
    print("✓ Full dataset saved: smart_fisher_full_dataset.csv")

    train_df.to_csv("smart_fisher_train.csv", index=False)
    test_df.to_csv("smart_fisher_test.csv", index=False)
    print("✓ Training set saved: smart_fisher_train.csv")
    print("✓ Test set saved: smart_fisher_test.csv")

    # Small random sample for quick experiments
    sample_df = df.sample(min(1000, len(df)), random_state=42)
    sample_df.to_csv("smart_fisher_sample.csv", index=False)
    print(f"✓ Sample dataset saved: smart_fisher_sample.csv ({len(sample_df)} records)")

    return df, train_df, test_df

# ------------------- 7. PREPARE ML-READY DATASETS -------------------
def prepare_ml_datasets(df):
    """Build feature and target tables from the trip dataset and save them.

    The categorical columns boat_type, fuel_type and region are one-hot
    encoded with the first category of each dropped. Features keep the order
    of the planning-time list below, with each categorical replaced in place
    by its encoded columns; listed features missing from ``df`` are skipped.

    Files written to the current directory: ml_features.csv, ml_targets.csv
    and ml_combined_dataset.csv.

    Args:
        df: Trip dataset as produced by the generator.

    Returns:
        Tuple ``(X, y)`` of feature and target DataFrames.
    """
    rule = "="*50
    print("\n" + rule)
    print("PREPARING ML-READY DATASETS")
    print(rule)

    # Inputs: what is known when a trip is being planned
    planning_features = [
        'boat_type', 'engine_hp', 'fuel_type', 'crew_size',
        'ice_capacity_kg', 'water_capacity_L', 'avg_speed_kmh',
        'trip_days', 'trip_month', 'distance_km',
        'wind_kph', 'wave_height_m', 'total_hours',
        'region', 'is_multi_day', 'has_engine'
    ]

    # Outputs: quantities and costs to predict
    target_columns = [
        'fuel_liters', 'ice_kg', 'water_liters',
        'fuel_cost_lkr', 'ice_cost_lkr', 'water_cost_lkr'
    ]

    categorical_cols = ['boat_type', 'fuel_type', 'region']
    df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
    encoded_names = df_encoded.columns

    # Expand each categorical into its dummy columns; keep the rest if present
    all_features = []
    for name in planning_features:
        if name in categorical_cols:
            prefix = name + '_'
            all_features += [c for c in encoded_names if c.startswith(prefix)]
        elif name in encoded_names:
            all_features.append(name)

    X = df_encoded[all_features]
    y = df_encoded[target_columns]

    print(f"  Features shape: {X.shape}")
    print(f"  Targets shape: {y.shape}")
    print(f"  Number of features: {len(all_features)}")
    print(f"  Number of targets: {len(target_columns)}")

    # Persist features, targets, and the two side by side
    X.to_csv("ml_features.csv", index=False)
    y.to_csv("ml_targets.csv", index=False)
    pd.concat([X, y], axis=1).to_csv("ml_combined_dataset.csv", index=False)

    print("\n✓ ML-ready datasets saved:")
    print("  - ml_features.csv: Features (X) for ML training")
    print("  - ml_targets.csv: Targets (y) for ML training")
    print("  - ml_combined_dataset.csv: Combined features + targets")

    return X, y

# ------------------- 8. CREATE PREDICTION TEMPLATE -------------------
def create_prediction_template():
    """Write an example prediction request to disk and return it.

    The example is saved as prediction_input_template.json (2-space indented)
    in the current directory.

    Returns:
        The example request as a dict.
    """
    import json

    departure_point = {"lat": 7.2090, "lon": 79.8350}
    fishing_point = {"lat": 7.5090, "lon": 80.1350}
    weather = {"wind_kph": 15, "wave_height_m": 1.2}

    template = {
        "boat_type": "OBFR",
        "engine_hp": 60,
        "fuel_type": "petrol",
        "crew_size": 4,
        "trip_days": 2,
        "current_location": departure_point,
        "target_location": fishing_point,
        "month": 6,
        "weather_conditions": weather,
    }

    # Serialize first, then write the text in one go
    serialized = json.dumps(template, indent=2)
    with open("prediction_input_template.json", "w") as out_file:
        out_file.write(serialized)

    print("\n✓ Prediction input template saved: prediction_input_template.json")
    print("  This matches what your mobile app will send to the backend API")

    return template

# ------------------- 9. MAIN EXECUTION -------------------
# Script entry point: generate the dataset, derive the ML tables, write the
# API input template, then print a summary of the files produced.
if __name__ == "__main__":
    
    # Generate complete dataset (adjust num_trips as needed)
    # Returns the full frame plus its positional train/test split.
    full_df, train_df, test_df = generate_complete_dataset(
        num_trips=20000,  # 20,000 trips total
        years=[2022, 2023, 2024, 2025],
        test_split=0.2
    )
    
    # Prepare ML-ready datasets (built from the full frame, not the split)
    X, y = prepare_ml_datasets(full_df)
    
    # Create prediction template
    template = create_prediction_template()
    
    print("\n" + "="*70)
    print("DATASET GENERATION COMPLETE!")
    print("="*70)
    
    # Files 1-4 are written by generate_complete_dataset()
    print("\nGenerated Files:")
    print("  1. smart_fisher_full_dataset.csv - Complete dataset (all records)")
    print("  2. smart_fisher_train.csv - Training set (80% of data)")
    print("  3. smart_fisher_test.csv - Test set (20% of data)")
    print("  4. smart_fisher_sample.csv - Sample (1000 records for quick testing)")
    

Fetching fuel prices from Ceypetco website...
✓ Successfully fetched 170 fuel price records

Current Fuel Prices (LKR/Liter):
  Petrol (92 Octane): 294.0
  Diesel: 277.0
  Kerosene: 180.0
SMART FISHER LANKA - DATASET GENERATION

Generating 5000 trips for 2022...
  Generated 1000 trips...
  Generated 2000 trips...
  Generated 3000 trips...
  Generated 4000 trips...
  Generated 5000 trips...

Generating 5000 trips for 2023...
  Generated 1000 trips...
  Generated 2000 trips...
  Generated 3000 trips...
  Generated 4000 trips...
  Generated 5000 trips...

Generating 5000 trips for 2024...
  Generated 1000 trips...
  Generated 2000 trips...
  Generated 3000 trips...
  Generated 4000 trips...
  Generated 5000 trips...

Generating 5000 trips for 2025...
  Generated 1000 trips...
  Generated 2000 trips...
  Generated 3000 trips...
  Generated 4000 trips...
  Generated 5000 trips...

✓ Dataset generated with 20000 total records
  Columns: 45
  Memory usage: 15.18 MB

DATASET OVERVIEW

Boat Typ

In [None]:
    # NOTE(review): these indented lines appear to continue the
    # `if __name__ == "__main__":` summary (the numbering resumes at 5);
    # confirm they belong to the same cell as the block above.
    # Files 5-7 are written by prepare_ml_datasets(), file 8 by
    # create_prediction_template().
    print("  5. ml_features.csv - ML features (X) for training")
    print("  6. ml_targets.csv - ML targets (y) for training")
    print("  7. ml_combined_dataset.csv - Combined features + targets")
    print("  8. prediction_input_template.json - API input format")
    
    # Suggested follow-up work, printed for the reader of the run log
    print("\nNext Steps:")
    print("  1. Train ML model using ml_features.csv and ml_targets.csv")
    print("  2. Export trained model (pickle, ONNX, or TensorFlow format)")
    print("  3. Integrate model with NestJS backend")
    print("  4. Implement prediction API endpoint")
    print("  5. Connect mobile app to the API")
    
    print("\n" + "="*70)

  5. ml_features.csv - ML features (X) for training
  6. ml_targets.csv - ML targets (y) for training
  7. ml_combined_dataset.csv - Combined features + targets
  8. prediction_input_template.json - API input format

Next Steps:
  1. Train ML model using ml_features.csv and ml_targets.csv
  2. Export trained model (pickle, ONNX, or TensorFlow format)
  3. Integrate model with NestJS backend
  4. Implement prediction API endpoint
  5. Connect mobile app to the API

