# Merge and Override of final_lstm_dataset_cleaned.parquet


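- Merges the previous parquet (Budapest/Vas data) with the zero-shot ground-truth parquet (`zeroshot_ground_truth_with_coords.parquet` in the code below).
- See the previous code for the pre-processing steps.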


In [None]:
# -*- coding: utf-8 -*-
"""
merge_datasets.py
Author: Samantha Lee

Purpose: Merge "Zero-Shot" data (with coords) and "Budapest/Vas" data.
Crucially, it injects missing coordinates for Vas/Budapest so the map works.
"""

from pathlib import Path
import pandas as pd
import sys
import os
import numpy as np

# --- CONFIGURATION ---
# Every path is relative to the working directory, which is expected to be
# the project root when the script is launched.
DATA_DIR = Path("data")

# Input 1: zero-shot ground truth that already carries coordinates (output of
# the pre-processing step). The script tolerates this file being absent.
ZEROSHOT_PATH = DATA_DIR.joinpath("zeroshot_ground_truth_with_coords.parquet")

# Input 2: the Budapest/Vas historical data. The uploaded file is expected to
# have been renamed to this name.
BUDAPEST_VAS_PATH = DATA_DIR.joinpath("budapest_vas.parquet")

# Output: the merged dataset (per the original note, the file the app reads).
OUTPUT_PATH = DATA_DIR.joinpath("final_lstm_dataset_cleaned.parquet")

# --- MANUAL COORDINATE FALLBACK ---
# Safety net for regions that may appear in the Budapest/Vas file without
# lat/lon columns. Rows are (region name, latitude, longitude); the order is
# preserved in MANUAL_COORDS because lookups return the first matching entry.
_REGION_COORD_ROWS = (
    ("Budapest", 47.4979, 19.0402),
    ("Vas", 47.0353, 16.7665),
    ("Fejer", 47.1625, 18.4048),
    ("Baranya", 46.0727, 18.2323),
    ("Pest", 47.4167, 19.3333),
    ("Tolna", 46.5000, 18.5000),
    ("Gyor", 47.6875, 17.6504),
    ("Somogy", 46.5000, 17.5000),
    ("Zala", 46.7500, 16.8333),
    ("Heves", 47.9026, 20.3733),
    ("Borsod", 48.1000, 20.7833),
    ("Szabolcs", 48.0000, 22.0000),
    ("Hajdu", 47.5316, 21.6273),
    ("Bekes", 46.6736, 21.0737),
    ("Csongrad", 46.2530, 20.1482),
    ("Bacs", 46.5000, 19.5000),
    ("Jasz", 47.5000, 20.0000),
    ("Nograd", 48.0000, 19.5000),
    ("Komarom", 47.5862, 18.0172),
    ("Veszprem", 47.1000, 17.9000),
)
MANUAL_COORDS = {
    region: {"lat": lat, "lon": lon}
    for region, lat, lon in _REGION_COORD_ROWS
}

print("--- STARTING CUSTOM MERGE ---")

# Step 1 - zero-shot frame (the source that already has coordinates).
# It starts out empty so the merge can still run when the file is absent or
# cannot be read.
df_zero = pd.DataFrame()
if not ZEROSHOT_PATH.exists():
    print(f"Warning: {ZEROSHOT_PATH.name} not found. Will rely on manual coordinates.")
else:
    print(f"Loading Zero-Shot data: {ZEROSHOT_PATH.name}")
    try:
        df_zero = pd.read_parquet(ZEROSHOT_PATH, engine='fastparquet')
        print(f"   Loaded {len(df_zero)} rows with coordinates.")
    except Exception as exc:
        # Best effort: a broken zero-shot file must not abort the merge.
        print(f"   Error loading zero-shot data: {exc}. Proceeding without it.")

# Step 2 - Budapest/Vas frame (the historical data). This input is mandatory:
# the script exits with status 1 when it cannot be located or read.
if not BUDAPEST_VAS_PATH.exists():
    # The upload may still carry its original, un-renamed file name.
    possible_name = DATA_DIR / "final_lstm_dataset_cleaned budapest_vas.parquet"
    if not possible_name.exists():
        print(f"CRITICAL ERROR: Could not find 'budapest_vas.parquet' in {DATA_DIR}")
        print("Please rename your uploaded file to 'budapest_vas.parquet' and place it in the 'data' folder.")
        sys.exit(1)
    print(f"Found file with original name: {possible_name.name}")
    BUDAPEST_VAS_PATH = possible_name

print(f"Loading Budapest/Vas data: {BUDAPEST_VAS_PATH.name}")
try:
    df_bv = pd.read_parquet(BUDAPEST_VAS_PATH, engine='fastparquet')
    print(f"   Loaded {len(df_bv)} historical rows.")
except Exception as exc:
    print(f"CRITICAL ERROR reading parquet file: {exc}")
    sys.exit(1)

# Step 3 - prepare the Budapest/Vas frame for coordinate backfilling.
print("Backfilling coordinates for Budapest/Vas data...")

# Guarantee both coordinate columns exist; absent ones start out all-NaN.
for coord_col in ('latitude', 'longitude'):
    if coord_col not in df_bv.columns:
        df_bv[coord_col] = np.nan

def get_coords(field_id, coords_table=None):
    """Look up fallback coordinates for the region embedded in a field id.

    The region is expected to appear as a pipe-delimited token inside the id
    (format example: ``"vtx|Vas|rapeseed..."``). Matching is case-insensitive
    and the first table entry that matches wins.

    Args:
        field_id: Field identifier. Non-string values (``None``, ``NaN``, ...)
            cannot contain a region token and yield ``(nan, nan)``.
        coords_table: Optional mapping of region name to a dict with ``"lat"``
            and ``"lon"`` keys. Defaults to the module-level ``MANUAL_COORDS``.

    Returns:
        A ``(lat, lon)`` tuple, or ``(np.nan, np.nan)`` when no region in the
        table matches.
    """
    # Explicit guard instead of a blanket ``except``: missing ids are the one
    # expected failure mode, and any other error should surface.
    if not isinstance(field_id, str):
        return np.nan, np.nan

    table = MANUAL_COORDS if coords_table is None else coords_table

    # Lower-case the id once rather than once per table entry.
    haystack = field_id.lower()
    for key, coords in table.items():
        if f"|{key.lower()}|" in haystack:
            return coords['lat'], coords['lon']
    return np.nan, np.nan

# Only rows that lack a coordinate are processed. A row counts as missing when
# EITHER value is absent: checking latitude alone would skip rows that have a
# latitude but no longitude.
lat_missing = df_bv['latitude'].isna()
lon_missing = df_bv['longitude'].isna()
mask_missing = lat_missing | lon_missing
missing_count = mask_missing.sum()

if missing_count > 0:
    print(f"   Found {missing_count} rows missing coords. Applying manual fix...")

    # Resolve each distinct field_id once instead of once per row.
    unique_missing_ids = df_bv.loc[mask_missing, 'field_id'].unique()
    print(f"   Unique fields to fix: {len(unique_missing_ids)}")

    # field_id -> (lat, lon), then split into one scalar lookup per column so
    # Series.map yields plain float columns.
    id_to_coords = {fid: get_coords(fid) for fid in unique_missing_ids}
    lat_by_id = {fid: pair[0] for fid, pair in id_to_coords.items()}
    lon_by_id = {fid: pair[1] for fid, pair in id_to_coords.items()}

    # Fill only the cells that are actually empty, so a coordinate that is
    # already present is never overwritten by the manual fallback.
    if lat_missing.any():
        df_bv.loc[lat_missing, 'latitude'] = df_bv.loc[lat_missing, 'field_id'].map(lat_by_id)
    if lon_missing.any():
        df_bv.loc[lon_missing, 'longitude'] = df_bv.loc[lon_missing, 'field_id'].map(lon_by_id)

    # Check results
    still_missing = (df_bv['latitude'].isna() | df_bv['longitude'].isna()).sum()
    print(f"   Fixed {missing_count - still_missing} rows.")
    if still_missing > 0:
        print(f"   Warning: {still_missing} rows still lack coordinates (Region name not in manual list).")
else:
    print("   All rows already have coordinates!")

# 4. Merge
print("Merging datasets...")
# The frames are stacked with the zero-shot rows first. When both sources hold
# a row for the same (field_id, date), the zero-shot row must win, so the
# FIRST occurrence is kept; keep='last' would keep the Budapest/Vas row.
df_final = pd.concat([df_zero, df_bv], ignore_index=True)
df_final = df_final.drop_duplicates(subset=['field_id', 'date'], keep='first')

# 5. Save
print(f"Saving FINAL merged dataset ({len(df_final)} rows) to: {OUTPUT_PATH}")
df_final.to_parquet(OUTPUT_PATH, engine='fastparquet', index=False)

print("--- SUCCESS! Vas and Budapest are restored with coordinates. ---")
