# Merge and Overwrite of final_lstm_dataset_cleaned.parquet


- Merges the previous parquet with zeroshot_ground_truth_with_coords.parquet (the ground-truth data with coordinates).
- See the previous code for the pre-processing steps.


In [None]:
!pip install fastparquet pyarrow

Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading fastparquet-2024.11.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m55.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fastparquet
Successfully installed fastparquet-2024.11.0


In [None]:
# -*- coding: utf-8 -*-
"""
merge_datasets.py
Author: Samantha Lee

This script merges the old 'final_lstm_dataset_cleaned.parquet' (original data)
with the new 'zeroshot_ground_truth_with_coords.parquet' (new data/coordinates)
and overwrites the original file after deduplication.

FIX: Backfills missing 'latitude' and 'longitude' by exact 'field_id' match
against rows that already carry coordinates (in practice, the new data).
Rows whose 'field_id' has no such match keep null coordinates.
"""

from pathlib import Path
import pandas as pd
import os
import shutil
import sys

# Install fastparquet if not already installed.
# Every parquet read/write below passes engine='fastparquet', so the package
# must be importable before the merge starts.
try:
    import fastparquet
except ImportError:
    # Local import: only needed on this rarely-taken install path.
    import subprocess

    print("fastparquet not found, installing...")
    # `%pip` is IPython-only syntax and a SyntaxError under plain Python.
    # Running pip through the current interpreter works both in a notebook
    # kernel and as a regular script, and installs into the same environment.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "fastparquet"])
    print("fastparquet installed. Please re-run the cell.")
    sys.exit(1)  # Exit to force re-run after installation

# --- Path Setup ---
# Files are resolved relative to the current working directory
# (this used to be Path("data")).
DATA_DIR = Path(".")

# --- Define File Names ---
ORIGINAL_DATA_NAME = "final_lstm_dataset_cleaned.parquet"
NEW_DATA_NAME = "zeroshot_ground_truth_with_coords.parquet"  # New data file
BACKUP_DATA_NAME = "final_lstm_dataset_cleaned.BACKUP.parquet"

# --- Define Full Paths ---
ORIGINAL_DATA_PATH, NEW_DATA_PATH, BACKUP_PATH = (
    DATA_DIR.joinpath(file_name)
    for file_name in (ORIGINAL_DATA_NAME, NEW_DATA_NAME, BACKUP_DATA_NAME)
)
# The merged result is written over the original cleaned file.
OUTPUT_PATH = ORIGINAL_DATA_PATH

print("--- Starting Dataset Merge ---")

# --- 0. SAFETY BACKUP ---
# Copy the current original file aside before anything can overwrite it.
# A missing original only produces a warning; a failed copy aborts the run.
# NOTE(review): the backup path is fixed, so each run replaces the previous
# backup with whatever the original file contains at that moment.
try:
    if not os.path.exists(ORIGINAL_DATA_PATH):
        print(f"Warning: Original file not found at {ORIGINAL_DATA_PATH}. Skipping backup.")
    else:
        shutil.copyfile(ORIGINAL_DATA_PATH, BACKUP_PATH)
        print(f"Backup created: {BACKUP_PATH.name}")
except Exception as backup_error:
    print(f"Error creating backup: {backup_error}. Aborting.")
    sys.exit(1)

# --- 1. Load Original Data ---
# df_original stays None when the file is absent or cannot be read;
# later steps check for None before using it.
df_original = None
try:
    if os.path.exists(ORIGINAL_DATA_PATH):
        print(f"\nLoading original data from: {ORIGINAL_DATA_PATH.name}")
        df_original = pd.read_parquet(
            ORIGINAL_DATA_PATH,
            engine='fastparquet',
        )
        print(f"   Original shape: {df_original.shape}")
except Exception as load_error:
    # Despite the wording of the message, the fallback value is None.
    print(f"Error loading original data: {load_error}. Defaulting to empty DataFrame.")
    df_original = None

# --- 2. Load New Data (With Coordinates) ---
# df_new stays None when the file is absent or cannot be read.
df_new = None
try:
    if os.path.exists(NEW_DATA_PATH):
        print(f"\nLoading new data from: {NEW_DATA_PATH.name}")
        df_new = pd.read_parquet(
            NEW_DATA_PATH,
            engine='fastparquet',
        )
        print(f"   New data shape: {df_new.shape}")
except Exception as load_error:
    print(f"Error loading new data: {load_error}. Halting Merge.")
    df_new = None

# The merge cannot run without the new data, so stop here if it is unusable.
if df_new is None or df_new.empty:
    for halt_line in (
        "\n--- Merge Halted ---",
        "New dataset failed to load or is empty. Cannot proceed with merge/backfill.",
    ):
        print(halt_line)
    sys.exit(1)

# --- 3. COORDINATE FIX / MERGE ---
# The new 'field_id' contains coordinates (e.g., vtx|Fejer|...|+X+Y)
# The old 'field_id' may not (e.g., vtx|Fejer|...)
#
# NOTE(review): the backfill below matches 'field_id' values exactly. If the
# old ids really lack the coordinate suffix, no historical row will find a
# match and its coordinates stay null. Joining the two formats needs a rule
# for deriving one id from the other, which this script does not define.
COORD_COLS = ['latitude', 'longitude']
KEY_COLS = ['field_id', 'date']


def _backfill_missing_coords(df):
    """Fill missing coordinates from rows that share the same 'field_id'.

    A row is a fill target when either coordinate is null. The source for a
    'field_id' is the first row in ``df`` that has that id and both
    coordinates present. Both coordinates are copied together so a filled row
    never mixes values from two places.

    Args:
        df: DataFrame with 'field_id', 'latitude' and 'longitude' columns.
            It is modified in place.

    Returns:
        The same DataFrame object, for convenient reassignment.
    """
    lookup = (
        df.dropna(subset=COORD_COLS + ['field_id'])
        .drop_duplicates(subset=['field_id'])
        .set_index('field_id')[COORD_COLS]
    )
    needs_fill = df[COORD_COLS].isna().any(axis=1)
    fillable = needs_fill & df['field_id'].isin(lookup.index)
    if not fillable.any():
        return df

    matched_ids = df.loc[fillable, 'field_id']
    for col in COORD_COLS:
        # Assign positionally (to_numpy) so the result does not depend on
        # the uniqueness of df's index.
        df.loc[fillable, col] = matched_ids.map(lookup[col]).to_numpy()
    return df


# The merge keys and coordinates must exist in the new data.
missing_new_cols = [col for col in KEY_COLS + COORD_COLS if col not in df_new.columns]
if missing_new_cols:
    print("\n--- Merge Halted ---")
    print(f"New dataset is missing required columns: {missing_new_cols}")
    sys.exit(1)

# If the original file exists but could not be read, do not replace it with
# the new data alone: that would silently drop the historical records.
if df_original is None and os.path.exists(ORIGINAL_DATA_PATH):
    print("\n--- Merge Halted ---")
    print("Merge was skipped because necessary dataframes failed to load or were empty.")
    sys.exit(1)

if df_original is not None and not df_original.empty:
    print("\nApplying Coordinate Fix to original data...")

    # Add only the coordinate columns that are actually absent, so an
    # existing column is never wiped. Float NaN (rather than pd.NA) keeps the
    # new columns float64 instead of object dtype.
    # Presumably the new data's coordinates are numeric - TODO confirm.
    absent_cols = [col for col in COORD_COLS if col not in df_original.columns]
    if absent_cols:
        print("   Original data missing 'latitude'/'longitude' columns. Adding them.")
        for col in absent_cols:
            df_original[col] = float("nan")

    frames_to_merge = [df_original, df_new]
else:
    # No original file (or an empty one): the new data is the whole dataset.
    print("\nNo original data to merge. Using new data only.")
    frames_to_merge = [df_new]

# --- Combine and De-duplicate ---
print("\nCombining dataframes...")
df_combined = pd.concat(frames_to_merge, ignore_index=True)
print(f"   Shape before deduplication: {df_combined.shape}")

# Keep the 'last' entry: the new data is concatenated last, so it wins when
# 'field_id' and 'date' are identical.
df_combined = df_combined.drop_duplicates(subset=KEY_COLS, keep='last').reset_index(drop=True)
print(f"   Shape after deduplication: {df_combined.shape}")
print(f"   Total unique fields in new dataset: {df_combined['field_id'].nunique()}")

# --- Final backfill ---
# Rows still missing coordinates borrow them from any row with the same
# 'field_id' that has them.
print("\nFinal coordinate backfill...")
df_combined = _backfill_missing_coords(df_combined)

missing_coords_after = int(df_combined[COORD_COLS].isna().any(axis=1).sum())
print(f"   Rows still missing coordinates: {missing_coords_after}")

if missing_coords_after > 0:
    print("Warning: Some historical fields still lack coordinates. These may not plot correctly.")

# --- 4. Save Overwriting File ---
try:
    print(f"\nSaving merged file (overwriting): {OUTPUT_PATH.name}")
    df_combined.to_parquet(OUTPUT_PATH, engine='fastparquet', index=False)
    print("--- Merge Complete ---")
except Exception as e:
    print(f"Error saving merged file: {e}. Aborting.")
    # Stop here so later steps do not report on a stale or partial file.
    sys.exit(1)

# --- 5. VERIFICATION ---
# Re-read the written file and report its size and how many coordinate
# values are null. Any failure is reported and does not raise.
print("\n--- Verification Step ---")
try:
    df_final = pd.read_parquet(OUTPUT_PATH, engine='fastparquet')
    print(f"\nSuccess! Final file loaded: {OUTPUT_PATH.name}")
    print(f"Final total rows: {df_final.shape[0]}")
    # Null counts show whether the coordinate columns actually hold data.
    lat_nulls = df_final['latitude'].isna().sum()
    lon_nulls = df_final['longitude'].isna().sum()
    print(f"Null Latitude count: {lat_nulls}")
    print(f"Null Longitude count: {lon_nulls}")
    print(f"Final columns (checking for coordinates): {df_final.columns[-5:].tolist()}")
except Exception as verify_error:
    print(f"Verification Failed: Could not load the final file. Error: {verify_error}")

--- Starting Dataset Merge ---
Backup created: final_lstm_dataset_cleaned.BACKUP.parquet

Loading original data from: final_lstm_dataset_cleaned.parquet
   Original shape: (1662245, 32)

Loading new data from: zeroshot_ground_truth_with_coords.parquet
   New data shape: (5808, 34)

Applying Coordinate Fix to original data...
   Original data missing 'latitude'/'longitude' columns. Adding them.


  df_combined = pd.concat([df_original, df_new], ignore_index=True)
  df_combined = pd.concat([df_original, df_new], ignore_index=True)



Combining dataframes...
   Shape before deduplication: (1668053, 34)
   Shape after deduplication: (1668053, 34)
   Total unique fields in new dataset: 1219

Final coordinate backfill...
   Rows still missing coordinates: 1662245

Saving merged file (overwriting): final_lstm_dataset_cleaned.parquet
--- Merge Complete ---

--- Verification Step ---

Success! Final file loaded: final_lstm_dataset_cleaned.parquet
Final total rows: 1668053
Null Latitude count: 1662245
Null Longitude count: 1662245
Final columns (checking for coordinates): ['heat_stress', 'cold_stress', 'drought_stress', 'latitude', 'longitude']
