# Phase 1: Data Preparation & Feature Engineering

**Goal:** To load our five pre-processed crop datasets and systematically engineer all the necessary variables for our analysis, as outlined in the project plan. This includes calculating yield anomalies and creating both summary and extreme stressor variables. The final, enriched datasets will be saved to a new directory.

In [None]:
# Cell 1: The Corrected and Simplified Feature Engineering Pipeline
import pandas as pd
import numpy as np
from scipy.signal import detrend
import os

# --- Configuration ---
# Maps each crop to its input CSV and to the name of its yield column.
file_info = {
    'maize': {'path': '../data-cherry-pick/maize_ITnorth_core42_1982_2016_allstressors_with_monthly.csv', 'yield_col': 'yield_maize'},
    'rice': {'path': '../data-cherry-pick/rice_ITnorth_core41_1982_2016_allstressors_with_monthly.csv', 'yield_col': 'yield_rice'},
    'soybean': {'path': '../data-cherry-pick/soybean_ITnorth_core41_1982_2016_allstressors_with_monthly.csv', 'yield_col': 'yield_soybean'},
    'wheat_spring': {'path': '../data-cherry-pick/wheat_spring_ITnorth_core41_1982_2016_allstressors_with_monthly.csv', 'yield_col': 'yield_wheat_spring'},
    'wheat_winter': {'path': '../data-cherry-pick/wheat_winter_ITnorth_core41_1982_2016_allstressors_with_monthly.csv', 'yield_col': 'yield_wheat_winter'}
}

output_dir = 'data/analysis_ready/feature_engineered/'
os.makedirs(output_dir, exist_ok=True)


def detrend_against_year(yields, years):
    """Return the residuals of a least-squares linear fit of yield on year.

    For a gap-free series already sorted by year this is the same linear
    detrending as ``scipy.signal.detrend``. Unlike that function, the fit
    uses the actual year values, so the result does not depend on row order
    and stays correct when years are missing. Non-finite observations are
    excluded from the fit and come back as NaN.

    Args:
        yields: 1-D sequence of yield values for a single grid cell.
        years: 1-D sequence of the matching years, same length as ``yields``.

    Returns:
        A float ``numpy.ndarray`` of anomalies aligned with the inputs.
    """
    y = np.asarray(yields, dtype=float)
    t = np.asarray(years, dtype=float)
    residuals = np.full(y.shape, np.nan)

    usable = np.isfinite(y) & np.isfinite(t)
    if not usable.any():
        return residuals

    # Centre the years so the fit is well conditioned (raw values are ~2000).
    t_centered = t[usable] - t[usable].mean()
    if np.ptp(t_centered) == 0:
        # A single distinct year cannot define a slope; remove the mean only.
        residuals[usable] = y[usable] - y[usable].mean()
        return residuals

    slope, intercept = np.polyfit(t_centered, y[usable], 1)
    residuals[usable] = y[usable] - (slope * t_centered + intercept)
    return residuals


def engineer_features(df, yield_col):
    """Add the analysis variables to one crop dataset and reorder its columns.

    Adds ``yield_anomaly`` (linear-detrended yield per grid cell),
    ``yield_anomaly_diff`` (difference from the chronologically previous
    record of the same cell), ``gs_solar_rad_w_m2`` and ``cell_id``.

    Args:
        df: Frame with ``lat``, ``lon``, ``year``, ``solar_radiation`` and
            ``yield_col`` columns, plus monthly ``solar_radiation_*`` columns.
        yield_col: Name of the yield column in ``df``.

    Returns:
        A new DataFrame in the original row order, with the identifier and
        yield columns first and all remaining columns after them.

    Raises:
        KeyError: If a required column is missing from ``df``.
    """
    # Work on a copy with a unique positional index so that the label-based
    # assignments below are unambiguous.
    df = df.reset_index(drop=True)
    cell_keys = ['lat', 'lon']

    # 2. Calculate Yield Anomalies
    anomaly = pd.Series(np.nan, index=df.index, dtype=float)
    for _, cell in df.groupby(cell_keys, sort=False):
        anomaly.loc[cell.index] = detrend_against_year(cell[yield_col], cell['year'])
    df['yield_anomaly'] = anomaly

    # Difference in chronological order; the result is aligned back to the
    # original rows through the index. Across a missing year this is still the
    # difference from the previous available record.
    chronological = df.sort_values('year', kind='stable')
    df['yield_anomaly_diff'] = chronological.groupby(cell_keys)[yield_col].diff()

    # 3. Create the ONE new useful stressor column (W/m^2)
    # The 'solar_radiation' column is the sum of Joules for the growing season.
    # The number of monthly columns gives the season length used to average it
    # (30.44 days is the mean length of a calendar month).
    num_months = len([col for col in df.columns if 'solar_radiation_' in col])
    if num_months == 0:
        # Dividing by a zero-length season would silently produce inf values.
        print("WARNING: no monthly 'solar_radiation_' columns found; gs_solar_rad_w_m2 set to NaN")
        df['gs_solar_rad_w_m2'] = np.nan
    else:
        seconds_in_season = num_months * 30.44 * 24 * 3600
        df['gs_solar_rad_w_m2'] = df['solar_radiation'] / seconds_in_season

    # 4. Add a unique cell_id for fixed effects modeling
    df['cell_id'] = df.groupby(cell_keys).ngroup()

    # 5. Put the identifier and yield columns first, everything else after.
    leading = ['cell_id', 'lat', 'lon', 'year', yield_col, 'yield_anomaly', 'yield_anomaly_diff']
    trailing = [col for col in df.columns if col not in leading]
    return df[leading + trailing]


print("--- Starting Corrected Feature Engineering for All Crops ---")

# --- Main Loop to Process Each Crop Dataset ---
for crop_name, info in file_info.items():
    print(f"\n--- Processing: {crop_name.upper()} ---")

    try:
        # 1. Load the Core Dataset and engineer its features (steps 2-5)
        df = engineer_features(pd.read_csv(info['path']), info['yield_col'])

        # 6. Save the Enriched and Cleaned Dataset
        output_path = os.path.join(output_dir, f'{crop_name}_feature_engineered.csv')
        df.to_csv(output_path, index=False)

        print(f"Successfully processed and saved enriched dataset to: {output_path}")
        print(f"Final columns: {df.columns.tolist()}")

    except FileNotFoundError:
        print(f"ERROR: File not found: {info['path']}")
    except Exception as e:
        # Best-effort batch: report the failure and continue with the next crop.
        print(f"An error occurred while processing {crop_name}: {e}")

print("\n--- All feature engineering complete. ---")

--- Starting Corrected Feature Engineering for All Crops ---

--- Processing: MAIZE ---
Successfully processed and saved enriched dataset to: data/analysis_ready/feature_engineered/maize_feature_engineered.csv
Final columns: ['cell_id', 'lat', 'lon', 'year', 'yield_maize', 'yield_anomaly', 'yield_anomaly_diff', 'temperature', 'precipitation', 'soil_water', 'solar_radiation', 'potential_evaporation', 'temperature_May', 'temperature_Jun', 'temperature_Jul', 'temperature_Aug', 'temperature_Sep', 'precipitation_May', 'precipitation_Jun', 'precipitation_Jul', 'precipitation_Aug', 'precipitation_Sep', 'soil_water_May', 'soil_water_Jun', 'soil_water_Jul', 'soil_water_Aug', 'soil_water_Sep', 'solar_radiation_May', 'solar_radiation_Jun', 'solar_radiation_Jul', 'solar_radiation_Aug', 'solar_radiation_Sep', 'potential_evaporation_May', 'potential_evaporation_Jun', 'potential_evaporation_Jul', 'potential_evaporation_Aug', 'potential_evaporation_Sep', 'gs_solar_rad_w_m2']

--- Processing: RICE ---
