# 🧮 Phase 2A — Multi-year Bioeconomic LP (NPV)
This notebook extends Phase 1 to **T years** with discounting and time-varying multipliers.

**What's new:** time index `t=1..T`, NPV objective, scenario arrays.

**What's NOT yet included (later phases):** crop storage carryover, livestock herd dynamics, risk/uncertainty.


In [None]:
# ========================================
# SETUP: IMPORT LIBRARIES AND PATHS
# ========================================
# Import the tools we need for calculations and data handling

import json                        # For reading scenario configuration files
from dataclasses import dataclass  # For creating data structure templates
from typing import Dict            # For type hints (code documentation)
from pathlib import Path           # For handling file paths

import numpy as np                 # For numerical calculations (arrays, math)
import pandas as pd                # For working with data tables (like Excel)
from scipy.optimize import linprog # The optimization solver that finds best decisions

# Optional pretty-table helper. If the import fails for any reason the name is
# bound to None, and later cells test it and fall back to `display(...)`.
try:
    from caas_jupyter_tools import display_dataframe_to_user
except Exception:
    display_dataframe_to_user = None

# Define where our input data and outputs are stored
DATA = Path('./data_phase_2A')  # Folder with the CSV files and scenario_multi.json read below
OUT = Path('./outputs')          # Folder where results will be saved
OUT.mkdir(exist_ok=True, parents=True)  # Create outputs folder (and parents) if it doesn't exist

print('Loaded numpy, pandas, scipy.optimize.linprog')


Loaded numpy, pandas, scipy.optimize.linprog


## Load inputs
Edit the CSVs in `./data_phase_2A/` and the scenario file `scenario_multi.json` (stored in the same folder).

In [None]:
# ========================================
# LOAD AND PREVIEW INPUT DATA
# ========================================
# Read all CSV files and scenario configuration, then display them for review

# Helper function to check if required files exist
def must(path: Path) -> Path:
    """
    Guard that a required input file is present before it is read

    INPUTS: path - Location of a file the model cannot run without
    OUTPUTS: The very same path, so the call can be nested inside a reader
    RAISES: FileNotFoundError when nothing exists at that location
    PURPOSE: Fail early with a clear message instead of running on missing data
    """
    if path.exists():
        return path
    raise FileNotFoundError(f'Missing required file: {path}')

# Load all the required CSV files from the data folder
hh_df = pd.read_csv(must(DATA/'households.csv'))      # Household characteristics
crops_df = pd.read_csv(must(DATA/'crops.csv'))        # Crop parameters
livest_df = pd.read_csv(must(DATA/'livestock.csv'))   # Livestock parameters
prices_df = pd.read_csv(must(DATA/'prices.csv'))      # Market prices (wages)

# Load the scenario configuration (multi-year settings).
# The context manager closes the file handle as soon as parsing is finished;
# JSON is read as UTF-8 regardless of the platform's default encoding.
with open(must(DATA/'scenario_multi.json'), 'r', encoding='utf-8') as scenario_file:
    scenario = json.load(scenario_file)

# Try to load observed data for validation (if available).
# This file is optional: obs_df stays None when it is absent and the
# validation cell is skipped.
obs_path = DATA/'observed_prod_only.csv'
obs_df = pd.read_csv(obs_path) if obs_path.exists() else None

# Preview all the loaded data
# This lets you see what data the model will use
preview = [
    ('households.csv', hh_df),
    ('crops.csv', crops_df),
    ('livestock.csv', livest_df),
    ('prices.csv', prices_df),
    # Only the two scalar settings are previewed; multiplier arrays are not shown
    ('scenario_multi.json', pd.DataFrame([{'T':scenario.get('T'), 'discount_rate':scenario.get('discount_rate')}]))
]

# Display each table (pretty helper if available, notebook `display` otherwise)
for name, df in preview:
    if display_dataframe_to_user:
        display_dataframe_to_user(name, df)
    else:
        display(df.head())

# Display observed data if available
if obs_df is not None:
    if display_dataframe_to_user:
        display_dataframe_to_user('observed_prod_only.csv', obs_df)
    else:
        display(obs_df.head())

print('Loaded inputs.')


Unnamed: 0,name,n_households,adult_equiv,labor_endowment,land_available,max_hired_labor
0,HFR,100,3.8,220,0.8,120
1,HMR,100,3.9,240,1.0,130
2,MFIR,100,4.2,300,1.2,160
3,MMR,100,4.2,300,1.2,160
4,MMIR,100,4.6,320,1.5,180


Unnamed: 0,name,calorie_per_kg,yield_per_ha,price_sale,seed_cost_per_ha,fert_cost_per_ha,chem_cost_per_ha,labor_req_per_ha
0,maize,3600,3500,5.0,800,1800,600,60
1,beans,3400,1500,8.0,900,700,400,70


Unnamed: 0,name,price_sale,feed_cost_per_unit,vet_cost_per_unit,labor_req_per_unit
0,goat,2500,800,200,6.0
1,chicken,300,60,15,0.6


Unnamed: 0,wage
0,80


Unnamed: 0,T,discount_rate
0,10,0.1


Unnamed: 0,household_class,rev_crops,rev_livestock,off_farm_labor,cost_crop_inputs,cost_livestock,cost_hired_labor
0,HFR,1726.28,844.47,463.33,1166.3,231.0,1195.3
1,HMR,6104.32,1568.32,3649.15,3061.1,70.76,880.5
2,MFIR,5151.81,904.5,1674.58,1431.36,1035.73,992.6
3,MMR,3827.85,1521.11,1376.11,836.6,801.53,721.53
4,MMIR,21985.49,5510.7,3378.98,4247.52,1863.71,1015.24


Loaded inputs.


## Parameter schemas and checks

In [None]:
# ========================================
# DATA STRUCTURES (BLUEPRINTS)
# ========================================
# Same as Phase 1, but we'll use them across multiple years

@dataclass
class HouseholdClass:
    """
    Parameters of one household class (one row of households.csv)

    The multi-year solver applies the same land and labor limits in every year;
    only the calorie requirement is rescaled by the yearly population multiplier.
    """
    name: str               # Identifier; used as dictionary key and in result tables
    n_households: float     # Loaded from the CSV; not referenced by the solver code visible in this notebook
    adult_equiv: float      # Scales the yearly calorie requirement in the nutrition constraint
    labor_endowment: float  # Right-hand side of the yearly labor constraint (family labor)
    land_available: float   # Right-hand side of the yearly land constraint
    max_hired_labor: float  # Upper limit on hired labor in each year

@dataclass
class CropParam:
    """
    Baseline parameters of one crop (one row of crops.csv)

    Values are base-year figures; the solver scales yield, sale price and
    fertilizer cost with the scenario's yearly multipliers.
    """
    name: str                # Identifier; used as dictionary key
    calorie_per_kg: float    # Coefficient on consumption in the nutrition constraint
    yield_per_ha: float      # Production per unit of area (times the yearly yield multiplier)
    price_sale: float        # Sale price per unit sold (times the yearly crop price multiplier)
    seed_cost_per_ha: float  # Per-area input cost, not scaled by any multiplier
    fert_cost_per_ha: float  # Per-area input cost (times the yearly fertilizer price multiplier)
    chem_cost_per_ha: float  # Per-area input cost, not scaled by any multiplier
    labor_req_per_ha: float  # Labor used per unit of area in the labor constraint

@dataclass
class LivestockParam:
    """
    Baseline parameters of one livestock type (one row of livestock.csv)

    None of these values is scaled by a scenario multiplier in the Phase 2A solver.
    """
    name: str                  # Identifier; used as dictionary key
    price_sale: float          # Revenue per livestock unit
    feed_cost_per_unit: float  # Feed cost per livestock unit
    vet_cost_per_unit: float   # Veterinary cost per livestock unit
    labor_req_per_unit: float  # Labor used per livestock unit in the labor constraint

@dataclass
class PriceParam:
    """
    Market prices shared by all households - currently just the wage rate

    The solver uses the same wage both as the cost of hired labor and as the
    earning rate for off-farm work, after applying the yearly wage multiplier.
    """
    wage: float  # Base wage, read from the first row of prices.csv

@dataclass
class ModelParams:
    """
    Master container bundling ALL baseline (time-invariant) data

    Year-specific variation is not stored here; it comes from the scenario
    multiplier arrays passed separately to the solver.
    """
    households: Dict[str, HouseholdClass]     # Keyed by household class name
    crops: Dict[str, CropParam]               # Keyed by crop name
    livestock: Dict[str, LivestockParam]      # Keyed by livestock type name
    prices: PriceParam                        # Wage rate
    min_kcal_per_person_per_day: float = 2000.0  # Daily calorie floor used in the nutrition constraint
    days_per_year: int = 365                     # Converts the daily floor into a yearly requirement

# ========================================
# FUNCTION: VALIDATE AND LOAD PARAMETERS
# ========================================

def _req(x, field):
    """
    Helper function to validate that required fields are not missing
    
    INPUTS: 
    - x: The value to check
    - field: Name of the field (for error messages)
    
    OUTPUTS: Returns the value as a float if valid
    PURPOSE: Catches missing/invalid data before running the model
    """
    if pd.isna(x) or str(x).strip()=='':
        raise ValueError(f'Missing value for {field}')
    return float(x)

def load_params() -> ModelParams:
    """
    Turns the loaded CSV tables into structured parameter objects

    INPUTS: Uses global dataframes (hh_df, crops_df, livest_df, prices_df)
    OUTPUTS: Returns ModelParams object with all baseline data

    HOW IT WORKS:
    1. Walk through the rows of each table
    2. Run every numeric cell through _req (raises ValueError on missing data)
    3. Build one parameter object per row, keyed by its stripped name
    4. Bundle everything into ModelParams (wage comes from the first prices row)
    """
    # Households: one HouseholdClass per row
    households = {}
    for _, row in hh_df.iterrows():
        label = str(row['name']).strip()
        households[label] = HouseholdClass(
            name=str(row['name']).strip(),
            n_households=_req(row['n_households'], 'households.n_households'),
            adult_equiv=_req(row['adult_equiv'], 'households.adult_equiv'),
            labor_endowment=_req(row['labor_endowment'], 'households.labor_endowment'),
            land_available=_req(row['land_available'], 'households.land_available'),
            max_hired_labor=_req(row['max_hired_labor'], 'households.max_hired_labor'),
        )

    # Crops: one CropParam per row
    crops = {}
    for _, row in crops_df.iterrows():
        label = str(row['name']).strip()
        crops[label] = CropParam(
            name=str(row['name']).strip(),
            calorie_per_kg=_req(row['calorie_per_kg'], 'crops.calorie_per_kg'),
            yield_per_ha=_req(row['yield_per_ha'], 'crops.yield_per_ha'),
            price_sale=_req(row['price_sale'], 'crops.price_sale'),
            seed_cost_per_ha=_req(row['seed_cost_per_ha'], 'crops.seed_cost_per_ha'),
            fert_cost_per_ha=_req(row['fert_cost_per_ha'], 'crops.fert_cost_per_ha'),
            chem_cost_per_ha=_req(row['chem_cost_per_ha'], 'crops.chem_cost_per_ha'),
            labor_req_per_ha=_req(row['labor_req_per_ha'], 'crops.labor_req_per_ha'),
        )

    # Livestock: one LivestockParam per row
    livestock = {}
    for _, row in livest_df.iterrows():
        label = str(row['name']).strip()
        livestock[label] = LivestockParam(
            name=str(row['name']).strip(),
            price_sale=_req(row['price_sale'], 'livestock.price_sale'),
            feed_cost_per_unit=_req(row['feed_cost_per_unit'], 'livestock.feed_cost_per_unit'),
            vet_cost_per_unit=_req(row['vet_cost_per_unit'], 'livestock.vet_cost_per_unit'),
            labor_req_per_unit=_req(row['labor_req_per_unit'], 'livestock.labor_req_per_unit'),
        )

    # Prices: only the first row of prices.csv is used
    first_price_row = prices_df.iloc[0]
    prices = PriceParam(wage=_req(first_price_row['wage'], 'prices.wage'))

    return ModelParams(
        households=households,
        crops=crops,
        livestock=livestock,
        prices=prices,
    )

# ========================================
# FUNCTION: LOAD MULTI-YEAR SCENARIO
# ========================================

def _as_T(x, T, name):
    """
    Converts scenario parameters to T-length arrays (one value per year)
    
    INPUTS:
    - x: Either a single number (same for all years) or a list (different per year)
    - T: Number of years
    - name: Parameter name (for error messages)
    
    OUTPUTS: List of length T with values for each year
    
    EXAMPLES:
    - _as_T(1.0, 5, 'multiplier') â†’ [1.0, 1.0, 1.0, 1.0, 1.0]
    - _as_T([1.0, 0.9, 0.8], 3, 'yield') â†’ [1.0, 0.9, 0.8]
    
    PURPOSE: Allows scenarios like "drought in year 3" with [1.0, 1.0, 0.7, 1.0, 1.0]
    """
    if isinstance(x, (int,float)):
        # Single value - use same value for all years
        return [float(x)]*T
    if isinstance(x, list):
        # List provided - must match number of years
        if len(x)!=T:
            raise ValueError(f'scenario.{name} must have length T={T}, got {len(x)}')
        return [float(v) for v in x]
    raise ValueError(f'scenario.{name} must be a number or list length T')

def load_scenario_arrays(sc):
    """
    Loads multi-year scenario with time-varying multipliers and discount factors

    INPUTS: sc - Scenario dictionary from scenario_multi.json

    OUTPUTS: Returns 8 values:
    - T: Number of years to simulate
    - r: Discount rate (how much we value future money less than today)
    - y: Yield multipliers per year (crop productivity changes)
    - p: Price multipliers per year (crop price changes)
    - w: Wage multipliers per year (labor cost changes)
    - f: Fertilizer price multipliers per year
    - pop: Population multipliers per year (household size changes)
    - disc: Discount factors per year (for NPV calculation)

    RAISES: ValueError if T < 1, if discount_rate is not greater than -1
            (the discount factor 1/(1+r)^t would be undefined or change sign),
            or if a multiplier is neither a number nor a list of length T

    HOW IT WORKS:
    - Reads T and discount_rate from scenario (defaults: 10 years, 10%)
    - Converts each multiplier to a T-length array (default 1.0 = no change)
    - Calculates discount factors: year 0 = 1.0, year 1 = 1/(1+r), year 2 = 1/(1+r)^2, etc.

    EXAMPLE:
    If T=3 and r=0.10 (10% discount rate):
    - Year 0: discount factor = 1.000 (worth full value today)
    - Year 1: discount factor = 0.909 (worth about 9.1% less)
    - Year 2: discount factor = 0.826 (worth about 17.4% less)

    This is Net Present Value (NPV) - future income is worth less than today's income
    """
    T = int(sc.get('T', 10))                    # Number of years (default: 10)
    r = float(sc.get('discount_rate', 0.10))    # Discount rate (default: 10%)

    # Reject horizons/rates that would yield an empty model or invalid discounting.
    # `not r > -1.0` also rejects NaN.
    if T < 1:
        raise ValueError(f'scenario.T must be at least 1, got {T}')
    if not r > -1.0:
        raise ValueError(f'scenario.discount_rate must be greater than -1, got {r}')

    # Convert all multipliers to T-length arrays
    y = _as_T(sc.get('yield_multiplier', 1.0), T, 'yield_multiplier')
    p = _as_T(sc.get('crop_price_multiplier', 1.0), T, 'crop_price_multiplier')
    w = _as_T(sc.get('wage_multiplier', 1.0), T, 'wage_multiplier')
    f = _as_T(sc.get('fert_price_multiplier', 1.0), T, 'fert_price_multiplier')
    pop = _as_T(sc.get('population_multiplier', 1.0), T, 'population_multiplier')

    # Calculate discount factors for NPV
    # Formula: 1 / (1 + r)^t where t is the zero-based year index,
    # so the first simulated year is not discounted
    disc = [1.0/((1.0+r)**t) for t in range(T)]

    return T, r, y, p, w, f, pop, disc

# ========================================
# EXECUTE: LOAD ALL PARAMETERS
# ========================================
# Load baseline parameters from CSVs (reads the global dataframes loaded earlier)
params = load_params()

# Load multi-year scenario arrays; each *mul list and DISC has one entry per year
T, r, Ymul, Pmul, Wmul, Fmul, POPmul, DISC = load_scenario_arrays(scenario)

# Display what we loaded: counts of households (H), crops (C), livestock types (L)
print(f'Loaded params: H={len(params.households)}, C={len(params.crops)}, L={len(params.livestock)}; T={T}, r={r}')


Loaded params: H=6, C=2, L=2; T=10, r=0.1


## Multi-year LP solver (Phase 2A)
No inter-year storage; each year is linked only via the discounted objective (NPV).

In [None]:
# ========================================
# FUNCTION: BUILD VARIABLE INDEX MAP (MULTI-YEAR)
# ========================================
def build_index_maps(H, C, L, T):
    """
    Assigns a column position to every decision variable in every year

    INPUTS:
    - H: List of household groups
    - C: List of crops
    - L: List of livestock types
    - T: Number of years

    OUTPUTS:
    - idx: Dictionary mapping (decision type, household, item, year) -> position number
           (item is None for the two labor decisions)
    - pos: Total number of decision variables

    LAYOUT:
    Variables are numbered year by year. Inside one year the order is:
    'area', 'cons', 'sold' (each household x crop), then 'hired', 'off_farm'
    (each household), then 'live_units' (each household x livestock type).

    EXAMPLE:
    With 2 households, 3 crops, 1 livestock type and 5 years:
    - area / cons / sold: 2 x 3 x 5 = 30 each
    - hired / off_farm:   2 x 5 = 10 each
    - live_units:         2 x 1 x 5 = 10
    - Grand total = 120 decision variables

    PURPOSE: The model optimizes across ALL years simultaneously to maximize NPV
    """
    per_crop_kinds = ('area', 'cons', 'sold')
    per_household_kinds = ('hired', 'off_farm')

    # Collect the keys in their final order first, then number them
    ordered_keys = []
    for year in range(T):
        for kind in per_crop_kinds:
            ordered_keys.extend((kind, hh, crop, year) for hh in H for crop in C)
        for kind in per_household_kinds:
            ordered_keys.extend((kind, hh, None, year) for hh in H)
        ordered_keys.extend(('live_units', hh, animal, year) for hh in H for animal in L)

    idx = {key: position for position, key in enumerate(ordered_keys)}
    return idx, len(ordered_keys)

# ========================================
# FUNCTION: SOLVE MULTI-YEAR OPTIMIZATION
# ========================================
def solve_multi_year_lp(params: ModelParams, T, Ymul, Pmul, Wmul, Fmul, POPmul, DISC):
    """
    Solves the MULTI-YEAR farm optimization problem as a single linear program

    KEY DIFFERENCE FROM PHASE 1:
    - Objective is now NET PRESENT VALUE (NPV) across all T years
    - Each year has its own multipliers (yields, prices, wages can change)
    - Years are independent (no storage carryover yet - that's Phase 2B):
      every constraint below involves variables of one year only

    INPUTS:
    - params: Baseline data (households, crops, livestock, prices)
    - T: Number of years
    - Ymul: Yield multipliers per year (length T)
    - Pmul: Crop price multipliers per year (length T)
    - Wmul: Wage multipliers per year (length T)
    - Fmul: Fertilizer price multipliers per year (length T)
    - POPmul: Population multipliers per year (length T)
    - DISC: Discount factors per year (length T) for NPV calculation

    OUTPUTS:
    - res: Optimization result object returned by scipy's linprog
    - idx: Variable index mapping (see build_index_maps)
    - H, C, L: Lists of household, crop and livestock names

    SIGN CONVENTION:
    linprog minimizes, so the objective vector holds discounted COSTS with a
    positive sign and discounted REVENUES with a negative sign. The maximized
    NPV is therefore -res.fun.

    HOW NPV WORKS:
    Instead of maximizing single-year profit, we maximize:
    NPV = sum over years t of (discount_factor[t] x profit[t])

    Example with 3 years and 10% discount rate:
    - Year 1 profit: 1000 Birr x 1.000 = 1000 (present value)
    - Year 2 profit: 1000 Birr x 0.909 = 909 (present value)
    - Year 3 profit: 1000 Birr x 0.826 = 826 (present value)
    - NPV = 1000 + 909 + 826 = 2735 Birr

    This captures the time value of money: a Birr today is worth more than a Birr tomorrow
    """
    
    # Step 1: Get lists of households, crops, livestock
    H = list(params.households.keys())
    C = list(params.crops.keys())
    L = list(params.livestock.keys())
    
    # Step 2: Create variable index map for all years
    idx, nvars = build_index_maps(H, C, L, T)
    
    # Step 3: Build the OBJECTIVE FUNCTION (minimize -NPV, i.e. maximize NPV)
    cvec = np.zeros(nvars)
    
    # For each year, add discounted costs and revenues to objective
    for t in range(T):
        disc = DISC[t]  # Discount factor for this year
        wage_t = params.prices.wage * Wmul[t]  # Wage rate for this year
        
        for h in H:
            # COST: Hired labor (discounted)
            cvec[idx[('hired', h, None, t)]] += disc * wage_t
            
            # REVENUE: Off-farm work (discounted, negative to maximize)
            # Note: earned at the same wage that hired labor costs
            cvec[idx[('off_farm', h, None, t)]] += -disc * wage_t
            
            for cn in C:
                cp = params.crops[cn]
                # REVENUE: Crop sales (discounted, negative to maximize)
                cvec[idx[('sold', h, cn, t)]] += -disc * (cp.price_sale * Pmul[t])
                
                # COST: Crop inputs per hectare (discounted)
                # Only the fertilizer component is scaled by a yearly multiplier
                per_ha = cp.seed_cost_per_ha + (cp.fert_cost_per_ha * Fmul[t]) + cp.chem_cost_per_ha
                cvec[idx[('area', h, cn, t)]] += disc * per_ha
            
            for l in L:
                lv = params.livestock[l]
                # Net cost for livestock (cost - revenue, discounted)
                # Livestock prices and costs are not scaled by any yearly multiplier
                cvec[idx[('live_units', h, l, t)]] += disc * ((lv.feed_cost_per_unit + lv.vet_cost_per_unit) - lv.price_sale)

    # Step 4: Initialize constraint matrices
    A_eq, b_eq = [], []  # Equality constraints (must be exactly satisfied)
    A_ub, b_ub = [], []  # Inequality constraints (A_ub @ x <= b_ub)
    bounds = [(0, None) for _ in range(nvars)]  # All variables must be non-negative

    # CONSTRAINT 1: CROP PRODUCTION BALANCE (per year, per household, per crop)
    # Production = Consumption + Sales
    # Note: No storage carryover between years yet (that's Phase 2B)
    for t in range(T):
        for h in H:
            for cn in C:
                row = np.zeros(nvars)
                # Production (yield x area, adjusted for year-specific yield multiplier)
                row[idx[('area', h, cn, t)]] = params.crops[cn].yield_per_ha * Ymul[t]
                # Consumption (negative because on right side of equation)
                row[idx[('cons', h, cn, t)]] = -1.0
                # Sales (negative because on right side of equation)
                row[idx[('sold', h, cn, t)]] = -1.0
                A_eq.append(row)
                b_eq.append(0.0)  # Must equal zero

    # CONSTRAINT 2: LAND AVAILABILITY (per year, per household)
    # Total crop area <= Available land
    for t in range(T):
        for h in H:
            row = np.zeros(nvars)
            for cn in C:
                row[idx[('area', h, cn, t)]] = 1.0
            A_ub.append(row)
            b_ub.append(params.households[h].land_available)

    # CONSTRAINT 3: LABOR AVAILABILITY (per year, per household)
    # Crop labor + Livestock labor + Off-farm work <= Family labor + Hired labor
    for t in range(T):
        for h in H:
            row = np.zeros(nvars)
            # Labor used by crops
            for cn in C:
                row[idx[('area', h, cn, t)]] = params.crops[cn].labor_req_per_ha
            # Labor used by livestock
            for l in L:
                row[idx[('live_units', h, l, t)]] = params.livestock[l].labor_req_per_unit
            # Labor allocated to off-farm work
            row[idx[('off_farm', h, None, t)]] = 1.0
            # Hired labor (negative because it adds to available labor)
            row[idx[('hired', h, None, t)]] = -1.0
            A_ub.append(row)
            b_ub.append(params.households[h].labor_endowment)

    # CONSTRAINT 4: HIRED LABOR LIMIT (per year, per household)
    # Hired labor <= Maximum allowed
    for t in range(T):
        for h in H:
            row = np.zeros(nvars)
            row[idx[('hired', h, None, t)]] = 1.0
            A_ub.append(row)
            b_ub.append(params.households[h].max_hired_labor)

    # CONSTRAINT 5: NUTRITION REQUIREMENT (per year, per household)
    # Total calories consumed >= Minimum daily requirement x adult equivalents x days per year
    # Adjusted for year-specific population multiplier
    for t in range(T):
        for h in H:
            row = np.zeros(nvars)
            for cn in C:
                # Negative because we move to left side: -calories <= -requirement
                row[idx[('cons', h, cn, t)]] = -params.crops[cn].calorie_per_kg
            # Calculate annual calorie requirement for this year
            # (uses adult_equiv only; n_households does not enter the requirement)
            kcal_need = params.min_kcal_per_person_per_day * (params.households[h].adult_equiv * POPmul[t]) * params.days_per_year
            A_ub.append(row)
            b_ub.append(-kcal_need)

    # Step 5: SOLVE the optimization problem
    # Find the best decisions across all years to maximize NPV
    # (constraint rows are passed as dense arrays; 'highs' selects the HiGHS solvers)
    res = linprog(cvec, 
                  A_ub=np.array(A_ub), b_ub=np.array(b_ub),
                  A_eq=np.array(A_eq), b_eq=np.array(b_eq),
                  bounds=bounds, 
                  method='highs')
    
    return res, idx, H, C, L


## Run solver and export yearly + NPV outputs

In [None]:
# ========================================
# EXECUTE: SOLVE AND EXTRACT RESULTS
# ========================================
# Run the multi-year optimization and organize the results

# Step 1: Solve the optimization problem
res, idx, H, C, L = solve_multi_year_lp(params, T, Ymul, Pmul, Wmul, Fmul, POPmul, DISC)

# Step 2: Check if optimization succeeded
print('Status:', res.message)
print('Success:', bool(res.success))
if not res.success:
    # Include the solver's own status in the exception so the cause survives
    # even when the printed output above is not captured
    raise RuntimeError(
        'Optimization failed — check coefficients/bounds or calorie feasibility. '
        f'Solver status {res.status}: {res.message}'
    )

# Step 3: Extract the solution (optimal values for all decision variables)
x = res.x

# Helper function to get the value of a specific decision
def val(kind, h, k, t):
    """
    Looks up one decision variable in the solved LP

    INPUTS:
    - kind: Type of decision ('area', 'cons', 'sold', etc.)
    - h: Household group
    - k: Crop or livestock type (or None for labor)
    - t: Year index (0 to T-1)

    OUTPUTS: The optimal value as a plain Python float

    NOTE: Reads the global solution vector `x` and index map `idx`
    """
    position = idx[(kind, h, k, t)]
    return float(x[position])

# Step 4: Calculate detailed results for each year and household
# Build a table showing revenues, costs, and profits year by year
rows = []
for t in range(T):
    wage_t = params.prices.wage * Wmul[t]  # Wage for this year

    for h in H:
        # REVENUES (income sources)
        # Crop sales revenue = kg sold x price (price scaled by this year's multiplier)
        rev_crops = sum(val('sold', h, cn, t) * (params.crops[cn].price_sale * Pmul[t]) for cn in C)

        # Livestock sales revenue = livestock units x price
        rev_livestock = sum(val('live_units', h, l, t) * params.livestock[l].price_sale for l in L)

        # Off-farm wage revenue = work-days x wage
        rev_off = val('off_farm', h, None, t) * wage_t

        # COSTS (expenses)
        # Crop input costs = area x (seed + fertilizer + chemicals)
        cost_crop_inputs = sum(val('area', h, cn, t) *
                              (params.crops[cn].seed_cost_per_ha +
                               params.crops[cn].fert_cost_per_ha * Fmul[t] +
                               params.crops[cn].chem_cost_per_ha)
                              for cn in C)

        # Livestock costs = livestock units x (feed + vet)
        cost_livestock = sum(val('live_units', h, l, t) *
                            (params.livestock[l].feed_cost_per_unit + params.livestock[l].vet_cost_per_unit)
                            for l in L)

        # Hired labor cost = work-days hired x wage
        cost_hired = val('hired', h, None, t) * wage_t

        # PROFIT for this year (revenues - costs)
        profit = (rev_crops + rev_livestock + rev_off) - (cost_crop_inputs + cost_livestock + cost_hired)

        # Store results for this year and household
        rows.append({
            'year': t+1,  # Display as year 1, 2, 3... instead of 0, 1, 2...
            'household_class': h,
            'rev_crops': rev_crops,
            'rev_livestock': rev_livestock,
            'rev_off_farm': rev_off,
            'cost_crop_inputs': cost_crop_inputs,
            'cost_livestock': cost_livestock,
            'cost_hired_labor': cost_hired,
            'profit': profit,
            'discount_factor': DISC[t],
            'discounted_profit': profit * DISC[t],  # Present value of this year's profit
        })

# Step 5: Create yearly results table
yearly_df = pd.DataFrame(rows)

# Step 6: Calculate NPV summary per household
# Group by household and sum discounted profits to get NPV
# Also calculate mean, min, max profit across years for each household
npv_df = (yearly_df.groupby('household_class', as_index=False)
          .agg(NPV=('discounted_profit','sum'),          # Total NPV (sum of all discounted profits)
               mean_profit=('profit','mean'),             # Average annual profit
               min_profit=('profit','min'),               # Worst year profit
               max_profit=('profit','max')))              # Best year profit

# Step 7: Display the results
if display_dataframe_to_user:
    display_dataframe_to_user('Phase 2A — Yearly results', yearly_df)
    display_dataframe_to_user('Phase 2A — NPV summary', npv_df)
else:
    display(yearly_df.head(12))
    display(npv_df)

# Step 8: Save results to CSV files for later analysis
yearly_df.to_csv(OUT/'phase2A_yearly_results.csv', index=False)
npv_df.to_csv(OUT/'phase2A_npv_summary.csv', index=False)

# Step 9: Save metadata (summary information about the run)
meta = {
    'success': bool(res.success),
    'objective_npv': float(-res.fun),  # linprog minimizes, so NPV is the negated objective
    'T': T,                             # Number of years simulated
    'discount_rate': r                  # Discount rate used
}
# The context manager flushes and closes the file, so the JSON is complete on
# disk as soon as this cell finishes
with open(OUT/'phase2A_meta.json', 'w', encoding='utf-8') as meta_file:
    json.dump(meta, meta_file, indent=2)

print('Saved outputs in ./outputs/')


Status: Optimization terminated successfully. (HiGHS Status 7: Optimal)
Success: True


Unnamed: 0,year,household_class,rev_crops,rev_livestock,rev_off_farm,cost_crop_inputs,cost_livestock,cost_hired_labor,profit,discount_factor,discounted_profit
0,1,HFR,0.0,163395.238095,0.0,704.507937,40848.809524,9600.0,112241.920635,1.0,112241.920635
1,1,HMR,0.0,178221.428571,0.0,723.047619,44555.357143,10400.0,122543.02381,1.0,122543.02381
2,1,MFIR,0.0,222700.0,0.0,778.666667,55675.0,12800.0,153446.333333,1.0,153446.333333
3,1,MMR,0.0,222700.0,0.0,778.666667,55675.0,12800.0,153446.333333,1.0,153446.333333
4,1,MMIR,0.0,242004.761905,0.0,852.825397,60501.190476,14400.0,166250.746032,1.0,166250.746032
5,1,MFR,0.0,208047.619048,0.0,741.587302,52011.904762,12000.0,143294.126984,1.0,143294.126984
6,2,HFR,0.0,163395.238095,0.0,704.507937,40848.809524,9600.0,112241.920635,0.909091,102038.109668
7,2,HMR,0.0,178221.428571,0.0,723.047619,44555.357143,10400.0,122543.02381,0.909091,111402.748918
8,2,MFIR,0.0,222700.0,0.0,778.666667,55675.0,12800.0,153446.333333,0.909091,139496.666667
9,2,MMR,0.0,222700.0,0.0,778.666667,55675.0,12800.0,153446.333333,0.909091,139496.666667


Unnamed: 0,household_class,NPV,mean_profit,min_profit,max_profit
0,HFR,758645.8,112241.920635,112241.920635,112241.920635
1,HMR,828271.2,122543.02381,122543.02381,122543.02381
2,MFIR,1037147.0,153446.333333,153446.333333,153446.333333
3,MFR,968528.4,143294.126984,143294.126984,143294.126984
4,MMIR,1123693.0,166250.746032,166250.746032,166250.746032
5,MMR,1037147.0,153446.333333,153446.333333,153446.333333


Saved outputs in ./outputs/


## Optional: Validate year 1 (production-only)
If `observed_prod_only.csv` exists, we compare year-1 model metrics to observed production-only metrics.

In [None]:
# ========================================
# VALIDATION: COMPARE YEAR 1 TO OBSERVED DATA
# ========================================
# Check how well the model's first-year predictions match real-world data
# (Similar to Phase 1 validation, but only for year 1)

if obs_df is None:
    # No observed data available - skip validation
    print('No observed_prod_only.csv found. Skipping validation.')
else:
    # Step 1: Extract year 1 model results
    # Filter for year 1 and rename the off-farm column to the name used in the observed file
    y1 = yearly_df[yearly_df['year']==1].copy().rename(columns={'rev_off_farm':'off_farm_labor'})

    # Step 2: Define which metrics to compare
    metrics = ['rev_crops','rev_livestock','off_farm_labor','cost_crop_inputs','cost_livestock','cost_hired_labor']

    # Step 3: Merge model and observed data
    # Creates columns like 'rev_crops_model' and 'rev_crops_obs'.
    # The outer join keeps household classes present on only one side (as NaN on the other).
    merged = y1.merge(obs_df[['household_class']+metrics],
                     on='household_class',
                     suffixes=('_model','_obs'),
                     how='outer')

    # Step 4: Calculate percentage difference
    def pct_diff(m,o):
        """
        Calculate percentage difference between model and observed

        INPUTS:
        - m: Model value
        - o: Observed value

        OUTPUTS: Percentage difference (positive = overestimate, negative = underestimate);
                 NaN if either value is missing; for observed == 0, inf if the model
                 value is non-zero and 0.0 otherwise

        FORMULA: 100% x (model - observed) / |observed|
        """
        if pd.isna(m) or pd.isna(o):
            return np.nan
        if o==0:
            return np.inf if m!=0 else 0.0
        return 100.0*(m-o)/abs(o)

    # Step 5: Create detailed comparison table
    # One row for each household x metric combination
    comp_rows=[]
    for _, rrow in merged.iterrows():
        for m in metrics:
            model_val = rrow[f'{m}_model']
            obs_val = rrow[f'{m}_obs']
            both_present = pd.notna(model_val) and pd.notna(obs_val)
            comp_rows.append({
                'household_class': rrow['household_class'],
                'metric': m,
                'observed': obs_val,
                'model': model_val,
                'diff': model_val - obs_val if both_present else np.nan,
                'pct_diff_%': pct_diff(model_val, obs_val)
            })
    comp_df = pd.DataFrame(comp_rows)

    # Step 6: Calculate summary error metrics
    # RMSE (Root Mean Square Error) and MAPE (Mean Absolute Percentage Error)
    agg=[]
    for m in metrics:
        # Filter for this metric and drop rows lacking either value
        sub = comp_df[comp_df['metric']==m].replace([np.inf,-np.inf], np.nan).dropna(subset=['observed','model'])

        if len(sub)>0:
            # RMSE: Typical magnitude of errors, in the metric's own units
            # Formula: sqrt(mean((model - observed)^2))
            rmse = float(np.sqrt(np.mean((sub['model']-sub['observed'])**2)))

            # MAPE: Average percentage error
            # Formula: mean(|model - observed| / |observed|) x 100%
            # Zero observed values become NaN so they cannot cause a division by zero
            denom = sub['observed'].replace(0, np.nan)
            mape = float(np.mean(np.abs((sub['model']-sub['observed'])/denom))*100.0)
        else:
            rmse = np.nan
            mape = np.nan

        agg.append({'metric': m, 'RMSE': rmse, 'MAPE_%': mape})

    agg_df = pd.DataFrame(agg)

    # Step 7: Display validation results
    if display_dataframe_to_user:
        display_dataframe_to_user('Validation — Year 1 diffs', comp_df)
        display_dataframe_to_user('Validation — Year 1 summary', agg_df)
    else:
        display(comp_df)
        display(agg_df)

    # Step 8: Save validation results to CSV
    comp_df.to_csv(OUT/'phase2A_validation_year1.csv', index=False)
    agg_df.to_csv(OUT/'phase2A_validation_year1_summary.csv', index=False)
    print('Saved validation outputs.')


Unnamed: 0,household_class,metric,observed,model,diff,pct_diff_%
0,HFR,rev_crops,1726.28,0.0,-1726.28,-100.0
1,HFR,rev_livestock,844.47,163395.238095,162550.768095,19248.850533
2,HFR,off_farm_labor,463.33,0.0,-463.33,-100.0
3,HFR,cost_crop_inputs,1166.3,704.507937,-461.792063,-39.594621
4,HFR,cost_livestock,231.0,40848.809524,40617.809524,17583.467326
5,HFR,cost_hired_labor,1195.3,9600.0,8404.7,703.145654
6,HMR,rev_crops,6104.32,0.0,-6104.32,-100.0
7,HMR,rev_livestock,1568.32,178221.428571,176653.108571,11263.843385
8,HMR,off_farm_labor,3649.15,0.0,-3649.15,-100.0
9,HMR,cost_crop_inputs,3061.1,723.047619,-2338.052381,-76.379484


Unnamed: 0,metric,RMSE,MAPE_%
0,rev_crops,10946.740093,100.0
1,rev_livestock,204944.732339,12776.36389
2,off_farm_labor,2412.961644,100.0
3,cost_crop_inputs,2023.106454,54.405793
4,cost_livestock,50894.021048,16324.007726
5,cost_hired_labor,11018.074616,1084.812196


Saved validation outputs.
