# 02 – Modeling: Moroccan Orchard Yield (Synthetic)

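Train baseline linear models (ordinary least squares and ridge) and a random forest to predict yield per hectare, with numeric features standardized and categorical features one-hot encoded inside a single preprocessing pipeline.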

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor

# Load the synthetic orchard data set; the path is relative to the
# directory the notebook is run from.
data_path = Path('../data/moroccan_orchard_yield_synthetic.csv')
df = pd.read_csv(data_path)

# Target: yield per hectare. The remaining dropped columns are kept out of
# the feature matrix.
# NOTE(review): presumably they are excluded because they leak the target
# or are not usable as plain scalar features -- confirm against the data
# dictionary.
y = df['yield_kg_per_ha']
X = df.drop(columns=['yield_kg_per_ha','yield_kg_per_tree','irrigation_events','soil_moisture_30cm_weekly'])

# Numeric columns are standardized; every other column is treated as
# categorical and one-hot encoded. Defining the categorical set as "not
# numeric" (rather than "object dtype only") matters because
# ColumnTransformer drops any column that is not listed in one of its
# transformers (remainder='drop' by default), so bool, category or
# string-dtype features would otherwise vanish from the model silently.
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

preprocess = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    # handle_unknown='ignore': categories seen only in the test split are
    # encoded as all-zeros instead of raising at predict time.
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
])

# Fixed seed so the 80/20 split (and therefore every metric below) is
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def eval_model(name, model):
    """Fit ``model`` behind the shared preprocessing and report test metrics.

    Builds a pipeline from the module-level ``preprocess`` transformer and
    ``model``, fits it on ``X_train``/``y_train``, predicts on ``X_test``
    and prints RMSE, MAE and R² computed against ``y_test``.

    Note that ``preprocess`` is passed by reference, so every pipeline
    returned by this function shares (and refits) the same transformer
    object.

    Args:
        name: Label printed in front of the metrics.
        model: Unfitted scikit-learn regressor used as the final step.

    Returns:
        A tuple ``(pipe, pred)``: the fitted pipeline and its predictions
        for ``X_test``.
    """
    pipe = Pipeline([
        ('prep', preprocess),
        ('model', model)
    ])
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    # Take the square root explicitly instead of passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) gives the same RMSE on every version.
    rmse = np.sqrt(mean_squared_error(y_test, pred))
    mae = mean_absolute_error(y_test, pred)
    r2 = r2_score(y_test, pred)
    print(f"{name}: RMSE={rmse:.1f}  MAE={mae:.1f}  R²={r2:.3f}")
    return pipe, pred

# Two linear baselines first, then the tree ensemble. Each call fits on the
# training split and prints RMSE / MAE / R² for the held-out split.
lin_pipe, lin_pred = eval_model('Linear', LinearRegression())
ridge_pipe, ridge_pred = eval_model('Ridge', Ridge(alpha=10.0))

# Seeded so the forest is reproducible; n_jobs=-1 uses all available cores.
rf_model = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf_pipe, rf_pred = eval_model('RandomForest', rf_model)

# Diagnostic 1: random-forest predictions against the true held-out yields.
pva_ax = plt.figure().add_subplot()
pva_ax.scatter(y_test, rf_pred, alpha=0.6)
pva_ax.set_xlabel('Actual yield (kg/ha)')
pva_ax.set_ylabel('Predicted yield (kg/ha)')
pva_ax.set_title('Random Forest – Predicted vs Actual')
plt.show()

# Diagnostic 2: residuals (actual minus predicted) against the predictions,
# with a dashed zero line as the reference for an unbiased fit.
res = y_test - rf_pred
res_ax = plt.figure().add_subplot()
res_ax.scatter(rf_pred, res, alpha=0.6)
res_ax.axhline(0, linestyle='--')
res_ax.set_xlabel('Predicted')
res_ax.set_ylabel('Residual')
res_ax.set_title('Random Forest – Residuals')
plt.show()
