# Prototype B — Crop Yield Prediction (GenAI-Assisted)
We add a simple growing-degree-days (GDD) feature and tune a random forest (RF) over a compact hyperparameter grid with GridSearchCV; both choices were inspired by GenAI suggestions.


In [9]:
# ---- Path bootstrap: find project root and set canonical paths ----
import sys
from pathlib import Path

def locate_project_root() -> Path:
    """Return the nearest directory, starting at the current working directory
    and moving up through its parents, that contains a ``src`` sub-directory.

    Raises:
        RuntimeError: if neither the working directory nor any ancestor holds ``src``.
    """
    start = Path.cwd().resolve()
    candidates = (start, *start.parents)
    # First match wins, so the closest enclosing project is chosen.
    root = next((folder for folder in candidates if folder.joinpath("src").is_dir()), None)
    if root is None:
        raise RuntimeError(f"Couldn't find project root containing 'src' from {start}")
    return root

PROJECT_ROOT = locate_project_root()

# Put the project root on the import path so that "from src..." resolves
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Canonical locations, every one derived from the project root
DATA_PATH = PROJECT_ROOT.joinpath("data", "crop_yield_sample.csv")

OUT_MODELS = PROJECT_ROOT.joinpath("outputs", "models")
OUT_MODELS.mkdir(parents=True, exist_ok=True)

OUT_FIGS = PROJECT_ROOT.joinpath("outputs", "figures")
OUT_FIGS.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong root is spotted immediately
print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH:", DATA_PATH, "| exists?", DATA_PATH.exists())
print("OUT_MODELS:", OUT_MODELS)
print("OUT_FIGS:", OUT_FIGS)


PROJECT_ROOT: D:\2 Level\AI Business\Assignment 2\crop_yield_genai
DATA_PATH: D:\2 Level\AI Business\Assignment 2\crop_yield_genai\data\crop_yield_sample.csv | exists? True
OUT_MODELS: D:\2 Level\AI Business\Assignment 2\crop_yield_genai\outputs\models
OUT_FIGS: D:\2 Level\AI Business\Assignment 2\crop_yield_genai\outputs\figures


In [12]:
# Core imports from your src package
from src.data_preprocessing import (
    load_data, basic_clean, add_gdd_feature, encode_categoricals,
    select_features, ensure_year_column, time_based_split
)

from src.model import train_linear_regression, train_random_forest, evaluate, save_model
from src.visualization import plot_residuals, plot_feature_importance

# External libs
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import joblib

# Notebook-only helper: GDD feature (used by GenAI-assisted pipeline)
def add_gdd_feature(df: pd.DataFrame, base_temp: float = 10.0) -> pd.DataFrame:
    """
    Return a copy of ``df`` with an extra ``GDD_approx`` column.

    Growing Degree Days are approximated from the single ``Temperature_C`` value
    of each row as max(0, Temperature_C - base_temp).

    Note: this definition rebinds the ``add_gdd_feature`` name that the import
    from ``src.data_preprocessing`` earlier in this cell also provides.

    Args:
        df: frame that must contain a ``Temperature_C`` column; it is not modified.
        base_temp: base temperature subtracted from ``Temperature_C``.

    Raises:
        KeyError: if ``Temperature_C`` is missing from ``df``.
    """
    if "Temperature_C" not in df.columns:
        raise KeyError("Temperature_C is required to compute GDD_approx")
    # Negative differences are floored at zero.
    degrees_above_base = df["Temperature_C"].sub(float(base_temp)).clip(lower=0)
    return df.assign(GDD_approx=degrees_above_base)

print("Imports OK. Helpers ready.")


ImportError: cannot import name 'ensure_year_column' from 'src.data_preprocessing' (D:\2 Level\AI Business\Assignment 2\crop_yield_genai\src\data_preprocessing.py)

In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

from src.data_preprocessing import load_data, basic_clean, add_gdd_feature, encode_categoricals, select_features, time_based_split
from src.model import train_linear_regression, train_rf_gridsearch, suggested_rf_grid, evaluate, save_model
from src.visualization import plot_residuals, plot_feature_importance

# Anchor every path to the project root found by the bootstrap cell rather than
# to the kernel's working directory: the notebook may be started from a
# sub-folder, in which case cwd-relative paths would point at the wrong
# data/ and outputs/ directories and silently override the canonical ones.
try:
    BASE = PROJECT_ROOT
except NameError:
    # Bootstrap cell was not run in this kernel; keep the previous behaviour.
    BASE = Path.cwd().resolve()

DATA_PATH = BASE / "data" / "crop_yield_sample.csv"
OUT_MODELS = BASE / "outputs" / "models"
OUT_FIGS = BASE / "outputs" / "figures"
OUT_TEXT = BASE / "outputs"  # parent of the two folders created below
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FIGS.mkdir(parents=True, exist_ok=True)

print("Working dir:", Path.cwd().resolve())
print("Project root:", BASE)
print("Data path:", DATA_PATH)


ModuleNotFoundError: No module named 'src'

In [None]:
# Build the modelling frame in one expression:
# load -> basic clean -> add GDD feature -> encode categoricals.
df = encode_categoricals(
    add_gdd_feature(
        basic_clean(load_data(str(DATA_PATH))),
        base_temp=10.0,  # GenAI-inspired feature
    )
)
df.head()


In [None]:
# Split on the "Year" column with test_years=2.
# NOTE(review): presumably the two most recent years become the test set —
# confirm against src.data_preprocessing.time_based_split.
train_df, test_df = time_based_split(df, time_col="Year", test_years=2)
# select_features is unpacked into three values; judging by the names these are
# the feature matrix, the target and the list of feature names — TODO confirm.
X_train, y_train, feature_names = select_features(train_df)
# The training feature list is passed back in, intended to keep the test
# matrix on the same features as the training matrix.
X_test, y_test, _ = select_features(test_df, features=feature_names)
# Cell output: the feature names plus the row count of each split.
feature_names, len(train_df), len(test_df)


In [None]:
# Baseline model: linear regression trained on the training split and scored
# on the held-out split.
lr = train_linear_regression(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# evaluate gets the test target as a plain array (.values) and the predictions.
# metrics_lr is later unpacked with ** into the comparison table, so it is
# expected to be a mapping of metric name -> value.
metrics_lr = evaluate(y_test.values, y_pred_lr)
metrics_lr


In [None]:
# Compact hyperparameter grid for the random forest.
grid = suggested_rf_grid(small_data=True)

# Fit the grid search in this cell so that `rf_best` exists for the evaluation,
# plotting and saving cells further down; no other cell in this notebook
# defines it, so a fresh "Restart & Run All" would otherwise stop with NameError.
# NOTE(review): train_rf_gridsearch is imported from src.model and was presumably
# meant for this step; its signature is not visible from this notebook, so the
# search is spelled out with scikit-learn directly — swap the helper in if preferred.
# Assumes `grid` is a param_grid mapping of RandomForestRegressor arguments — TODO confirm.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

rf_search = GridSearchCV(
    RandomForestRegressor(random_state=42),  # fixed seed keeps reruns comparable
    param_grid=grid,
    cv=3,  # few folds: the grid was requested with small_data=True
)
rf_search.fit(X_train, y_train)
rf_best = rf_search.best_estimator_  # refit on the full training split (refit=True default)

# Cell output: the searched grid and the winning combination.
grid, rf_search.best_params_


In [None]:
# Score the tuned random forest on the held-out split.
y_pred_rf = rf_best.predict(X_test)
metrics_rf = evaluate(y_test.values, y_pred_rf)

# The plotting helpers receive target PNG paths under OUT_FIGS
# (presumably they save the figures there — confirm in src.visualization).
plot_residuals(y_test.values, y_pred_rf, str(OUT_FIGS / "residuals_rf_genai.png"))
plot_feature_importance(rf_best, feature_names, str(OUT_FIGS / "feature_importance_rf_genai.png"))

# NOTE(review): this narrative is a hand-written draft; it is not derived from
# the metrics or figures computed above, so re-check it after every rerun.
eda_narrative = """
### EDA Narrative (Draft)
- Residuals are mostly centered; slight spread at high predictions hints mild heteroskedasticity.
- Feature importance: rainfall and fertilizer dominate; temperature and GDD_approx contribute.
- Next: add soil/NDVI, try gradient boosting, use SHAP for explanations.
"""
narrative_path = OUT_TEXT / "eda_narrative_genai.md"
# Explicit encoding so the file is written identically on every platform.
narrative_path.write_text(eda_narrative, encoding="utf-8")
print("Saved narrative to", narrative_path)

# Last expression of the cell, so the metrics are actually displayed
# (a bare name in the middle of a cell produces no output).
metrics_rf


In [None]:
# Hand the tuned random forest to save_model together with a target path under
# outputs/models (presumably the helper serialises the model there — confirm
# in src.model.save_model).
save_path = OUT_MODELS / "genai_rf.pkl"
save_model(rf_best, str(save_path))  # the helper is given the path as a plain string
str(save_path)  # cell output: where the model file was requested


In [None]:
import pandas as pd

# Side-by-side comparison: one row per model, one column per metric.
results = pd.DataFrame(
    [
        {"Model": label, **scores}
        for label, scores in (
            ("Linear Regression (GDD)", metrics_lr),
            ("RF (GenAI-assisted grid)", metrics_rf),
        )
    ]
).set_index("Model")
results
