# Prototype A — Crop Yield Prediction (No GenAI)
Goal: Train traditional ML baselines (Linear Regression & Random Forest) to predict `Yield_kg_ha`
from seasonal features using a time-based split (train on earlier years, test on last 2 years).


In [16]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

import sys
from pathlib import Path

# Go up one level from notebooks/ to project root
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

print("Project root added to path:", project_root)
# ---- Path bootstrap: make 'src' importable and set project-level paths ----
import sys, os
from pathlib import Path

def locate_project_root() -> Path:
    """
    Find the directory that contains 'src/' (your project root)
    regardless of where the notebook is run from.
    """
    candidates = []

    # Current working dir and a few parents
    here = Path.cwd().resolve()
    candidates += [here, here.parent, here.parent.parent, here.parent.parent.parent]

    # If nbconvert defines __file__, add those parents too
    if '__file__' in globals():
        nb_dir = Path(__file__).resolve().parent
        candidates += [nb_dir, nb_dir.parent, nb_dir.parent.parent]

    # Walk up a few more levels just in case
    p = here
    for _ in range(5):
        candidates.append(p)
        p = p.parent

    seen = set()
    uniq = []
    for c in candidates:
        if c and str(c) not in seen:
            uniq.append(c)
            seen.add(str(c))

    for c in uniq:
        if (c / "src").is_dir():
            return c

    raise RuntimeError(
        "Could not locate project root containing 'src/'. "
        f"Tried: {[str(c) for c in uniq]}"
    )

PROJECT_ROOT = locate_project_root()

# Ensure project root is on sys.path so "from src... import ..." works
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Define canonical paths (ALWAYS relative to project root)
DATA_PATH  = PROJECT_ROOT / "data" / "crop_yield_sample.csv"
OUT_MODELS = PROJECT_ROOT / "outputs" / "models"
OUT_FIGS   = PROJECT_ROOT / "outputs" / "figures"

OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FIGS.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH:", DATA_PATH)
print("OUT_MODELS:", OUT_MODELS)
print("OUT_FIGS:", OUT_FIGS)
print("CSV exists?", DATA_PATH.exists())



from src.data_preprocessing import load_data, basic_clean, encode_categoricals, select_features, time_based_split
from src.model import train_linear_regression, train_random_forest, evaluate, save_model
from src.visualization import plot_residuals, plot_feature_importance
from sklearn.dummy import DummyRegressor

BASE = Path.cwd().resolve()
DATA_PATH = BASE / "data" / "crop_yield_sample.csv"
OUT_MODELS = BASE / "outputs" / "models"
OUT_FIGS = BASE / "outputs" / "figures"
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FIGS.mkdir(parents=True, exist_ok=True)

print("Working dir:", BASE)
print("Data path:", DATA_PATH)


Project root added to path: d:\2 Level\AI Business\Assignment 2\crop_yield_no_genai
PROJECT_ROOT: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai
DATA_PATH: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\data\crop_yield_sample.csv
OUT_MODELS: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\outputs\models
OUT_FIGS: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\outputs\figures
CSV exists? True
Working dir: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\notebooks
Data path: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\notebooks\data\crop_yield_sample.csv


In [17]:
# ---- Path bootstrap: find project root and set canonical paths ----
import sys
from pathlib import Path

def locate_project_root() -> Path:
    here = Path.cwd().resolve()
    # walk up until we find a folder that contains 'src'
    for p in [here, *here.parents]:
        if (p / "src").is_dir():
            return p
    raise RuntimeError(f"Couldn't find project root containing 'src' from {here}")

PROJECT_ROOT = locate_project_root()

# Ensure project root is importable so 'from src...' works
if str(PROJECT_ROOT) not in sys.path:
    sys.path.insert(0, str(PROJECT_ROOT))

# Define paths RELATIVE TO PROJECT ROOT (not cwd!)
DATA_PATH  = PROJECT_ROOT / "data" / "crop_yield_sample.csv"
OUT_MODELS = PROJECT_ROOT / "outputs" / "models"
OUT_FIGS   = PROJECT_ROOT / "outputs" / "figures"
OUT_MODELS.mkdir(parents=True, exist_ok=True)
OUT_FIGS.mkdir(parents=True, exist_ok=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("DATA_PATH:", DATA_PATH, "| exists?", DATA_PATH.exists())
print("OUT_MODELS:", OUT_MODELS)
print("OUT_FIGS:", OUT_FIGS)

# ---- Pipeline: load, clean, encode, split, select features ----
from src.data_preprocessing import (
    load_data, basic_clean, encode_categoricals,
    select_features, time_based_split
)

# 1) Load + clean
df = load_data(str(DATA_PATH))
df = basic_clean(df)
df = encode_categoricals(df)

# 2) Time-based split (robust: creates/derives Year if missing)
train_df, test_df = time_based_split(df, time_col="Year", test_years=2)

# 3) Feature selection (keeps same features for train & test)
X_train, y_train, feature_names = select_features(train_df)
X_test, y_test, _ = select_features(test_df, features=feature_names)

print("Rows → train/test:", len(train_df), len(test_df))
print("X shapes → train/test:", X_train.shape, X_test.shape)
print("Features used:", feature_names)
print("Train years:", sorted(train_df['Year'].unique())[:5], "...",
      "Test years:", sorted(test_df['Year'].unique())[:5])


PROJECT_ROOT: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai
DATA_PATH: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\data\crop_yield_sample.csv | exists? True
OUT_MODELS: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\outputs\models
OUT_FIGS: D:\2 Level\AI Business\Assignment 2\crop_yield_no_genai\outputs\figures
Rows → train/test: 6 2
X shapes → train/test: (6, 3) (2, 3)
Features used: ['Rainfall_mm', 'Temperature_C', 'Fertilizer_kg']
Train years: [np.int64(2016), np.int64(2017), np.int64(2018)] ... Test years: [np.int64(2019), np.int64(2020)]


In [18]:
# df = load_data(str(DATA_PATH))
df.shape, df.head()


((8, 4),
    Rainfall_mm  Temperature_C  Fertilizer_kg  Yield_kg_ha
 0          350             18             80         2800
 1          420             20            100         3200
 2          500             22            110         3500
 3          600             25            120         3800
 4          700             27            130         3700)

In [19]:
df = basic_clean(df)
df = encode_categoricals(df)
df.head()


Unnamed: 0,Rainfall_mm,Temperature_C,Fertilizer_kg,Yield_kg_ha
0,350,18,80,2800
1,420,20,100,3200
2,500,22,110,3500
3,600,25,120,3800
4,700,27,130,3700


In [20]:
train_df, test_df = time_based_split(df, time_col="Year", test_years=2)
len(train_df), len(test_df), int(train_df['Year'].max()), int(test_df['Year'].min())


(6, 2, 2018, 2019)

In [21]:
X_train, y_train, feature_names = select_features(train_df)
X_test, y_test, _ = select_features(test_df, features=feature_names)
feature_names


['Rainfall_mm', 'Temperature_C', 'Fertilizer_kg']

In [22]:
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)
metrics_dummy = evaluate(y_test.values, y_pred_dummy)
metrics_dummy


{'MAE': 583.3333333333335, 'RMSE': 585.4722690083433, 'R2': -136.1111111111112}

In [23]:
lr = train_linear_regression(X_train, y_train)
y_pred_lr = lr.predict(X_test)
metrics_lr = evaluate(y_test.values, y_pred_lr)
metrics_lr
plot_residuals(y_test.values, y_pred_lr, str(OUT_FIGS / "residuals_lr.png"))


In [24]:
rf = train_random_forest(X_train, y_train, n_estimators=300, max_depth=None, min_samples_split=2, random_state=42)
y_pred_rf = rf.predict(X_test)
metrics_rf = evaluate(y_test.values, y_pred_rf)
metrics_rf
plot_residuals(y_test.values, y_pred_rf, str(OUT_FIGS / "residuals_rf.png"))
plot_feature_importance(rf, feature_names, str(OUT_FIGS / "feature_importance_rf.png"))


In [25]:
results = pd.DataFrame([
    {"Model": "Naïve (mean)", **metrics_dummy},
    {"Model": "Linear Regression", **metrics_lr},
    {"Model": "Random Forest", **metrics_rf},
]).set_index("Model")
results


Unnamed: 0_level_0,MAE,RMSE,R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Naïve (mean),583.333333,585.472269,-136.111111
Linear Regression,214.967885,275.753075,-29.415903
Random Forest,223.0,232.84759,-20.6872


In [26]:
best_name = results['RMSE'].idxmin()
best_name


'Random Forest'

In [27]:
if best_name == "Random Forest":
    save_path = OUT_MODELS / "baseline_rf.pkl"
    save_model(rf, str(save_path))
elif best_name == "Linear Regression":
    save_path = OUT_MODELS / "baseline_lr.pkl"
    save_model(lr, str(save_path))
else:
    save_path = OUT_MODELS / "baseline_lr.pkl"
    save_model(lr, str(save_path))

save_path


WindowsPath('D:/2 Level/AI Business/Assignment 2/crop_yield_no_genai/outputs/models/baseline_rf.pkl')