# NO_ANN

In [None]:
# --- Imports ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# =========================
# Project root 
# =========================
def detect_project_root():
    """Locate the project root directory.

    Resolution order:
      1. The ``CF_CONT_ROOT`` environment variable, when set and non-empty
         (``~`` is expanded and the path is resolved).
      2. The first path, walking upward from the starting path, that is
         either named ``CF_Continuous`` or contains both
         ``data/data_20230504`` and ``codes``.
      3. The starting path itself when nothing matches.

    The starting path is this file's resolved path, or the current working
    directory when ``__file__`` is undefined. Note that in the former case
    the fallback in step 3 is the file path itself, not its directory.

    Returns:
        pathlib.Path: the detected root.
    """
    override = os.environ.get("CF_CONT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    try:
        start = Path(__file__).resolve()
    except NameError:
        start = Path.cwd().resolve()

    def _looks_like_root(candidate):
        # Either the well-known folder name or the expected folder layout.
        if candidate.name == "CF_Continuous":
            return True
        return (candidate / "data" / "data_20230504").exists() and (candidate / "codes").exists()

    # Nearest match wins; fall back to the starting path.
    return next((c for c in (start, *start.parents) if _looks_like_root(c)), start)

# Root directory from which every input and output path below is built.
PROJ_ROOT = detect_project_root()

# =========================
# Paths
# =========================
# Directory holding the train/test split CSVs.
SPLIT_DIR = PROJ_ROOT / "data" / "train_test_split"
# R .rds inputs (read with pyreadr further down).
DATA_2ND_STAGE_RDS = PROJ_ROOT / "data" / "data_20230504" / "data_2nd_stage.rds"
EVALL_N_SEQ_RDS    = PROJ_ROOT / "data" / "data_20230504" / "evall_N_seq.rds"

# =========================
# Output
# =========================
model_tag   = "NO_ANN"         # part of the results directory name
n_fields    = 5                # choose 1, 5, or 10; selects the split CSV
run_test_id = 1                # run only this test_id
# Readable directory suffix per n_fields; other values fall back to "<n>_fields".
fields_label_map = {1: "one_field", 5: "five_fields", 10: "ten_fields"}
fields_label = fields_label_map.get(n_fields, f"{n_fields}_fields")

RESULTS_BASE = PROJ_ROOT / "results" / "yield_response_function_for_one_iteration"
RESULTS_DIR  = RESULTS_BASE / f"YRF_{model_tag}_{fields_label}"
# Side effect at import/run time: the results directory is created if missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Model: Simple ANN (NO ANN) ===
class MyModel(nn.Module):
    """Fully connected regressor: 4 inputs, three 64-unit ReLU layers, 1 output."""

    def __init__(self):
        super().__init__()
        widths = (4, 64, 64, 64)
        stack = []
        # One Linear + ReLU pair per consecutive pair of widths.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        # Linear output head, no activation.
        stack.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.net(x)

# --- Load data ---
def _first_rds_object(rds_path):
    """Return the first object that pyreadr reads from the .rds file at *rds_path*."""
    return next(iter(pyreadr.read_r(str(rds_path)).values()))


data_2nd_stage = _first_rds_object(DATA_2ND_STAGE_RDS)
evall_N_seq = _first_rds_object(EVALL_N_SEQ_RDS)

# Coerce the listed columns (when present) to numeric; unparseable values become NaN.
_numeric_columns = (
    (data_2nd_stage, ('yield', 'Nk', 'plateau', 'b0', 'N')),
    (evall_N_seq, ('sim', 'N')),
)
for _frame, _names in _numeric_columns:
    for _name in _names:
        if _name in _frame.columns:
            _frame[_name] = pd.to_numeric(_frame[_name], errors='coerce')

# Clean: drop rows that lack any required value, then renumber the rows.
_required = ['yield', 'Nk', 'plateau', 'b0', 'N']
data_2nd_stage = data_2nd_stage.dropna(subset=_required).reset_index(drop=True)
evall_N_seq = evall_N_seq.dropna(subset=['N']).reset_index(drop=True)

# --- Load split CSV and restrict to one test_id ---
split_csv_path = SPLIT_DIR / f"train_test_splits_{n_fields}fields.csv"
_all_splits = pd.read_csv(split_csv_path)
# Keep at most one row: the first whose test_id equals run_test_id.
splits_df = _all_splits.loc[_all_splits['test_id'] == run_test_id].head(1).copy()

# Features
feature_cols = ['Nk', 'plateau', 'b0', 'N']

# === Loop (only one iteration) ===
# For the selected split: train the ANN on the training sims, save validation
# predictions, then predict yield over the evaluation N sequence for every row
# of the test sim.
for _, row in tqdm(splits_df.iterrows(), total=len(splits_df), desc="Processing test_id"):
    test_id = int(row['test_id'])
    # Every column whose name starts with 'train_' holds one training sim id.
    train_ids = row[[c for c in row.index if c.startswith('train_')]].values

    # ------- Train/val -------
    dataset = data_2nd_stage[data_2nd_stage['sim'].isin(train_ids)].reset_index(drop=True)
    dataset = dataset[['yield'] + feature_cols].copy()

    # Seeded 80/20 split; the index was reset above, so dropping the sampled
    # labels leaves exactly the remaining rows for validation.
    train_df = dataset.sample(frac=0.8, random_state=0)
    val_df   = dataset.drop(train_df.index)

    X_train = train_df.drop('yield', axis=1)
    y_train = train_df['yield'].to_numpy().reshape(-1, 1)
    X_val   = val_df.drop('yield', axis=1)
    y_val   = val_df['yield'].to_numpy().reshape(-1, 1)

    # Standardise with statistics from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
    y_train_t = torch.tensor(y_train,       dtype=torch.float32, device=device)
    X_val_t   = torch.tensor(X_val_scaled,  dtype=torch.float32, device=device)
    y_val_t   = torch.tensor(y_val,         dtype=torch.float32, device=device)

    train_ds = TensorDataset(X_train_t, y_train_t)
    train_ld = DataLoader(train_ds, batch_size=512, shuffle=True)

    model = MyModel().to(device)
    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val = float('inf')
    best_state = None          # weights at the lowest validation loss so far
    patience = 10
    counter = 0
    max_epochs = 500

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_ld:
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_t)
            val_loss = criterion(val_preds, y_val_t).item()

        if val_loss < best_val:
            best_val = val_loss
            counter = 0
            # state_dict() returns references to the live tensors, so clone
            # them; otherwise later optimizer steps would overwrite the snapshot.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stop at epoch {epoch} (sim {test_id})")
                break

    # Early stopping ends `patience` epochs after the best one; go back to the
    # best weights so the saved predictions match `best_val`.
    if best_state is not None:
        model.load_state_dict(best_state)

    # ------- Save validation preds -------
    model.eval()
    with torch.no_grad():
        val_out = model(X_val_t).cpu().numpy().flatten()
    pd.DataFrame({'pred': val_out, 'true': y_val.flatten()}).to_csv(
        RESULTS_DIR / f'validation_{test_id}.csv', index=False
    )

    # ------- Yield response function (NO EONR) -------
    test_df   = data_2nd_stage[data_2nd_stage['sim'] == test_id].reset_index(drop=True)
    base_feats = test_df[['Nk', 'plateau', 'b0']].reset_index(drop=True)

    # N sequence for this sim
    eval_seq = evall_N_seq[evall_N_seq['sim'] == test_id].reset_index(drop=True)
    if eval_seq.empty:
        # Leave a marker file so the missing sequence is visible in the results.
        (RESULTS_DIR / f'yield_response_{test_id}_EMPTY_EVAL_SEQ.csv').write_text(
            "No eval N sequence found for this sim\n"
        )
        continue

    Nseq = eval_seq['N'].to_numpy()
    L = len(Nseq)

    # Identifier columns copied to the output when the data has them.
    id_cols = [c for c in ['aunit_id', 'cell_id', 'field_id'] if c in test_df.columns]

    all_preds = []
    with torch.no_grad():
        for i in range(len(base_feats)):
            base = base_feats.iloc[[i]]                              # 1×3 (Nk, plateau, b0)
            repeated = pd.concat([base] * L, ignore_index=True)      # L×3
            full_feat = pd.concat([repeated, eval_seq[['N']]], axis=1)  # L×4
            # Same column order the scaler was fitted with.
            full_feat = full_feat[['Nk', 'plateau', 'b0', 'N']]

            X_feat = torch.tensor(scaler.transform(full_feat), dtype=torch.float32, device=device)
            y_hat  = model(X_feat).cpu().numpy().reshape(-1)

            # Named row_dict so the outer loop variable `row` is not rebound.
            row_dict = {
                'sim':        [test_id] * L,
                'row_id':     [i] * L,
                'N':          Nseq,
                'pred_yield': y_hat
            }
            for col in id_cols:
                row_dict[col] = [test_df.loc[i, col]] * L

            all_preds.append(pd.DataFrame(row_dict))

    df_out = pd.concat(all_preds, ignore_index=True)
    df_out.to_csv(RESULTS_DIR / f'yield_response_{test_id}.csv', index=False)


Processing test_id:   0%|          | 0/1 [00:00<?, ?it/s]

Early stop at epoch 336 (sim 1)


Processing test_id: 100%|██████████| 1/1 [02:26<00:00, 146.01s/it]


# NO_RF

In [None]:
# --- Imports ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, GridSearchCV

# =========================
# Project root 
# =========================
def detect_project_root():
    """Locate the project root directory.

    Resolution order:
      1. The ``CF_CONT_ROOT`` environment variable, when set and non-empty
         (``~`` is expanded and the path is resolved).
      2. The first path, walking upward from the starting path, that is
         either named ``CF_Continuous`` or contains both
         ``data/data_20230504`` and ``codes``.
      3. The starting path itself when nothing matches.

    The starting path is this file's resolved path, or the current working
    directory when ``__file__`` is undefined (e.g., notebooks). Note that in
    the former case the fallback in step 3 is the file path itself.

    Returns:
        pathlib.Path: the detected root.
    """
    override = os.environ.get("CF_CONT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    try:
        start = Path(__file__).resolve()
    except NameError:  # e.g., notebooks
        start = Path.cwd().resolve()

    def _looks_like_root(candidate):
        # Either the well-known folder name or the expected folder layout.
        if candidate.name == "CF_Continuous":
            return True
        return (candidate / "data" / "data_20230504").exists() and (candidate / "codes").exists()

    # Nearest match wins; fall back to the starting path.
    return next((c for c in (start, *start.parents) if _looks_like_root(c)), start)

# Root directory from which every input and output path below is built.
PROJ_ROOT = detect_project_root()

# =========================
# Paths
# =========================
# Directory holding the train/test split CSVs.
SPLIT_DIR          = PROJ_ROOT / "data" / "train_test_split"
# R .rds inputs (read with pyreadr further down).
DATA_2ND_STAGE_RDS = PROJ_ROOT / "data" / "data_20230504" / "data_2nd_stage.rds"
EVALL_N_SEQ_RDS    = PROJ_ROOT / "data" / "data_20230504" / "evall_N_seq.rds"

# Output
model_tag   = "NO_RF"          # part of the results directory name
n_fields    = 5                # choose 1, 5, or 10; selects the split CSV
run_test_id = 1                # run only this test_id
# Readable directory suffix per n_fields; other values fall back to "<n>_fields".
fields_label_map = {1: "one_field", 5: "five_fields", 10: "ten_fields"}
fields_label = fields_label_map.get(n_fields, f"{n_fields}_fields")

RESULTS_BASE = PROJ_ROOT / "results" / "yield_response_function_for_one_iteration"
RESULTS_DIR  = RESULTS_BASE / f"YRF_{model_tag}_{fields_label}"
# Side effect at import/run time: the results directory is created if missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# === Load data from .rds ===
def _first_rds_object(rds_path):
    """Return the first object that pyreadr reads from the .rds file at *rds_path*."""
    return next(iter(pyreadr.read_r(str(rds_path)).values()))


data_2nd_stage = _first_rds_object(DATA_2ND_STAGE_RDS)
evall_N_seq = _first_rds_object(EVALL_N_SEQ_RDS)

# Coerce the listed columns (when present) to numeric; unparseable values become NaN.
_numeric_columns = (
    (data_2nd_stage, ('yield', 'Nk', 'plateau', 'b0', 'N')),
    (evall_N_seq, ('sim', 'N')),
)
for _frame, _names in _numeric_columns:
    for _name in _names:
        if _name in _frame.columns:
            _frame[_name] = pd.to_numeric(_frame[_name], errors='coerce')

# Drop rows with NaNs in key columns, then renumber the rows.
_required = ['yield', 'Nk', 'plateau', 'b0', 'N']
data_2nd_stage = data_2nd_stage.dropna(subset=_required).reset_index(drop=True)
evall_N_seq = evall_N_seq.dropna(subset=['N']).reset_index(drop=True)

# === Load split CSV and restrict to one test_id ===
split_csv_path = SPLIT_DIR / f"train_test_splits_{n_fields}fields.csv"
_all_splits = pd.read_csv(split_csv_path)
# Keep at most one row: the first whose test_id equals run_test_id.
splits_df = _all_splits.loc[_all_splits['test_id'] == run_test_id].head(1).copy()

# === Features for training ===
feature_cols = ['Nk', 'plateau', 'b0', 'N']

# === Loop (only one iteration) ===
# For the selected split: tune a random forest on the training sims, save
# validation predictions, then predict yield over the evaluation N sequence
# for every row of the test sim.
for _, row in tqdm(splits_df.iterrows(), total=len(splits_df), desc="Processing test_id"):
    test_id = int(row['test_id'])
    # Every column whose name starts with 'train_' holds one training sim id.
    train_ids = row[[c for c in row.index if c.startswith('train_')]].values

    # -------------------------
    # train/val data
    # -------------------------
    dataset = data_2nd_stage[data_2nd_stage['sim'].isin(train_ids)].reset_index(drop=True)
    dataset = dataset[['yield'] + feature_cols].copy()

    # Random 80/20 split (seeded); the index was reset above, so dropping the
    # sampled labels leaves exactly the remaining rows for validation.
    train_df = dataset.sample(frac=0.8, random_state=0)
    val_df   = dataset.drop(train_df.index)

    X_train = train_df.drop('yield', axis=1)
    y_train = train_df['yield']
    X_val   = val_df.drop('yield', axis=1)
    y_val   = val_df['yield']

    # Standardise with statistics from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    # -------------------------
    # Random Forest with CV
    # -------------------------
    param_grid = {
        'max_depth':    [3, 5],
        'n_estimators': [50, 250, 500, 1000],
        'max_features': [1, 2, 3],
    }
    rf = RandomForestRegressor(random_state=777)
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=777)
    grid = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    grid.fit(X_train_scaled, y_train)
    model = grid.best_estimator_

    # -------------------------
    # Save validation predictions
    # -------------------------
    val_preds = model.predict(X_val_scaled)
    pd.DataFrame({'pred': val_preds, 'true': y_val.values}).to_csv(
        RESULTS_DIR / f'validation_{test_id}.csv', index=False
    )

    # -------------------------
    # Yield response function (no EONR)
    # -------------------------
    test_df  = data_2nd_stage[data_2nd_stage['sim'] == test_id].reset_index(drop=True)
    features = test_df[['Nk', 'plateau', 'b0']].reset_index(drop=True)

    # N sequence for THIS test_id
    eval_seq = evall_N_seq[evall_N_seq['sim'] == test_id].reset_index(drop=True)
    if eval_seq.empty:
        # Leave a marker file so the missing sequence is visible in the results.
        (RESULTS_DIR / f'yield_response_{test_id}_EMPTY_EVAL_SEQ.csv').write_text(
            "No eval N sequence found for this sim\n"
        )
        continue

    Nseq = eval_seq['N'].to_numpy()
    L = len(Nseq)
    n_rows = len(features)
    if n_rows == 0:
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError(f"No rows in data_2nd_stage for test sim {test_id}")

    # Identifier columns copied to the output when the data has them.
    id_cols = [c for c in ['aunit_id', 'cell_id', 'field_id'] if c in test_df.columns]

    # Build every (test row, N) combination at once instead of one small frame
    # per test row: row r occupies positions r*L .. r*L + L - 1, paired with
    # the full N sequence. One transform + one predict replaces n_rows calls.
    row_idx = np.repeat(np.arange(n_rows), L)
    N_grid  = np.tile(Nseq, n_rows)
    full_feat = features.iloc[row_idx].reset_index(drop=True).assign(N=N_grid)
    # Same column order the scaler was fitted with.
    full_feat = full_feat[['Nk', 'plateau', 'b0', 'N']]

    X_feat = scaler.transform(full_feat)
    preds  = model.predict(X_feat)

    out_cols = {
        'sim':        np.full(n_rows * L, test_id),
        'row_id':     row_idx,
        'N':          N_grid,
        'pred_yield': preds
    }
    for col in id_cols:
        # test_df has a fresh 0..n-1 index, so positions equal labels.
        out_cols[col] = test_df[col].to_numpy()[row_idx]

    df_out = pd.DataFrame(out_cols)
    df_out.to_csv(RESULTS_DIR / f'yield_response_{test_id}.csv', index=False)

# --- Note ---
# If test_df has 1,440 rows and N sequence has 100 values → output CSV has 144,000 rows.


Processing test_id: 100%|██████████| 1/1 [01:12<00:00, 72.74s/it]


# SO_ANN

In [None]:
# --- Imports ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# =========================
# Project root 
# =========================
def detect_project_root():
    """Locate the project root directory.

    Resolution order:
      1. The ``CF_CONT_ROOT`` environment variable, when set and non-empty
         (``~`` is expanded and the path is resolved).
      2. The first path, walking upward from the starting path, that is
         either named ``CF_Continuous`` or contains both
         ``data/data_20230504`` and ``codes``.
      3. The starting path itself when nothing matches.

    The starting path is this file's resolved path, or the current working
    directory when ``__file__`` is undefined. Note that in the former case
    the fallback in step 3 is the file path itself, not its directory.

    Returns:
        pathlib.Path: the detected root.
    """
    override = os.environ.get("CF_CONT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    try:
        start = Path(__file__).resolve()
    except NameError:
        start = Path.cwd().resolve()

    def _looks_like_root(candidate):
        # Either the well-known folder name or the expected folder layout.
        if candidate.name == "CF_Continuous":
            return True
        return (candidate / "data" / "data_20230504").exists() and (candidate / "codes").exists()

    # Nearest match wins; fall back to the starting path.
    return next((c for c in (start, *start.parents) if _looks_like_root(c)), start)

# Root directory from which every input and output path below is built.
PROJ_ROOT = detect_project_root()

# =========================
# Paths
# =========================
# Directory holding the train/test split CSVs.
SPLIT_DIR = PROJ_ROOT / "data" / "train_test_split"
# R .rds inputs (read with pyreadr further down).
DATA_2ND_STAGE_RDS = PROJ_ROOT / "data" / "data_20230504" / "data_2nd_stage.rds"
EVALL_N_SEQ_RDS    = PROJ_ROOT / "data" / "data_20230504" / "evall_N_seq.rds"

# =========================
# Output
# =========================
model_tag   = "SO_ANN"         # part of the results directory name
n_fields    = 10               # choose 1, 5, or 10; selects the split CSV
run_test_id = 1                # run only this test_id
# Readable directory suffix per n_fields; other values fall back to "<n>_fields".
fields_label_map = {1: "one_field", 5: "five_fields", 10: "ten_fields"}
fields_label = fields_label_map.get(n_fields, f"{n_fields}_fields")

RESULTS_BASE = PROJ_ROOT / "results" / "yield_response_function_for_one_iteration"
RESULTS_DIR  = RESULTS_BASE / f"YRF_{model_tag}_{fields_label}"
# Side effect at import/run time: the results directory is created if missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Model  ===
class MyModel(nn.Module):
    """Fully connected regressor: 4 inputs, three 64-unit ReLU layers, 1 output."""

    def __init__(self):
        super().__init__()
        hidden = 64
        modules = [nn.Linear(4, hidden), nn.ReLU()]
        # Two further hidden blocks of the same width.
        for _ in range(2):
            modules += [nn.Linear(hidden, hidden), nn.ReLU()]
        # Linear output head, no activation.
        modules.append(nn.Linear(hidden, 1))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        return self.net(x)

# === Load data from .rds ===
def _first_rds_object(rds_path):
    """Return the first object that pyreadr reads from the .rds file at *rds_path*."""
    return next(iter(pyreadr.read_r(str(rds_path)).values()))


data_2nd_stage = _first_rds_object(DATA_2ND_STAGE_RDS)
evall_N_seq = _first_rds_object(EVALL_N_SEQ_RDS)

# Coerce the listed columns (when present) to numeric; unparseable values become NaN.
_numeric_columns = (
    (data_2nd_stage, ('y_tilde', 'Nk', 'plateau', 'b0', 'N')),
    (evall_N_seq, ('sim', 'N')),
)
for _frame, _names in _numeric_columns:
    for _name in _names:
        if _name in _frame.columns:
            _frame[_name] = pd.to_numeric(_frame[_name], errors='coerce')

# Drop rows with NaNs, then renumber the rows.
_required = ['y_tilde', 'Nk', 'plateau', 'b0', 'N']
data_2nd_stage = data_2nd_stage.dropna(subset=_required).reset_index(drop=True)
evall_N_seq = evall_N_seq.dropna(subset=['N']).reset_index(drop=True)

# === Load split CSV and restrict to one test_id ===
split_csv_path = SPLIT_DIR / f"train_test_splits_{n_fields}fields.csv"
_all_splits = pd.read_csv(split_csv_path)
# Keep at most one row: the first whose test_id equals run_test_id.
splits_df = _all_splits.loc[_all_splits['test_id'] == run_test_id].head(1).copy()

# === Features for training ===
feature_cols = ['Nk', 'plateau', 'b0', 'N']

# === Loop (only one iteration) ===
# For the selected split: train the ANN on y_tilde for the training sims, save
# validation predictions, then predict over the evaluation N sequence for
# every row of the test sim.
for _, row in tqdm(splits_df.iterrows(), total=len(splits_df), desc="Processing test_id"):
    test_id = int(row['test_id'])
    # Every column whose name starts with 'train_' holds one training sim id.
    train_ids = row[[c for c in row.index if c.startswith('train_')]].values

    # -------------------------
    # train/val data
    # -------------------------
    dataset = data_2nd_stage[data_2nd_stage['sim'].isin(train_ids)].reset_index(drop=True)
    dataset = dataset[['y_tilde'] + feature_cols].copy()

    # Random 80/20 split (seeded); the index was reset above, so dropping the
    # sampled labels leaves exactly the remaining rows for validation.
    train_df = dataset.sample(frac=0.8, random_state=0)
    val_df   = dataset.drop(train_df.index)

    X_train = train_df.drop('y_tilde', axis=1)
    y_train = train_df['y_tilde'].to_numpy().reshape(-1, 1)
    X_val   = val_df.drop('y_tilde', axis=1)
    y_val   = val_df['y_tilde'].to_numpy().reshape(-1, 1)

    # Standardise with statistics from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
    y_train_t = torch.tensor(y_train,       dtype=torch.float32, device=device)
    X_val_t   = torch.tensor(X_val_scaled,  dtype=torch.float32, device=device)
    y_val_t   = torch.tensor(y_val,         dtype=torch.float32, device=device)

    train_ds = TensorDataset(X_train_t, y_train_t)
    train_ld = DataLoader(train_ds, batch_size=512, shuffle=True)

    model = MyModel().to(device)
    criterion = nn.L1Loss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val = float('inf')
    best_state = None          # weights at the lowest validation loss so far
    patience = 10
    counter = 0
    max_epochs = 500

    for epoch in range(max_epochs):
        model.train()
        for xb, yb in train_ld:
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_preds = model(X_val_t)
            val_loss = criterion(val_preds, y_val_t).item()

        if val_loss < best_val:
            best_val = val_loss
            counter = 0
            # state_dict() returns references to the live tensors, so clone
            # them; otherwise later optimizer steps would overwrite the snapshot.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stop at epoch {epoch} (sim {test_id})")
                break

    # Early stopping ends `patience` epochs after the best one; go back to the
    # best weights so the saved predictions match `best_val`.
    if best_state is not None:
        model.load_state_dict(best_state)

    # -------------------------
    # Save validation predictions
    # -------------------------
    model.eval()
    with torch.no_grad():
        val_out = model(X_val_t).cpu().numpy().flatten()
    pd.DataFrame({'pred': val_out, 'true': y_val.flatten()}).to_csv(
        RESULTS_DIR / f'validation_{test_id}.csv', index=False
    )

    # -------------------------
    # Yield response function (NO EONR)
    # -------------------------
    test_df  = data_2nd_stage[data_2nd_stage['sim'] == test_id].reset_index(drop=True)
    base_feats = test_df[['Nk', 'plateau', 'b0']].reset_index(drop=True)

    # N sequence for THIS test_id
    eval_seq = evall_N_seq[evall_N_seq['sim'] == test_id].reset_index(drop=True)
    if eval_seq.empty:
        # Leave a marker file so the missing sequence is visible in the results.
        (RESULTS_DIR / f'yield_response_{test_id}_EMPTY_EVAL_SEQ.csv').write_text(
            "No eval N sequence found for this sim\n"
        )
        continue

    Nseq = eval_seq['N'].to_numpy()
    L = len(Nseq)

    # Identifier columns copied to the output when the data has them.
    id_cols = [c for c in ['aunit_id', 'cell_id', 'field_id'] if c in test_df.columns]

    all_preds = []
    with torch.no_grad():
        for i in range(len(base_feats)):
            base = base_feats.iloc[[i]]                              # 1×3 (Nk, plateau, b0)
            repeated = pd.concat([base] * L, ignore_index=True)      # L×3
            full_feat = pd.concat([repeated, eval_seq[['N']]], axis=1)  # L×4
            # Same column order the scaler was fitted with.
            full_feat = full_feat[['Nk', 'plateau', 'b0', 'N']]

            X_feat = torch.tensor(scaler.transform(full_feat), dtype=torch.float32, device=device)
            y_hat  = model(X_feat).cpu().numpy().reshape(-1)

            row_dict = {
                'sim':        [test_id] * L,
                'row_id':     [i] * L,
                'N':          Nseq,
                'pred_yield': y_hat
            }
            for col in id_cols:
                row_dict[col] = [test_df.loc[i, col]] * L

            all_preds.append(pd.DataFrame(row_dict))

    df_out = pd.concat(all_preds, ignore_index=True)
    df_out.to_csv(RESULTS_DIR / f'yield_response_{test_id}.csv', index=False)




Processing test_id:   0%|          | 0/1 [00:00<?, ?it/s]

Early stop at epoch 143 (sim 1)


Processing test_id: 100%|██████████| 1/1 [02:07<00:00, 127.07s/it]


# DO_ANN

In [None]:
# --- Imports ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.optim as optim

# =========================
# Project root 
# =========================
def detect_project_root():
    """Locate the project root directory.

    Resolution order:
      1. The ``CF_CONT_ROOT`` environment variable, when set and non-empty
         (``~`` is expanded and the path is resolved).
      2. The first path, walking upward from the starting path, that is
         either named ``CF_Continuous`` or contains both
         ``data/data_20230504`` and ``codes``.
      3. The starting path itself when nothing matches.

    The starting path is this file's resolved path, or the current working
    directory when ``__file__`` is undefined. Note that in the former case
    the fallback in step 3 is the file path itself, not its directory.

    Returns:
        pathlib.Path: the detected root.
    """
    override = os.environ.get("CF_CONT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    try:
        start = Path(__file__).resolve()
    except NameError:
        start = Path.cwd().resolve()

    def _looks_like_root(candidate):
        # Either the well-known folder name or the expected folder layout.
        if candidate.name == "CF_Continuous":
            return True
        return (candidate / "data" / "data_20230504").exists() and (candidate / "codes").exists()

    # Nearest match wins; fall back to the starting path.
    return next((c for c in (start, *start.parents) if _looks_like_root(c)), start)

# Root directory from which every input and output path below is built.
PROJ_ROOT = detect_project_root()

# =========================
# Paths
# =========================
# Directory holding the train/test split CSVs.
SPLIT_DIR = PROJ_ROOT / "data" / "train_test_split"
# R .rds inputs (read with pyreadr further down).
DATA_2ND_STAGE_RDS = PROJ_ROOT / "data" / "data_20230504" / "data_2nd_stage.rds"
EVALL_N_SEQ_RDS    = PROJ_ROOT / "data" / "data_20230504" / "evall_N_seq.rds"

# =========================
# Output
# =========================
model_tag   = "DO_ANN"          # part of the results directory name
n_fields    = 5                 # choose 1, 5, or 10; selects the split CSV
run_test_id = 1                 # run only this test_id
# Readable directory suffix per n_fields; other values fall back to "<n>_fields".
fields_label_map = {1: "one_field", 5: "five_fields", 10: "ten_fields"}
fields_label = fields_label_map.get(n_fields, f"{n_fields}_fields")

RESULTS_BASE = PROJ_ROOT / "results" / "yield_response_function_for_one_iteration"
RESULTS_DIR  = RESULTS_BASE / f"YRF_{model_tag}_{fields_label}"
# Side effect at import/run time: the results directory is created if missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Model  ===
class MyModel(nn.Module):
    """Network whose output is a learned coefficient times the fourth input column.

    ``branch`` maps the first three input columns (Nk, plateau, b0) through
    two 64-unit ReLU layers to a single value; ``forward`` multiplies that
    value by input column 3 (the scaled N_tilde).
    """

    def __init__(self):
        super().__init__()
        hidden = 64
        self.branch = nn.Sequential(
            nn.Linear(3, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, 1),
        )

    def forward(self, inp):
        """Return ``branch(inp[:, :3]) * inp[:, 3:4]``."""
        covariates = inp[:, 0:3]   # Nk, plateau, b0
        n_column = inp[:, 3:4]     # N_tilde (scaled), kept 2-D
        coefficient = self.branch(covariates)
        return coefficient * n_column

# === Load data from .rds ===
# Each .rds file is read with pyreadr and the first object it returns is used.
data_2nd_stage = next(iter(pyreadr.read_r(str(DATA_2ND_STAGE_RDS)).values()))
evall_N_seq    = next(iter(pyreadr.read_r(str(EVALL_N_SEQ_RDS)).values()))

# Coerce the listed columns (when present in either table) to numeric;
# unparseable values become NaN.
for c in ['y_tilde', 'Nk', 'plateau', 'b0', 'N_tilde', 'N']:
    if c in data_2nd_stage.columns:
        data_2nd_stage[c] = pd.to_numeric(data_2nd_stage[c], errors='coerce')
    if c in evall_N_seq.columns:
        evall_N_seq[c] = pd.to_numeric(evall_N_seq[c], errors='coerce')
# 'sim' of the N-sequence table is compared with the integer test_id below, so
# coerce it too (the NO_ANN / NO_RF / SO_ANN / SO_RF cells already do this).
if 'sim' in evall_N_seq.columns:
    evall_N_seq['sim'] = pd.to_numeric(evall_N_seq['sim'], errors='coerce')

# Clean: drop rows that lack any value needed for training.
data_2nd_stage = data_2nd_stage.dropna(subset=['y_tilde','Nk','plateau','b0','N_tilde']).reset_index(drop=True)
# The evaluation table must provide the sim key and the N_tilde sequence.
need_eval_cols = ['sim', 'N_tilde']
if not set(need_eval_cols).issubset(evall_N_seq.columns):
    raise ValueError("evall_N_seq must contain columns: 'sim' and 'N_tilde'")
evall_N_seq = evall_N_seq.dropna(subset=['N_tilde']).reset_index(drop=True)

# === Load split CSV and restrict to one test_id ===
split_csv_path = SPLIT_DIR / f"train_test_splits_{n_fields}fields.csv"
splits_df = pd.read_csv(split_csv_path)
# Keep at most one row: the first whose test_id equals run_test_id.
splits_df = splits_df[splits_df['test_id'] == run_test_id].iloc[:1].copy()

# === Features for training ===
feature_cols = ['Nk', 'plateau', 'b0', 'N_tilde']

# === Loop (only one iteration) ===
# For the selected split: train the model on y_tilde for the training sims,
# save validation predictions, then predict over the evaluation N_tilde
# sequence for every row of the test sim.
for _, row in tqdm(splits_df.iterrows(), total=len(splits_df), desc="Processing test_id"):
    test_id = int(row['test_id'])
    # Every column whose name starts with 'train_' holds one training sim id.
    train_ids = row[[c for c in row.index if c.startswith('train_')]].values

    # -------------------------
    # train/val data
    # -------------------------
    dataset = data_2nd_stage[data_2nd_stage['sim'].isin(train_ids)].reset_index(drop=True)
    dataset = dataset[['y_tilde'] + feature_cols].copy()

    # Random 80/20 split (reproducible); the index was reset above, so dropping
    # the sampled labels leaves exactly the remaining rows for validation.
    train_df = dataset.sample(frac=0.8, random_state=0)
    val_df   = dataset.drop(train_df.index)

    X_train = train_df[feature_cols]
    y_train = train_df['y_tilde'].to_numpy().reshape(-1, 1)
    X_val   = val_df[feature_cols]
    y_val   = val_df['y_tilde'].to_numpy().reshape(-1, 1)

    # Standardise with statistics from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    X_train_t = torch.tensor(X_train_scaled, dtype=torch.float32, device=device)
    y_train_t = torch.tensor(y_train,       dtype=torch.float32, device=device)
    X_val_t   = torch.tensor(X_val_scaled,  dtype=torch.float32, device=device)
    y_val_t   = torch.tensor(y_val,         dtype=torch.float32, device=device)

    model = MyModel().to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.L1Loss()

    best_val_loss = float('inf')
    best_state = None          # weights at the lowest validation loss so far
    patience = 10
    counter = 0
    max_epochs = 500

    for epoch in range(max_epochs):
        model.train()
        # manual mini-batch (fixed order; rows were already shuffled by sample())
        bs = 512
        for i in range(0, len(X_train_t), bs):
            xb = X_train_t[i:i+bs]
            yb = y_train_t[i:i+bs]
            optimizer.zero_grad()
            loss = criterion(model(xb), yb)
            loss.backward()
            optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_val_t), y_val_t).item()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            counter = 0
            # state_dict() returns references to the live tensors, so clone
            # them; otherwise later optimizer steps would overwrite the snapshot.
            best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        else:
            counter += 1
            if counter >= patience:
                print(f"Early stop at epoch {epoch} (sim {test_id})")
                break

    # Early stopping ends `patience` epochs after the best one; go back to the
    # best weights so the saved predictions match `best_val_loss`.
    if best_state is not None:
        model.load_state_dict(best_state)

    # -------------------------
    # Save validation predictions
    # -------------------------
    model.eval()
    with torch.no_grad():
        val_preds = model(X_val_t).cpu().numpy().flatten()
    pd.DataFrame({'pred': val_preds, 'true': y_val.flatten()}).to_csv(
        RESULTS_DIR / f'validation_{test_id}.csv', index=False
    )

    # -------------------------
    # Yield response function (NO EONR)
    # -------------------------
    test_df   = data_2nd_stage[data_2nd_stage['sim'] == test_id].reset_index(drop=True)
    base_feats = test_df[['Nk', 'plateau', 'b0']].reset_index(drop=True)

    # N sequence (N_tilde) for THIS test_id
    eval_seq = evall_N_seq[evall_N_seq['sim'] == test_id].reset_index(drop=True)
    if eval_seq.empty:
        # Leave a marker file so the missing sequence is visible in the results.
        (RESULTS_DIR / f'yield_response_{test_id}_EMPTY_EVAL_SEQ.csv').write_text(
            "No eval N_tilde sequence found for this sim\n"
        )
        continue

    Nt_seq = eval_seq['N_tilde'].to_numpy()
    L = len(Nt_seq)

    # Identifier columns copied to the output when the data has them.
    id_cols = [c for c in ['aunit_id', 'cell_id', 'field_id'] if c in test_df.columns]

    all_preds = []
    with torch.no_grad():
        for i in range(len(base_feats)):
            base = base_feats.iloc[[i]]                             # 1×3
            repeated = pd.concat([base] * L, ignore_index=True)     # L×3
            full_feat = pd.concat([repeated, eval_seq[['N_tilde']]], axis=1)  # L×4
            # Same column order the scaler was fitted with.
            full_feat = full_feat[['Nk', 'plateau', 'b0', 'N_tilde']]

            X_feat = torch.tensor(scaler.transform(full_feat), dtype=torch.float32, device=device)
            y_hat  = model(X_feat).cpu().numpy().reshape(-1)

            # Named row_dict so the outer loop variable `row` is not rebound.
            row_dict = {
                'sim':        [test_id] * L,
                'row_id':     [i] * L,
                'N_tilde':    Nt_seq,
                'pred_yield': y_hat
            }
            # Carry N alongside N_tilde when the sequence table has it.
            if 'N' in eval_seq.columns:
                row_dict['N'] = eval_seq['N'].to_numpy()
            for col in id_cols:
                row_dict[col] = [test_df.loc[i, col]] * L

            all_preds.append(pd.DataFrame(row_dict))

    df_out = pd.concat(all_preds, ignore_index=True)
    df_out.to_csv(RESULTS_DIR / f'yield_response_{test_id}.csv', index=False)




Processing test_id: 100%|██████████| 1/1 [02:23<00:00, 143.31s/it]


# SO_RF

In [None]:
# --- Imports ---
import os
from pathlib import Path
import numpy as np
import pandas as pd
import pyreadr
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold, GridSearchCV

# =========================
# Project root 
# =========================
def detect_project_root():
    """Locate the project root directory.

    Resolution order:
      1. The ``CF_CONT_ROOT`` environment variable, when set and non-empty
         (``~`` is expanded and the path is resolved).
      2. The first path, walking upward from the starting path, that is
         either named ``CF_Continuous`` or contains both
         ``data/data_20230504`` and ``codes``.
      3. The starting path itself when nothing matches.

    The starting path is this file's resolved path, or the current working
    directory when ``__file__`` is undefined (e.g., notebooks). Note that in
    the former case the fallback in step 3 is the file path itself.

    Returns:
        pathlib.Path: the detected root.
    """
    override = os.environ.get("CF_CONT_ROOT")
    if override:
        return Path(override).expanduser().resolve()

    try:
        start = Path(__file__).resolve()
    except NameError:  # e.g., notebooks
        start = Path.cwd().resolve()

    def _looks_like_root(candidate):
        # Either the well-known folder name or the expected folder layout.
        if candidate.name == "CF_Continuous":
            return True
        return (candidate / "data" / "data_20230504").exists() and (candidate / "codes").exists()

    # Nearest match wins; fall back to the starting path.
    return next((c for c in (start, *start.parents) if _looks_like_root(c)), start)

# Root directory from which every input and output path below is built.
PROJ_ROOT = detect_project_root()

# =========================
# Paths
# =========================
# Directory holding the train/test split CSVs.
SPLIT_DIR = PROJ_ROOT / "data" / "train_test_split"
# R .rds inputs (read with pyreadr further down).
DATA_2ND_STAGE_RDS = PROJ_ROOT / "data" / "data_20230504" / "data_2nd_stage.rds"
EVALL_N_SEQ_RDS    = PROJ_ROOT / "data" / "data_20230504" / "evall_N_seq.rds"

# =========================
# Output
# =========================
model_tag   = "SO_RF"           # part of the results directory name
n_fields    = 5                 # choose 1, 5, or 10; selects the split CSV
run_test_id = 1                 # run only this test_id
# Readable directory suffix per n_fields; other values fall back to "<n>_fields".
fields_label_map = {1: "one_field", 5: "five_fields", 10: "ten_fields"}
fields_label = fields_label_map.get(n_fields, f"{n_fields}_fields")

RESULTS_BASE = PROJ_ROOT / "results" / "yield_response_function_for_one_iteration"
RESULTS_DIR  = RESULTS_BASE / f"YRF_{model_tag}_{fields_label}"
# Side effect at import/run time: the results directory is created if missing.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

# === Load data from .rds ===
def _first_rds_object(rds_path):
    """Return the first object that pyreadr reads from the .rds file at *rds_path*."""
    return next(iter(pyreadr.read_r(str(rds_path)).values()))


data_2nd_stage = _first_rds_object(DATA_2ND_STAGE_RDS)
evall_N_seq = _first_rds_object(EVALL_N_SEQ_RDS)

# Coerce the listed columns (when present) to numeric; unparseable values become NaN.
_numeric_columns = (
    (data_2nd_stage, ('y_tilde', 'Nk', 'plateau', 'b0', 'N')),
    (evall_N_seq, ('sim', 'N')),
)
for _frame, _names in _numeric_columns:
    for _name in _names:
        if _name in _frame.columns:
            _frame[_name] = pd.to_numeric(_frame[_name], errors='coerce')

# Drop rows with NaNs, then renumber the rows.
_required = ['y_tilde', 'Nk', 'plateau', 'b0', 'N']
data_2nd_stage = data_2nd_stage.dropna(subset=_required).reset_index(drop=True)
evall_N_seq = evall_N_seq.dropna(subset=['N']).reset_index(drop=True)

# === Load split CSV and restrict to one test_id ===
split_csv_path = SPLIT_DIR / f"train_test_splits_{n_fields}fields.csv"
_all_splits = pd.read_csv(split_csv_path)
# Keep at most one row: the first whose test_id equals run_test_id.
splits_df = _all_splits.loc[_all_splits['test_id'] == run_test_id].head(1).copy()

# === Features for training/prediction ===
feature_cols = ['Nk', 'plateau', 'b0', 'N']

# === Loop (only one iteration) ===
# For the selected split: tune a random forest on y_tilde for the training
# sims, save validation predictions, then predict over the evaluation N
# sequence for every row of the test sim.
for _, row in tqdm(splits_df.iterrows(), total=len(splits_df), desc="Processing test_id"):
    test_id = int(row['test_id'])
    # Every column whose name starts with 'train_' holds one training sim id.
    train_ids = row[[c for c in row.index if c.startswith('train_')]].values

    # -------------------------
    # Train / validation data
    # -------------------------
    dataset = data_2nd_stage[data_2nd_stage['sim'].isin(train_ids)].reset_index(drop=True)
    dataset = dataset[['y_tilde'] + feature_cols].copy()

    # Seeded 80/20 split; the index was reset above, so dropping the sampled
    # labels leaves exactly the remaining rows for validation.
    train_df = dataset.sample(frac=0.8, random_state=0)
    val_df   = dataset.drop(train_df.index)

    X_train = train_df.drop('y_tilde', axis=1)
    y_train = train_df['y_tilde']
    X_val   = val_df.drop('y_tilde', axis=1)
    y_val   = val_df['y_tilde']

    # Standardise with statistics from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_val_scaled   = scaler.transform(X_val)

    # -------------------------
    # Random Forest with CV
    # -------------------------
    param_grid = {
        'max_depth':    [3, 5],
        'n_estimators': [50, 250, 500, 1000],
        'max_features': [1, 2, 3],
    }
    rf = RandomForestRegressor(random_state=777)
    cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=777)
    grid = GridSearchCV(rf, param_grid, cv=cv, n_jobs=-1, scoring='neg_mean_squared_error')
    grid.fit(X_train_scaled, y_train)
    model = grid.best_estimator_

    # -------------------------
    # Save validation predictions
    # -------------------------
    val_preds = model.predict(X_val_scaled)
    pd.DataFrame({'pred': val_preds, 'true': y_val.values}).to_csv(
        RESULTS_DIR / f'validation_{test_id}.csv', index=False
    )

    # -------------------------
    # Yield response function (NO EONR)
    # -------------------------
    test_df   = data_2nd_stage[data_2nd_stage['sim'] == test_id].reset_index(drop=True)
    base_feats = test_df[['Nk', 'plateau', 'b0']].reset_index(drop=True)

    # N sequence for THIS test_id
    eval_seq = evall_N_seq[evall_N_seq['sim'] == test_id].reset_index(drop=True)
    if eval_seq.empty:
        # Leave a marker file so the missing sequence is visible in the results.
        (RESULTS_DIR / f'yield_response_{test_id}_EMPTY_EVAL_SEQ.csv').write_text(
            "No eval N sequence found for this sim\n"
        )
        continue

    Nseq = eval_seq['N'].to_numpy()
    L = len(Nseq)
    n_rows = len(base_feats)
    if n_rows == 0:
        # Fail with a clear message instead of an opaque error further down.
        raise ValueError(f"No rows in data_2nd_stage for test sim {test_id}")

    # Identifier columns copied to the output when the data has them.
    id_cols = [c for c in ['aunit_id', 'cell_id', 'field_id'] if c in test_df.columns]

    # Build every (test row, N) combination at once instead of one small frame
    # per test row: row r occupies positions r*L .. r*L + L - 1, paired with
    # the full N sequence. One transform + one predict replaces n_rows calls.
    row_idx = np.repeat(np.arange(n_rows), L)
    N_grid  = np.tile(Nseq, n_rows)
    full_feat = base_feats.iloc[row_idx].reset_index(drop=True).assign(N=N_grid)
    # Same column order the scaler was fitted with.
    full_feat = full_feat[['Nk', 'plateau', 'b0', 'N']]

    X_feat = scaler.transform(full_feat)
    preds  = model.predict(X_feat)

    # Named out_cols so the outer loop variable `row` is not rebound.
    out_cols = {
        'sim':        np.full(n_rows * L, test_id),
        'row_id':     row_idx,
        'N':          N_grid,
        'pred_yield': preds
    }
    for col in id_cols:
        # test_df has a fresh 0..n-1 index, so positions equal labels.
        out_cols[col] = test_df[col].to_numpy()[row_idx]

    df_out = pd.DataFrame(out_cols)
    df_out.to_csv(RESULTS_DIR / f'yield_response_{test_id}.csv', index=False)

# --- Notes ---
# • Trains on ['Nk','plateau','b0','N'] to predict y_tilde.
# • Output rows = (# test rows for sim) × (length of N sequence for sim).


Processing test_id: 100%|██████████| 1/1 [01:11<00:00, 71.57s/it]
