In [1]:
import os
import pickle
import random
import time
import re
import itertools
from pathlib import Path

import polars as pl
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.multiprocessing as mp
from torchinfo import summary


from utils import preprocessing as ppsr
from utils.potsimloader import potsimloader as psl
from utils import split
from models import linearregression, mlp, cnn, tcn, lstm, transformer
from utils import scaler
from training import train
from testing import evaluate
pl.enable_string_cache()

In [2]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Maps each target column to the list of input columns used to predict it.
# Every entry is its own list object, so editing one never affects another.
TARGET_FEATURES = {
    "NTotL1": [
        "DayAfterPlant",
        "NApp",
        "Rain",
        "SolarRad",
        "AirTempC",
    ],
    "NTotL2": [
        "DayAfterPlant",
        "NApp",
        "Rain",
        "SolarRad",
        "AirTempC",
    ],
    "SWatL1": [
        "DayAfterPlant",
        "IrrgThresh",
        "NLeach",
        "Irrg",
        "Rain",
    ],
    "SWatL2": [
        "DayAfterPlant",
        "IrrgThresh",
        "NLeach",
        "Irrg",
        "Rain",
    ],
    "NLeach": [
        "NTotL1",
        "NTotL2",
        "Rain",
        "SolarRad",
        "AirTempC",
    ],
    # Note: this target uses six inputs, one more than the others.
    "NPlantUp": [
        "DayAfterPlant",
        "NTotL1",
        "NTotL2",
        "Rain",
        "SolarRad",
        "AirTempC",
    ],
}

In [None]:
def set_seed(seed):
    """Seed every random number generator used in this notebook.

    Args:
        seed: Integer passed to Python's ``random`` module, NumPy's global
            generator, PyTorch's CPU generator and PyTorch's CUDA generator.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)


# Fix the seed once, up front, so later cells are repeatable.
set_seed(42)

In [21]:
def generate_treatments(n_values, max_total=400, n_applications=3):
    """Build treatment labels from every ordered combination of ``n_values``.

    Each combination picks ``n_applications`` values from ``n_values`` (with
    repetition, order significant) and is kept only when the values sum to at
    most ``max_total``. A kept combination is rendered as its values joined
    by ``"-"``, e.g. ``(0, 56, 112)`` becomes ``"0-56-112"``.

    Args:
        n_values: Iterable of numeric values to combine.
        max_total: Largest allowed sum of one combination (inclusive).
        n_applications: Number of values per combination.

    Returns:
        List of label strings, in ``itertools.product`` order.
    """
    return [
        "-".join(map(str, rates))
        # Iterate the product lazily; only the accepted labels are stored.
        for rates in itertools.product(n_values, repeat=n_applications)
        if sum(rates) <= max_total
    ]


def join_potsim_yearly(data_dir, save_dir=Path("data")):
    """Merge the yearly ``potsim_<YYYY>.parquet`` files into one Parquet file.

    Args:
        data_dir: Directory holding the yearly files. Only entries named
            ``potsim_`` + four digits + ``.parquet`` are used, in sorted
            (i.e. chronological) order.
        save_dir: Directory that receives ``potsim.parquet``. May be a string
            or a ``Path``; it is created when it does not exist yet.

    Returns:
        Path of the written ``potsim.parquet`` file.

    Raises:
        FileNotFoundError: If ``data_dir`` contains no matching yearly file.
    """
    data_dir = Path(data_dir).resolve()
    pattern = re.compile(r"^potsim_\d{4}\.parquet$")
    names = sorted(name for name in os.listdir(data_dir) if pattern.match(name))
    if not names:
        # Fail early with a clear message instead of an opaque reader error.
        raise FileNotFoundError(
            f"No potsim_<YYYY>.parquet files found in {data_dir}"
        )
    files = [data_dir / name for name in names]

    # Accept plain strings too, and make sure the destination exists.
    save_dir = Path(save_dir)
    save_dir.mkdir(parents=True, exist_ok=True)
    filepath = save_dir / "potsim.parquet"

    # Scan the inputs lazily and sink the combined result straight to disk.
    pl.scan_parquet(files).sink_parquet(
        filepath,
        statistics=True,
        compression="zstd",
        compression_level=1,
        row_group_size=1_000_000,
    )
    return filepath

In [19]:
# Columns requested from the data loader.
usecols = [
    "Year", "Date", "Treatment", "NFirstApp", "PlantingDay", "IrrgDep",
    "IrrgThresh", "DayAfterPlant", "NApp", "NLeach", "NPlantUp", "NTotL1",
    "NTotL2", "Irrg", "SWatL1", "SWatL2", "Rain", "SolarRad", "AirTempC",
]

# Row filter, depending on the NFirstApp value of the row:
#   "Npl"  -> keep rows with DayAfterPlant >= -1
#   "Npre" -> keep rows with DayAfterPlant >= -37
_npl_rows = (pl.col("NFirstApp") == "Npl") & (pl.col("DayAfterPlant") >= -1)
_npre_rows = (pl.col("NFirstApp") == "Npre") & (pl.col("DayAfterPlant") >= -37)
mask = _npl_rows | _npre_rows

In [None]:
# Input locations: yearly simulation files, the weather table, and the
# merged simulation file.
potsim_dir = "data/potsim_yearly/"
weather_file = "data/weather.parquet"
data_file = "data/potsim.parquet"

# Merge the yearly files. With its default save_dir ("data") the merged file
# is written to data/potsim.parquet, i.e. data_file.
join_potsim_yearly(potsim_dir)

# NOTE(review): lazy=True / as_pandas=False presumably yields a polars
# LazyFrame — confirm against psl.read_data.
data = psl.read_data(
    dataset_path=data_file,
    weather_path=weather_file,
    usecols=usecols,
    as_pandas=False,
    lazy=True,
)

In [None]:
# Year ranges reserved for each split (end year exclusive in range()).
train_years = [*range(2001, 2017)]
val_years = [*range(2017, 2021)]
test_years = [*range(2021, 2025)]

# Treatment labels built from every accepted combination of these values.
n_values = [0, 56, 112, 168, 196]
treatments = generate_treatments(n_values)

# Allowed values per scenario column; passed to psl.apply_filter.
scenario_filter = dict(
    Year=[*range(2001, 2025)],
    Treatment=treatments,
    PlantingDay=[1, 15, 29, 43, 57],
    IrrgDep=[30, 40, 50, 60],
    IrrgThresh=[50, 60, 70, 80, 90],
    NFirstApp=["Npl", "Npre"],
)

In [None]:
# Keep only the selected scenarios, then apply the DayAfterPlant row mask and
# convert to pandas for the steps that follow.
# `mask` is a polars expression and `.filter(mask).to_pandas()` is a polars
# DataFrame chain, so the filtered data must stay in polars here; asking
# apply_filter for pandas output (as_pandas=True) would break the next line.
# NOTE(review): assumes as_pandas=False makes psl.apply_filter return a polars
# DataFrame — confirm against its implementation.
data = psl.apply_filter(data, filters=scenario_filter, lazy=False, as_pandas=False)
df = data.filter(mask).to_pandas()

In [None]:
# Split the filtered data into train / validation / test sets (60/20/20)
# with a fixed seed.
# NOTE(review): the year lists are hard-coded here rather than taken from the
# train_years / val_years / test_years variables defined earlier, and 2002 is
# listed for all three splits — confirm this is intentional (e.g. a reduced
# demo run) and that it does not leak information between splits.
train_split, val_split, test_split = split.random_sample_train_val_test(
    df,
    seed=42,
    split=(0.6, 0.2, 0.2),
    train_years=[2001, 2002, 2023],
    val_years=[2002],
    test_years=[2002],
)

Scenarios: train(13392), val(4464), test(4464)


In [17]:
# Target to model, and the input columns registered for it.
target_col = "NTotL1"
feature_cols = TARGET_FEATURES[target_col]

# Everything produced for this target goes under outputs/<target_col>/.
output_dir = Path("outputs", target_col)
output_dir.mkdir(parents=True, exist_ok=True)
scaler_save_path = output_dir.joinpath(f"{target_col}_scaler.pkl")

In [None]:
# seq_len picks the sample layout:
#   None -> non-sequential models (LinearRegression, MLP, ...)
#   15   -> sequential models (CNN, TCN, LSTM, ...)
seq_len = None
# seq_len = 15


def _prepare_split(frame, mode):
    """Run ppsr.process_data on one split using the notebook-wide settings.

    Args:
        frame: Data of a single split (train, validation or test).
        mode: Passed through as ``mode``; "fit" or "transform".

    Returns:
        Whatever ppsr.process_data returns; unpacked below as ``(X, y)``.
    """
    return ppsr.process_data(
        frame,
        feats=feature_cols,
        tgt=target_col,
        scaler_path=scaler_save_path,
        mode=mode,
        seq_len=seq_len,
    )


# The training split is processed first with mode="fit"; validation and test
# then use mode="transform" with the same scaler_path.
# NOTE(review): presumably "fit" stores the scaler at scaler_path and
# "transform" reloads it — confirm in utils.preprocessing.
X_train, y_train = _prepare_split(train_split, "fit")
X_val, y_val = _prepare_split(val_split, "transform")
X_test, y_test = _prepare_split(test_split, "transform")

In [None]:
# Report the shapes of all three splits with a single print call.
print(
    "\n".join(
        [
            f"Train shapes: \nX_train: {X_train.shape}, y_train: {y_train.shape}",
            f"Val shapes: \nX_val: {X_val.shape}, y_val: {y_val.shape}",
            f"Test shapes: \nX_test: {X_test.shape}, y_test: {y_test.shape}",
        ]
    )
)

In [None]:
# Size the input layer from the selected feature list rather than a literal
# 5: TARGET_FEATURES["NPlantUp"] has six inputs, so a fixed 5 would no longer
# match the number of feature columns for that target.
n_features = len(feature_cols)

# Active model. To try another one, comment this trio out and uncomment one
# of the alternatives below (sequential models also need seq_len set above).
model_name = "LinearRegression"
model = linearregression.LinearRegression(input_dim=n_features)
model_state = output_dir / f"{target_col}_{model_name}.pth"

# model_name = "MLP"
# model = mlp.MLP(input_dim=n_features, hidden_size=128, num_layers=2, dropout=0.2)
# model_state = output_dir / f"{target_col}_{model_name}.pth"

# model_name = "CNN1D"
# model = cnn.CNN1D(input_dim=n_features, hidden_size=64, kernel_size=3, padding=1, dropout=0.2)
# model_state = output_dir / f"{target_col}_{model_name}.pth"

# model_name = "TCN"
# model = tcn.TCN(input_dim=n_features, num_channels=[64,32,16,8], kernel_size=3, dropout=0.2)
# model_state = output_dir / f"{target_col}_{model_name}.pth"

# model_name = "LSTM"
# model = lstm.LSTM(input_dim=n_features, hidden_size=64, num_layers=2, dropout=0.2)
# model_state = output_dir / f"{target_col}_{model_name}.pth"

# model_name = "EncoderOnlyTransformer"
# model = transformer.EncoderOnlyTransformer(
#     input_dim=n_features, nhead=2, num_layers=2, d_model=64, dropout=0.2, max_seq_len=15
# )
# model_state = output_dir / f"{target_col}_{model_name}.pth"

In [None]:
# Move the model onto the selected device, then show its torchinfo summary
# as the cell output.
model = model.to(device)
summary(model)

In [None]:
# Target minimum and maximum taken from the saved scaler, placed on the
# training device; the training and validation epochs receive them for their
# metric computation.
min_tgt, max_tgt = (
    torch.tensor(bound, device=device)
    for bound in scaler.get_min_max(tgt=target_col, scaler_path=scaler_save_path)
)
print(min_tgt, max_tgt)

In [None]:
# --- Optimisation hyper-parameters ---
lr = 0.0001
b_sz = 256
max_epochs = 10

# --- Loss, optimiser and learning-rate schedule ---
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), momentum=0.9, lr=lr)
# Halves the learning rate (down to min_lr) after 5 epochs without a decrease
# in the value passed to scheduler.step().
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
    min_lr=1e-8,
)

# --- Early-stopping bookkeeping ---
epochs_no_improve = 0
min_loss_reduction = 1e-4  # smallest val-loss drop that counts as progress
early_stop_patience: int = 10

# Gradient scaler handed to the training epoch for mixed-precision training.
ampscaler = torch.amp.GradScaler(device=device.type)

# --- Output locations (checkpoint and per-epoch log) ---
output_dir.mkdir(parents=True, exist_ok=True)
model_save_path = output_dir.joinpath(f"{target_col}_{model_name}.pth")
logs_save_path = output_dir.joinpath(f"logs_{target_col}_{model_name}.csv")

In [None]:
# Settings shared by the training and validation loaders.
_loader_opts = dict(batch_size=b_sz, pin_memory=True, prefetch_factor=4)

# Training batches are reshuffled every epoch.
train_loader = DataLoader(
    TensorDataset(X_train, y_train),
    shuffle=True,
    num_workers=10,
    **_loader_opts,
)

# Validation keeps a fixed order.
val_loader = DataLoader(
    TensorDataset(X_val, y_val),
    shuffle=False,
    num_workers=4,
    **_loader_opts,
)

In [None]:
# Train for up to max_epochs, saving a checkpoint whenever the validation
# loss improves and stopping early once it has stopped improving.
print(f"Training {model_name} on device: {device}")

# All per-run state is (re)initialised here so that re-running this cell does
# not inherit counters from a previous run.
best_val_loss = float("inf")
epochs_no_improve = 0
checkpoint_saved = False  # True once THIS run has written model_save_path
train_val_logs = []
early_stop = False

for epoch in range(max_epochs):
    start_time = time.time()
    train_loss, train_r2 = train.train_epoch(
        model,
        train_loader,
        optimizer,
        criterion,
        device,
        min_tgt,
        max_tgt,
        ampscaler=ampscaler,
    )
    val_loss, val_r2 = train.val_epoch(model, val_loader, criterion, device, min_tgt, max_tgt)
    scheduler.step(val_loss)

    # Checkpoint on a meaningful improvement; otherwise count a stale epoch.
    if val_loss < best_val_loss - min_loss_reduction:
        best_val_loss = val_loss
        torch.save(model.state_dict(), model_save_path)
        checkpoint_saved = True
        epochs_no_improve = 0
    else:
        epochs_no_improve += 1
    early_stop = epochs_no_improve > early_stop_patience

    elapsed = time.time() - start_time
    curr_lr = optimizer.param_groups[0]["lr"]
    train_val_logs.append(
        {
            "epoch": epoch+1,
            "train_loss": train_loss,
            "val_loss": val_loss,
            "train_r2": train_r2,
            "val_r2": val_r2,
            "learning_rate": curr_lr,
        }
    )
    print(
        f"Epoch: {epoch + 1}/{max_epochs} | "
        f"Train Loss: {train_loss:.5f}, Train R²: {train_r2:.4f} | "
        f"Val Loss: {val_loss:.5f}, Val R²: {val_r2:.4f} | "
        f"LR: {curr_lr}, Time: {round(elapsed, 2)} secs"
    )

    # Stop right after the epoch that exhausted the patience, so the message
    # is also shown when that happens on the final epoch.
    if early_stop:
        print(f"Early stopping at epoch {epoch + 1}")
        break

# Fallback: if no epoch ever improved (so nothing was saved in this run),
# store the final weights. A flag is used instead of checking the file on
# disk so a checkpoint left by an earlier run is not mistaken for this one's.
if not checkpoint_saved:
    torch.save(model.state_dict(), model_save_path)

# NOTE: `model` holds the weights of the last epoch run at this point; the
# best-validation weights are the ones stored at model_save_path.
train_val_df = pd.DataFrame(train_val_logs)
train_val_df.to_csv(logs_save_path, index=False)
print("Training Complete....!")


Training MLP on device: cuda
Epoch: 0/9 | Train Loss: 0.01124, Train R²: 0.7503 | Val Loss: 0.00736, Val R²: 0.8410 | LR: 0.005, Time: 39.36 secs
Epoch: 1/9 | Train Loss: 0.00877, Train R²: 0.8051 | Val Loss: 0.00716, Val R²: 0.8452 | LR: 0.005, Time: 38.75 secs
Epoch: 2/9 | Train Loss: 0.00838, Train R²: 0.8139 | Val Loss: 0.00687, Val R²: 0.8516 | LR: 0.005, Time: 38.93 secs
Epoch: 3/9 | Train Loss: 0.00814, Train R²: 0.8193 | Val Loss: 0.00685, Val R²: 0.8520 | LR: 0.005, Time: 41.64 secs
Epoch: 4/9 | Train Loss: 0.00794, Train R²: 0.8237 | Val Loss: 0.00669, Val R²: 0.8555 | LR: 0.005, Time: 43.22 secs


In [None]:
# Plot training vs. validation loss per epoch from the saved training log and
# store the figure next to the other outputs for this target.
df_log = pd.read_csv(logs_save_path)

fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(x="epoch", y="train_loss", data=df_log, label="Train Loss", ax=ax)
sns.lineplot(x="epoch", y="val_loss", data=df_log, linestyle="--", label="Val Loss", ax=ax)
# seaborn labels the y-axis after the last column plotted ("val_loss"), which
# is misleading for a shared axis, so set neutral labels and a title.
ax.set(
    xlabel="Epoch",
    ylabel="Loss",
    title=f"{target_col} - {model_name}: training vs. validation loss",
)
ax.grid(True, linestyle="--", alpha=0.7)
fig.savefig(output_dir / f"{target_col}_losscurve.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Fixed-order loader over the test split, using the training batch size.
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(
    test_dataset,
    batch_size=b_sz,
    shuffle=False,
    pin_memory=True,
    num_workers=4,
    prefetch_factor=4,
)

In [None]:
# Evaluate the checkpoint that was selected on validation loss. Without this
# reload, `model` still carries the weights of the last training epoch, which
# are not necessarily the best ones saved at model_save_path.
model.load_state_dict(torch.load(model_save_path, map_location=device))

mae, rmse, nrmse, r2 = evaluate.evaluate_model(
    model,
    data_loader=test_loader,
    tgt=target_col,
    device=device,
    scaler_path=scaler_save_path,
)

# Test-set metrics as returned by evaluate_model.
print(f"\n\nMean Absolute Error: {mae:.4f}")
print(f"Root Mean Square Error: {rmse:.4f}")
print(f"Normalized Root Mean Square Error: {nrmse:.4f}")
print(f"Coefficient of determination (R-Squared): {r2:.4f}\n")