In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install optuna lightning

Collecting optuna
  Downloading optuna-4.5.0-py3-none-any.whl.metadata (17 kB)
Collecting lightning
  Downloading lightning-2.5.5-py3-none-any.whl.metadata (39 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting lightning-utilities<2.0,>=0.10.0 (from lightning)
  Downloading lightning_utilities-0.15.2-py3-none-any.whl.metadata (5.7 kB)
Collecting torchmetrics<3.0,>0.7.0 (from lightning)
  Downloading torchmetrics-1.8.2-py3-none-any.whl.metadata (22 kB)
Collecting pytorch-lightning (from lightning)
  Downloading pytorch_lightning-2.5.5-py3-none-any.whl.metadata (20 kB)
Downloading optuna-4.5.0-py3-none-any.whl (400 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.9/400.9 kB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning-2.5.5-py3-none-any.whl (828 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m828.5/828.5 kB[0m [31m62.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownlo

In [3]:
# ==============
# 1. IMPORTS
# ==============
import optuna
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.preprocessing import StandardScaler
from torch.optim import Adam
from torch.nn import MSELoss
from lightning.pytorch import LightningModule
import pandas as pd
from sklearn.model_selection import KFold
import torch.optim as optim
import os
import functools
import torch.nn as nn
import time

# ========================
# 2. MODEL DEFINITION
# ========================
class LayerNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-12):
        super(LayerNorm, self).__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.bias = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)
        s = (x - u).pow(2).mean(-1, keepdim=True)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)
        return self.weight * x + self.bias

class SelfAttention(LightningModule):
    def __init__(self, num_attention_heads, input_size, hidden_size, output_dim=1, kernel_size=3,
                 hidden_dropout_prob=0.5, attention_probs_dropout_prob=0.5, learning_rate=0.001):
        super(SelfAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = hidden_size
        self.query = torch.nn.Linear(input_size, self.all_head_size)
        self.key = torch.nn.Linear(input_size, self.all_head_size)
        self.value = torch.nn.Linear(input_size, self.all_head_size)
        self.attn_dropout = torch.nn.Dropout(attention_probs_dropout_prob)
        self.out_dropout = torch.nn.Dropout(hidden_dropout_prob)
        self.dense = torch.nn.Linear(hidden_size, input_size)
        self.LayerNorm = torch.nn.LayerNorm(input_size, eps=1e-12)
        self.relu = torch.nn.ReLU()
        self.out = torch.nn.Linear(input_size, output_dim)
        self.cnn = torch.nn.Conv1d(1, 1, kernel_size, stride=1, padding=1)
        self.learning_rate = learning_rate
        self.loss_fn = MSELoss()

    def forward(self, input_tensor):
        cnn_hidden = self.cnn(input_tensor.view(input_tensor.size(0), 1, -1))
        # --- FIX #1: This was causing a shape error. It now correctly keeps the 3D shape for the attention mechanism. ---
        input_tensor_after_cnn = cnn_hidden

        mixed_query_layer = self.query(input_tensor_after_cnn)
        mixed_key_layer = self.key(input_tensor_after_cnn)
        mixed_value_layer = self.value(input_tensor_after_cnn)
        query_layer = mixed_query_layer
        key_layer = mixed_key_layer
        value_layer = mixed_value_layer
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / np.sqrt(self.attention_head_size)
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.attn_dropout(attention_probs)
        context_layer = torch.matmul(attention_probs, value_layer)
        hidden_states = self.dense(context_layer)
        hidden_states = self.out_dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor_after_cnn)
        output = self.out(self.relu(hidden_states.view(hidden_states.size(0), -1)))
        return output

    # (The rest of the class methods are fine)
    def training_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        loss = self.loss_fn(y_pred, y)
        return loss
    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_pred = self(x)
        val_loss = self.loss_fn(y_pred, y)
        return val_loss
    def configure_optimizers(self):
        return Adam(self.parameters(), lr=self.learning_rate)

# ========================
# 3. HELPER FUNCTIONS
# ========================
def objective(trial, x_train, y_train, inner_cv, DEVICE, hidden_dim, output_dim, kernel_size, learning_rate):
    # This function is for hyperparameter tuning and seems okay.
    num_attention_heads = trial.suggest_categorical('num_attention_heads', [4, 8])
    attention_probs_dropout_prob = trial.suggest_categorical('attention_probs_dropout_prob', [0.2, 0.5])
    fold_losses = []
    for train_idx, valid_idx in inner_cv.split(x_train):
        x_inner_train, x_inner_valid = x_train[train_idx], x_train[valid_idx]
        y_inner_train, y_inner_valid = y_train[train_idx], y_train[valid_idx]
        scaler = StandardScaler()
        x_inner_train = scaler.fit_transform(x_inner_train)
        x_inner_valid = scaler.transform(x_inner_valid)
        x_inner_train_tensor = torch.from_numpy(x_inner_train).float().to(DEVICE)
        y_inner_train_tensor = torch.from_numpy(y_inner_train).float().to(DEVICE)
        x_inner_valid_tensor = torch.from_numpy(x_inner_valid).float().to(DEVICE)
        y_inner_valid_tensor = torch.from_numpy(y_inner_valid).float().to(DEVICE)
        train_data = TensorDataset(x_inner_train_tensor, y_inner_train_tensor)
        valid_data = TensorDataset(x_inner_valid_tensor, y_inner_valid_tensor)
        train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
        valid_loader = DataLoader(valid_data, batch_size=32, shuffle=False)
        model = SelfAttention(num_attention_heads, x_inner_train.shape[1], hidden_dim, output_dim,
                              hidden_dropout_prob=0.5, kernel_size=kernel_size,
                              attention_probs_dropout_prob=attention_probs_dropout_prob).to(DEVICE)
        loss_function = torch.nn.MSELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        for epoch in range(20):
            model.train()
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(x_batch)
                loss = loss_function(y_pred, y_batch.reshape(-1, 1))
                loss.backward()
                optimizer.step()
        model.eval()
        valid_losses = []
        with torch.no_grad():
            for x_batch, y_batch in valid_loader:
                y_pred = model(x_batch)
                loss = loss_function(y_pred, y_batch.reshape(-1, 1))
                valid_losses.append(loss.item())
        fold_losses.append(np.mean(valid_losses))
    return np.mean(fold_losses)

class EarlyStopping:
    def __init__(self, patience=10, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_score = -np.Inf
        self.counter = 0
        self.early_stop = False
    def __call__(self, score):
        if self.best_score == -np.Inf:
            self.best_score = score
        elif score < self.best_score + self.delta:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.counter = 0

def run_nested_cv_with_early_stopping(data, label, outer_cv, inner_cv, learning_rate, batch_size, hidden_dim,
                                      output_dim, kernel_size, patience, DEVICE):
    best_corr_coefs = []
    time_star = time.time()
    for fold, (train_idx, test_idx) in enumerate(outer_cv.split(data)):
        model_save_path = f'/content/drive/MyDrive/crop_former_model/best_model_fold_{fold + 1}.pth'
        if os.path.exists(model_save_path):
            print(f"✅ Fold {fold + 1} already completed. Model file found. Skipping.")
            continue
        print(f"\n--- Starting Fold {fold + 1}/5 ---")
        x_train, x_test = data[train_idx], data[test_idx]
        y_train, y_test = label[train_idx], label[test_idx]
        print("Running Optuna for hyperparameter tuning...")
        objective_with_data = functools.partial(objective, x_train=x_train, y_train=y_train, inner_cv=inner_cv,
                                                DEVICE=DEVICE, hidden_dim=hidden_dim, output_dim=output_dim,
                                                kernel_size=kernel_size, learning_rate=learning_rate)
        study = optuna.create_study(direction='minimize')
        study.optimize(objective_with_data, n_trials=20)
        best_trial = study.best_trial
        print(f"Optuna found best params for Fold {fold + 1}: {best_trial.params}")
        num_attention_heads = best_trial.params['num_attention_heads']
        attention_probs_dropout_prob = best_trial.params['attention_probs_dropout_prob']
        model = SelfAttention(num_attention_heads, x_train.shape[1], hidden_dim, output_dim,
                              hidden_dropout_prob=0.5, kernel_size=kernel_size,
                              attention_probs_dropout_prob=attention_probs_dropout_prob).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        loss_function = torch.nn.MSELoss()
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=10)
        scaler = StandardScaler()
        x_train = scaler.fit_transform(x_train)
        x_test = scaler.transform(x_test)

        # --- FIX #2: This was the source of the NameError. It now correctly uses x_train, y_train, etc. ---
        x_train_tensor = torch.from_numpy(x_train).float().to(DEVICE)
        y_train_tensor = torch.from_numpy(y_train).float().to(DEVICE)
        x_test_tensor = torch.from_numpy(x_test).float().to(DEVICE)
        y_test_tensor = torch.from_numpy(y_test).float().to(DEVICE)

        train_data = TensorDataset(x_train_tensor, y_train_tensor)
        test_data = TensorDataset(x_test_tensor, y_test_tensor)
        train_loader = DataLoader(train_data, batch_size, shuffle=True)
        test_loader = DataLoader(test_data, batch_size, shuffle=False)
        early_stopping = EarlyStopping(patience=patience)
        best_corr_coef = -float('inf')
        print(f"Starting final training for Fold {fold + 1}...")
        for epoch in range(100):
            model.train()
            for x_batch, y_batch in train_loader:
                optimizer.zero_grad()
                y_pred = model(x_batch)
                loss = loss_function(y_pred, y_batch.reshape(-1, 1))
                loss.backward()
                optimizer.step()
            model.eval()
            y_test_preds, y_test_trues = [], []
            with torch.no_grad():
                for x_batch, y_batch in test_loader:
                    y_test_pred = model(x_batch)
                    y_test_preds.extend(y_test_pred.cpu().numpy().reshape(-1).tolist())
                    y_test_trues.extend(y_batch.cpu().numpy().reshape(-1).tolist())
            corr_coef = np.corrcoef(y_test_preds, y_test_trues)[0, 1]
            scheduler.step(corr_coef)
            if corr_coef > best_corr_coef:
                best_corr_coef = corr_coef
                torch.save(model.state_dict(), model_save_path)
            early_stopping(corr_coef)
            if early_stopping.early_stop:
                print(f"Early stopping at epoch {epoch + 1}")
                break
        best_corr_coefs.append(best_corr_coef)
        print(f'Fold {fold + 1}: Best Correlation Coefficient: {best_corr_coef:.4f}')
    average_corr_coef = np.mean(best_corr_coefs)
    print(f"\n--- Training Complete ---")
    print(f"Average Best Correlation Coefficient across all folds: {average_corr_coef:.4f}")
    time_end = time.time()
    execution_time = int(time_end - time_star)
    print(f"Total execution time: {execution_time // 60} minutes, {execution_time % 60} seconds.")
    result_data = {'time': [execution_time], 'mean_corr_coef': [average_corr_coef]}
    pd.DataFrame(result_data).to_csv("/content/drive/MyDrive/crop_former_model/final_training_results.csv")

def data_preprocessing(data_path, label_path, target_columns=10000):
    label = pd.read_csv(label_path, index_col=0).values
    data = pd.read_csv(data_path, index_col=0)
    if data.shape[1] < target_columns:
        missing_columns = target_columns - data.shape[1]
        zeros_df = pd.DataFrame(np.zeros((data.shape[0], missing_columns)))
        data = pd.concat([data, zeros_df], axis=1)
    data = data.values
    return data, label

# ============================
# 4. MAIN EXECUTION BLOCK
# ============================
if __name__ == '__main__':
    # --- Hyperparameters ---
    outer_cv = KFold(n_splits=5, shuffle=True, random_state=42)
    inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)
    batch_size = 128
    learning_rate = 0.001
    patience = 5
    hidden_dim = 64
    DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(f"Training on device: {DEVICE}")

    # --- File Paths ---
    data_path = "/content/drive/MyDrive/crop_former_model/X_train.csv"
    label_path = "/content/drive/MyDrive/crop_former_model/y_train.csv"

    # --- Run ---
    print("Starting data preprocessing...")
    data, label = data_preprocessing(data_path, label_path)
    print("Data preprocessing complete. Starting training...")
    run_nested_cv_with_early_stopping(data=data, label=label, outer_cv=outer_cv,
                                      inner_cv=inner_cv, learning_rate=learning_rate,
                                      batch_size=batch_size, hidden_dim=hidden_dim, output_dim=1,
                                      kernel_size=3, patience=patience, DEVICE=DEVICE)
    print("--- Successfully Finished ---")

Training on device: cuda
Starting data preprocessing...
Data preprocessing complete. Starting training...

--- Starting Fold 1/5 ---


[I 2025-10-04 07:38:26,208] A new study created in memory with name: no-name-9bbdd24d-4d63-4136-805d-45ce73b5e9c7


Running Optuna for hyperparameter tuning...


[I 2025-10-04 07:38:44,091] Trial 0 finished with value: 353.6909993489583 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}. Best is trial 0 with value: 353.6909993489583.
[I 2025-10-04 07:39:01,157] Trial 1 finished with value: 604.6578494340946 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 0 with value: 353.6909993489583.
[I 2025-10-04 07:39:19,048] Trial 2 finished with value: 517.586169642261 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 353.6909993489583.
[I 2025-10-04 07:39:36,890] Trial 3 finished with value: 398.65988863431494 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 0 with value: 353.6909993489583.
[I 2025-10-04 07:39:54,053] Trial 4 finished with value: 450.1202152610844 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 0 with value: 353.

Optuna found best params for Fold 1: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}
Starting final training for Fold 1...
Early stopping at epoch 20
Fold 1: Best Correlation Coefficient: 0.8090

--- Starting Fold 2/5 ---


[I 2025-10-04 07:44:26,039] A new study created in memory with name: no-name-e2b84bde-6e06-4b20-80a5-b8cd41d1a4c7


Running Optuna for hyperparameter tuning...


[I 2025-10-04 07:44:44,116] Trial 0 finished with value: 489.9615627191006 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 489.9615627191006.
[I 2025-10-04 07:45:01,650] Trial 1 finished with value: 479.45655601045 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 1 with value: 479.45655601045.
[I 2025-10-04 07:45:21,275] Trial 2 finished with value: 413.1463390904614 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}. Best is trial 2 with value: 413.1463390904614.
[I 2025-10-04 07:45:38,994] Trial 3 finished with value: 487.7657953246027 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 413.1463390904614.
[I 2025-10-04 07:45:56,629] Trial 4 finished with value: 390.6503024631076 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}. Best is trial 4 with value: 390.6503

Optuna found best params for Fold 2: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}
Starting final training for Fold 2...
Early stopping at epoch 75
Fold 2: Best Correlation Coefficient: 0.9417

--- Starting Fold 3/5 ---


[I 2025-10-04 07:50:46,500] A new study created in memory with name: no-name-e1a12e1c-02b5-4702-b363-d25449a4a4be


Running Optuna for hyperparameter tuning...


[I 2025-10-04 07:51:03,959] Trial 0 finished with value: 597.2101962097689 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 597.2101962097689.
[I 2025-10-04 07:51:22,896] Trial 1 finished with value: 504.1681951539129 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 1 with value: 504.1681951539129.
[I 2025-10-04 07:51:41,093] Trial 2 finished with value: 486.88317818926953 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 486.88317818926953.
[I 2025-10-04 07:52:01,675] Trial 3 finished with value: 497.7334722535223 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 486.88317818926953.
[I 2025-10-04 07:52:20,822] Trial 4 finished with value: 504.28173932458594 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 

Optuna found best params for Fold 3: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}
Starting final training for Fold 3...
Early stopping at epoch 49
Fold 3: Best Correlation Coefficient: 0.9363

--- Starting Fold 4/5 ---


[I 2025-10-04 07:57:28,489] A new study created in memory with name: no-name-3e5eb089-75c1-492c-aaf8-1afa13457088


Running Optuna for hyperparameter tuning...


[I 2025-10-04 07:57:46,861] Trial 0 finished with value: 477.2264190412994 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 477.2264190412994.
[I 2025-10-04 07:58:04,437] Trial 1 finished with value: 586.4726421649639 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 477.2264190412994.
[I 2025-10-04 07:58:22,779] Trial 2 finished with value: 477.21240495209 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 477.21240495209.
[I 2025-10-04 07:58:40,104] Trial 3 finished with value: 498.09606868385254 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 2 with value: 477.21240495209.
[I 2025-10-04 07:58:57,394] Trial 4 finished with value: 502.6407564603366 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 2 with value: 477.21240

Optuna found best params for Fold 4: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}
Starting final training for Fold 4...
Early stopping at epoch 47
Fold 4: Best Correlation Coefficient: 0.9427

--- Starting Fold 5/5 ---


[I 2025-10-04 08:03:39,560] A new study created in memory with name: no-name-fbc54b1c-ec33-4f8a-9688-69c382ca5d18


Running Optuna for hyperparameter tuning...


[I 2025-10-04 08:03:57,250] Trial 0 finished with value: 415.24580292008886 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 415.24580292008886.
[I 2025-10-04 08:04:14,436] Trial 1 finished with value: 519.2561893300114 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.2}. Best is trial 0 with value: 415.24580292008886.
[I 2025-10-04 08:04:32,216] Trial 2 finished with value: 577.575193225828 and parameters: {'num_attention_heads': 8, 'attention_probs_dropout_prob': 0.5}. Best is trial 0 with value: 415.24580292008886.
[I 2025-10-04 08:04:49,559] Trial 3 finished with value: 379.4198857494909 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 3 with value: 379.4198857494909.
[I 2025-10-04 08:05:06,679] Trial 4 finished with value: 332.700180314545 and parameters: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}. Best is trial 4 with value: 33

Optuna found best params for Fold 5: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}
Starting final training for Fold 5...
Early stopping at epoch 29
Fold 5: Best Correlation Coefficient: 0.9277

--- Training Complete ---
Average Best Correlation Coefficient across all folds: 0.9115
Total execution time: 31 minutes, 7 seconds.
--- Successfully Finished ---


In [6]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from lightning.pytorch import LightningModule

# (The model definition is copied here for the script to be self-contained)
class SelfAttention(LightningModule):
    def __init__(self, num_attention_heads, input_size, hidden_size, output_dim=1, kernel_size=3,
                 hidden_dropout_prob=0.5, attention_probs_dropout_prob=0.5, learning_rate=0.001):
        super(SelfAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = hidden_size
        self.query = torch.nn.Linear(input_size, self.all_head_size)
        self.key = torch.nn.Linear(input_size, self.all_head_size)
        self.value = torch.nn.Linear(input_size, self.all_head_size)
        self.attn_dropout = torch.nn.Dropout(attention_probs_dropout_prob)
        self.out_dropout = torch.nn.Dropout(hidden_dropout_prob)
        self.dense = torch.nn.Linear(hidden_size, input_size)
        self.LayerNorm = torch.nn.LayerNorm(input_size, eps=1e-12)
        self.relu = torch.nn.ReLU()
        self.out = torch.nn.Linear(input_size, output_dim)
        self.cnn = torch.nn.Conv1d(1, 1, kernel_size, stride=1, padding=1)

    def forward(self, input_tensor):
        cnn_hidden = self.cnn(input_tensor.view(input_tensor.size(0), 1, -1))
        input_tensor_after_cnn = cnn_hidden
        mixed_query_layer = self.query(input_tensor_after_cnn)
        mixed_key_layer = self.key(input_tensor_after_cnn)
        mixed_value_layer = self.value(input_tensor_after_cnn)
        query_layer, key_layer, value_layer = mixed_query_layer, mixed_key_layer, mixed_value_layer
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / np.sqrt(self.attention_head_size)
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.attn_dropout(attention_probs)
        context_layer = torch.matmul(attention_probs, value_layer)
        hidden_states = self.dense(context_layer)
        hidden_states = self.out_dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor_after_cnn)
        output = self.out(self.relu(hidden_states.view(hidden_states.size(0), -1)))
        return output

# --- Main Prediction Logic ---
if __name__ == '__main__':
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {DEVICE}')

    # --- UPDATE THESE FILE PATHS ---
    # Use the model from the fold with the highest score (e.g., Fold 5)
    model_path = '/content/drive/MyDrive/crop_former_model/best_model_fold_5.pth'

    # We need the training data to fit the scaler correctly
    train_data_path = '/content/drive/MyDrive/crop_former_model/X_train.csv'

    # This is the "new" data we want to predict on
    test_data_path = '/content/drive/MyDrive/crop_former_model/X_test.csv'

    # This is where the final predictions will be saved
    output_path = '/content/drive/MyDrive/crop_former_model/predicted_result.csv'
    # --------------------------------

    # --- Hyperparameters (must match the trained model) ---
    input_size = 10000
    # Use the best params found for your best fold.
    # From your log, Fold 5 used: {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}
    num_attention_heads = 4
    attention_probs_dropout_prob = 0.5

    # Initialize the model structure
    model = SelfAttention(num_attention_heads=num_attention_heads, input_size=input_size,
                          hidden_size=64, output_dim=1, kernel_size=3,
                          attention_probs_dropout_prob=attention_probs_dropout_prob).to(DEVICE)

    # Load the saved weights
    print(f"Loading trained model from: {model_path}")
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))

    # --- CRITICAL FIX: SCALING THE DATA ---
    print("Loading training data to fit the scaler...")
    X_train = pd.read_csv(train_data_path, index_col=0)

    scaler = StandardScaler()
    scaler.fit(X_train.values) # Fit the scaler ONLY on the training data

    print(f"Loading and scaling test data from: {test_data_path}")
    X_test = pd.read_csv(test_data_path, index_col=0)
    X_test_scaled = scaler.transform(X_test.values) # Transform the test data
    # -----------------------------------------

    X_test_tensor = torch.from_numpy(X_test_scaled).to(torch.float32).to(DEVICE)

    # Make predictions
    print("Making predictions on the test data...")
    model.eval()
    with torch.no_grad():
        output = model(X_test_tensor)

    # Save predictions
    pd.DataFrame(output.cpu().numpy(), columns=['predicted_value'], index=X_test.index).to_csv(output_path)

    print(f"✅ Success! Predictions saved to '{output_path}'.")

Using device: cuda
Loading trained model from: /content/drive/MyDrive/crop_former_model/best_model_fold_5.pth
Loading training data to fit the scaler...
Loading and scaling test data from: /content/drive/MyDrive/crop_former_model/X_test.csv
Making predictions on the test data...
✅ Success! Predictions saved to '/content/drive/MyDrive/crop_former_model/predicted_result.csv'.


In [7]:
import pandas as pd
import numpy as np

# --- IMPORTANT: UPDATE THESE FILE PATHS ---
# Path to the file with your model's predictions
predicted_file = '/content/drive/MyDrive/crop_former_model/predicted_result.csv'

# Path to the file with the true, original test labels
true_labels_file = '/content/drive/MyDrive/crop_former_model/y_test.csv'
# -----------------------------------------

# Load the datasets
predicted_df = pd.read_csv(predicted_file)
true_df = pd.read_csv(true_labels_file)

# Extract the numerical values
predicted_values = predicted_df['predicted_value'].values
true_values = true_df.iloc[:, 1].values # Select the second column which contains the phenotype values

# Calculate the Pearson Correlation Coefficient (PCC)
# np.corrcoef returns a 2x2 matrix, the value at [0, 1] is the correlation
accuracy = np.corrcoef(predicted_values, true_values)[0, 1]

print(f"✅ Model Accuracy (Pearson Correlation Coefficient): {accuracy:.4f}")

✅ Model Accuracy (Pearson Correlation Coefficient): 0.9285


In [9]:
import pandas as pd
import numpy as np

# --- Configuration ---
# File with the correct 10,000 SNP columns
feature_file = '/content/drive/MyDrive/crop_former_model/chr10_top10k_snps.csv'

# Name of our new simulated data file
output_simulated_file = '/content/drive/MyDrive/crop_former_model/new_simulated_maize.csv'

# How many new samples to simulate
num_new_samples = 5
# ---------------------

print("Creating a simulated data file for prediction...")

# Read just the header of the feature file to get the column names
try:
    snp_columns = pd.read_csv(feature_file, nrows=0).columns[1:] # Skip the first 'IID' column

    # Create some new, fake sample IDs
    new_sample_ids = [f'Maize_Sample_{i+1}' for i in range(num_new_samples)]

    # Create a DataFrame with the correct shape
    simulated_df = pd.DataFrame(index=new_sample_ids, columns=snp_columns)
    simulated_df.index.name = 'IID'

    # Fill it with random genotype data (the 0-9 encoding)
    # This simulates having new genotype data for the same SNPs
    simulated_data = np.random.randint(0, 10, size=simulated_df.shape)
    simulated_df[:] = simulated_data

    # Save the simulated data to a new CSV file
    simulated_df.to_csv(output_simulated_file)

    print(f"✅ Success! Created '{output_simulated_file}' with {num_new_samples} simulated samples.")

except FileNotFoundError:
    print(f"❌ Error: Make sure '{feature_file}' exists in your Google Drive.")

Creating a simulated data file for prediction...
✅ Success! Created '/content/drive/MyDrive/crop_former_model/new_simulated_maize.csv' with 5 simulated samples.


In [None]:
t

In [10]:
# ===================================================================
# Master Prediction and Evaluation Script for Cropformer
# ===================================================================
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch.nn as nn
from lightning.pytorch import LightningModule
import os

# ========================
# 1. MODEL DEFINITION
# (This must be included so the script knows the model's structure)
# ========================
class SelfAttention(LightningModule):
    def __init__(self, num_attention_heads, input_size, hidden_size, output_dim=1, kernel_size=3,
                 hidden_dropout_prob=0.5, attention_probs_dropout_prob=0.5):
        super(SelfAttention, self).__init__()
        self.num_attention_heads = num_attention_heads
        self.attention_head_size = int(hidden_size / num_attention_heads)
        self.all_head_size = hidden_size
        self.query = torch.nn.Linear(input_size, self.all_head_size)
        self.key = torch.nn.Linear(input_size, self.all_head_size)
        self.value = torch.nn.Linear(input_size, self.all_head_size)
        self.attn_dropout = torch.nn.Dropout(attention_probs_dropout_prob)
        self.out_dropout = torch.nn.Dropout(hidden_dropout_prob)
        self.dense = torch.nn.Linear(hidden_size, input_size)
        self.LayerNorm = torch.nn.LayerNorm(input_size, eps=1e-12)
        self.relu = torch.nn.ReLU()
        self.out = torch.nn.Linear(input_size, output_dim)
        self.cnn = torch.nn.Conv1d(1, 1, kernel_size, stride=1, padding=1)

    def forward(self, input_tensor):
        cnn_hidden = self.cnn(input_tensor.view(input_tensor.size(0), 1, -1))
        input_tensor_after_cnn = cnn_hidden
        mixed_query_layer = self.query(input_tensor_after_cnn)
        mixed_key_layer = self.key(input_tensor_after_cnn)
        mixed_value_layer = self.value(input_tensor_after_cnn)
        query_layer, key_layer, value_layer = mixed_query_layer, mixed_key_layer, mixed_value_layer
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / np.sqrt(self.attention_head_size)
        attention_probs = torch.nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.attn_dropout(attention_probs)
        context_layer = torch.matmul(attention_probs, value_layer)
        hidden_states = self.dense(context_layer)
        hidden_states = self.out_dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor_after_cnn)
        output = self.out(self.relu(hidden_states.view(hidden_states.size(0), -1)))
        return output

# ========================
# 2. CONFIGURATION
# (Update these paths to match your Google Drive)
# ========================
DRIVE_FOLDER = '/content/drive/MyDrive/crop_former_model/'
MODEL_FOLDER = DRIVE_FOLDER

# Input files
TRAIN_DATA_PATH = os.path.join(DRIVE_FOLDER, 'X_train.csv')
TEST_DATA_PATH = os.path.join(DRIVE_FOLDER, 'X_test.csv')
TEST_LABELS_PATH = os.path.join(DRIVE_FOLDER, 'y_test.csv')
NEW_DATA_PATH = os.path.join(DRIVE_FOLDER, 'new_simulated_maize.csv')

# --- Hyperparameters (must match the models you trained) ---
INPUT_SIZE = 10000
HIDDEN_SIZE = 64

# ========================
# 3. MAIN SCRIPT LOGIC
# ========================
if __name__ == '__main__':
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f'Using device: {DEVICE}\n')

    # --- Part 1: Evaluate the accuracy of each saved model ---
    print("--- Evaluating Accuracy of Saved Models ---")

    # Load data for evaluation
    X_train_df = pd.read_csv(TRAIN_DATA_PATH, index_col=0)
    X_test_df = pd.read_csv(TEST_DATA_PATH, index_col=0)
    y_test_df = pd.read_csv(TEST_LABELS_PATH, index_col=0)

    # Fit a scaler on the training data
    scaler = StandardScaler()
    scaler.fit(X_train_df.values)

    # Scale the test data
    X_test_scaled = scaler.transform(X_test_df.values)
    X_test_tensor = torch.from_numpy(X_test_scaled).to(torch.float32).to(DEVICE)
    true_values = y_test_df.values.flatten()

    model_accuracies = {}

    for i in range(1, 6):
        model_name = f'best_model_fold_{i}.pth'
        model_path = os.path.join(MODEL_FOLDER, model_name)

        # NOTE: This assumes best params are similar across folds. For highest accuracy,
        # you'd need to load the specific params for each fold. We use the best overall.
        best_params = {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}

        model = SelfAttention(**best_params, input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE).to(DEVICE)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()

        with torch.no_grad():
            predictions_tensor = model(X_test_tensor)

        predicted_values = predictions_tensor.cpu().numpy().flatten()

        # Calculate Pearson Correlation
        accuracy = np.corrcoef(predicted_values, true_values)[0, 1]
        model_accuracies[model_name] = accuracy
        print(f"  - {model_name}: Accuracy = {accuracy:.4f}")

    print(f"\nAverage Accuracy: {np.mean(list(model_accuracies.values())):.4f}\n")

    # --- Part 2: Predict DTT for new data ---
    print(f"--- Predicting DTT for New Data from '{os.path.basename(NEW_DATA_PATH)}' ---")

    # Load the new data
    new_data_df = pd.read_csv(NEW_DATA_PATH, index_col=0)

    # Scale the new data using the SAME scaler fitted on the training data
    new_data_scaled = scaler.transform(new_data_df.values)
    new_data_tensor = torch.from_numpy(new_data_scaled).to(torch.float32).to(DEVICE)

    all_predictions = {}

    for i in range(1, 6):
        model_name = f'best_model_fold_{i}.pth'
        model_path = os.path.join(MODEL_FOLDER, model_name)

        best_params = {'num_attention_heads': 4, 'attention_probs_dropout_prob': 0.5}

        model = SelfAttention(**best_params, input_size=INPUT_SIZE, hidden_size=HIDDEN_SIZE).to(DEVICE)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
        model.eval()

        with torch.no_grad():
            predictions_tensor = model(new_data_tensor)

        all_predictions[f'Model_Fold_{i}'] = predictions_tensor.cpu().numpy().flatten()

    # Create a final results DataFrame
    results_df = pd.DataFrame(all_predictions, index=new_data_df.index)
    results_df['Ensemble_Average_DTT'] = results_df.mean(axis=1)

    print("\n✅ Prediction Complete. Results:\n")
    print(results_df.round(2))

Using device: cuda

--- Evaluating Accuracy of Saved Models ---
  - best_model_fold_1.pth: Accuracy = 0.8233
  - best_model_fold_2.pth: Accuracy = 0.9459
  - best_model_fold_3.pth: Accuracy = 0.9384
  - best_model_fold_4.pth: Accuracy = 0.9417
  - best_model_fold_5.pth: Accuracy = 0.9285

Average Accuracy: 0.9156

--- Predicting DTT for New Data from 'new_simulated_maize.csv' ---

✅ Prediction Complete. Results:

                Model_Fold_1  Model_Fold_2  Model_Fold_3  Model_Fold_4  \
IID                                                                      
Maize_Sample_1     93.419998     92.849998     91.199997     89.730003   
Maize_Sample_2     93.070000     93.879997     90.860001     87.940002   
Maize_Sample_3     93.430000     93.760002     92.230003     89.620003   
Maize_Sample_4     93.489998     93.779999     91.260002     88.610001   
Maize_Sample_5     93.639999     93.430000     93.379997     90.250000   

                Model_Fold_5  Ensemble_Average_DTT  
IID        