<div align='center'>
    <img src="https://github.com/HamzaElshafie/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images/blob/SSFTT/Figs/SSFTT_HL.png?raw=1" width="80%"/>
</div>

---
*Sun, L., Zhao, G., Zheng, Y., & Wu, Z. (2022). Spectral–spatial feature tokenization transformer for hyperspectral image classification. IEEE Transactions on Geoscience and Remote Sensing, 60, 1–14.*




In [1]:
!apt-get install git-lfs
!git lfs install --skip-smudge
!git clone https://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@github.com/HamzaElshafie/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images.git

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
git-lfs is already the newest version (3.0.2-1ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 4 not upgraded.
Git LFS initialized.
Cloning into 'Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images'...
remote: Enumerating objects: 974, done.[K
remote: Counting objects: 100% (319/319), done.[K
remote: Compressing objects: 100% (109/109), done.[K
remote: Total 974 (delta 232), reused 283 (delta 210), pack-reused 655 (from 1)[K
Receiving objects: 100% (974/974), 13.36 MiB | 27.69 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [2]:
# Switch the freshly cloned repository to the SSFTT branch.
# Comment out this checkout to keep working on the default (main) branch instead.
!cd /content/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images && git checkout SSFTT --

Branch 'SSFTT' set up to track remote branch 'SSFTT' from 'origin'.
Switched to a new branch 'SSFTT'


In [3]:
# Runtime dependencies for this notebook.
# NOTE(review): none of these installs is version-pinned, so a fresh run may resolve
# different package versions than the ones that produced the recorded outputs.
!pip install einops
!pip install optuna
# torch-xla provides the XLA device / parallel loader used by the training code below.
!pip install cloud-tpu-client torch-xla
# NOTE(review): presumably the stock tensorflow build is swapped for tensorflow-cpu to
# avoid clashing with the TPU runtime — TODO confirm the reason.
!pip uninstall -y tensorflow
!pip install tensorflow-cpu
# Alternative (disabled) setup for a CUDA GPU runtime instead of the XLA one:
# !pip install torch
# Uninstall any existing PyTorch installation
#!pip uninstall -y torch
# Install PyTorch (GPU version with CUDA 12.1)
#!pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121

Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m661.2 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.8.0
Collecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.2-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.3.0 (from optuna)
  Downloading SQLAlchemy-2.0.32-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet!=0.4.17 (from sqlalchemy>=1.3.0->optuna)
  Downloading greenlet-3.0.3-cp310-cp310-manylinux_2_24_x86_64.manylinux

In [4]:
import numpy as np
import scipy.io as sio
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, cohen_kappa_score
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
from operator import truediv
import sys
import time
import matplotlib.pyplot as plt
import os
import zipfile
import argparse
import scipy.io as sio
import psutil
import gc
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.data_parallel as dp
import torch_xla.distributed.xla_multiprocessing as xmp
import torch_xla.utils.serialization as xser
import torch_xla.utils.utils as xu
import torch_xla.distributed.parallel_loader as pl
import pickle

In [5]:
# Colab-only: opens a browser upload dialog so the Kaggle API credentials file can be
# provided interactively instead of being stored in the notebook.
from google.colab import files
files.upload()  # Manually upload kaggle.json

# Move kaggle.json to the correct location
# (~/.kaggle/kaggle.json, with owner-only permissions so the API key is not world-readable).
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

Saving kaggle.json to kaggle.json


# **Training + Bayesian Optimization**



---



## HongHu



---



In [None]:
# Root of the cloned repository; added to sys.path so the project imports below resolve.
main_dir = '/content/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images'
sys.path.append(main_dir)

# Directory where the hyperspectral datasets live (the .mat paths below are built from it).
data_dir = '/content/Data'
sys.path.append(data_dir)

# Project-local modules — presumably provided by the cloned repository (TODO confirm).
# Note that `SSFTTnet` is used as a module here: the model class is `SSFTTnet.SSFTTnet`.
from data_fetcher import loadData
from SSFTT import SSFTTnet
import get_cls_map

# Batch size shared by every DataLoader built in this notebook (train, val, test, full scene).
BATCH_SIZE_TRAIN = 64

# Number of labelled classes per dataset (background label 0 excluded).
NUM_CLASSES = {
    'HanChuan': 16,
    'HongHu': 22,
    'LongKou': 9
}

# Short dataset name -> dataset folder name used under `data_dir` and passed to loadData.
dataset_mapping = {
    'HanChuan': 'WHU-Hi-HanChuan',
    'HongHu': 'WHU-Hi-HongHu',
    'LongKou': 'WHU-Hi-LongKou'
}

def loadDataWrapper(dataset, kaggle_json_path, train_samples):
    """Load a dataset's data cube, label map and fixed train/test sample masks.

    Args:
        dataset: Short dataset name; must be a key of ``dataset_mapping``
            ('HanChuan', 'HongHu' or 'LongKou').
        kaggle_json_path: Path to the Kaggle credentials file, forwarded to ``loadData``.
        train_samples: Sample-count suffix selecting which ``Train{N}.mat`` /
            ``Test{N}.mat`` pair to read (e.g. 300).

    Returns:
        Tuple ``(data, labels, train_mask, test_mask)``; ``data`` and ``labels`` come
        from ``loadData`` and the masks are the arrays stored in the two .mat files.

    Raises:
        KeyError: If ``dataset`` is unknown, or has no mask-key naming defined.
    """
    # Variable-name prefixes used inside the Train/Test .mat files, per dataset.
    mask_key_prefixes = {
        'HongHu': ('HHCYtrain', 'HHCYtest'),
        'HanChuan': ('Train', 'Test'),
        'LongKou': ('LKtrain', 'LKtest'),
    }

    dataset_name = dataset_mapping[dataset]  # KeyError for an unknown dataset, as before
    # Fail early (before any download/loading) instead of hitting an UnboundLocalError
    # at the return statement when a dataset has no mask naming defined.
    if dataset not in mask_key_prefixes:
        raise KeyError(f"No train/test mask key naming is defined for dataset '{dataset}'")
    train_prefix, test_prefix = mask_key_prefixes[dataset]

    samples_dir = f'{data_dir}/{dataset_name}/Training samples and test samples'
    train_file = f'{samples_dir}/Train{train_samples}.mat'
    test_file = f'{samples_dir}/Test{train_samples}.mat'

    # loadData must run before the masks are read: it is what fetches the dataset files.
    data, labels = loadData(dataset_name, kaggle_json_path)

    train_mask = sio.loadmat(train_file)[f'{train_prefix}{train_samples}']
    test_mask = sio.loadmat(test_file)[f'{test_prefix}{train_samples}']

    return data, labels, train_mask, test_mask

def applyPCA(X_train, X_val, X_test, pca_components=30):
    """Reduce the spectral dimension of patch cubes with a PCA fitted on the training set.

    Each input is a 4-D array whose last axis is the spectral axis. All pixels are
    flattened to rows, the whitening PCA is fitted on the training rows only, and the
    same transform is then applied to the validation and test rows.

    Args:
        X_train, X_val, X_test: Patch arrays with the spectral bands on the last axis.
        pca_components: Number of principal components requested.

    Returns:
        Tuple ``(train, val, test, pca)``: the three float32 arrays with their last
        axis replaced by the PCA components, and the fitted PCA model.
    """
    def _to_pixel_rows(cubes):
        # One row per pixel, one column per spectral band.
        return np.reshape(cubes, (-1, cubes.shape[-1])).astype(np.float32)

    def _to_cubes(rows, reference, n_bands):
        # Restore the leading three axes of `reference` around the reduced band axis.
        return np.reshape(rows, (reference.shape[0], reference.shape[1], reference.shape[2], n_bands))

    train_rows = _to_pixel_rows(X_train)
    val_rows = _to_pixel_rows(X_val)
    test_rows = _to_pixel_rows(X_test)

    reducer = PCA(n_components=pca_components, whiten=True)
    train_rows = reducer.fit_transform(train_rows).astype(np.float32)
    val_rows = reducer.transform(val_rows).astype(np.float32)
    test_rows = reducer.transform(test_rows).astype(np.float32)

    valid_components = reducer.n_components_
    print(f"Valid components: {valid_components}")

    return (
        _to_cubes(train_rows, X_train, valid_components),
        _to_cubes(val_rows, X_val, valid_components),
        _to_cubes(test_rows, X_test, valid_components),
        reducer,
    )

def padWithZeros(X, margin=2):
    """Zero-pad the two spatial axes of a (rows, cols, bands) array.

    Args:
        X: 3-D array; the first two axes are padded, the last axis is left untouched.
        margin: Number of zero rows/columns added on every spatial side.

    Returns:
        A new float32 array of shape (rows + 2*margin, cols + 2*margin, bands).
    """
    spatial_pad = (margin, margin)
    return np.pad(
        X.astype(np.float32),
        (spatial_pad, spatial_pad, (0, 0)),
        mode='constant',
        constant_values=0,
    )

def createImageCubesWithMask(X, y, mask, mask_type, windowSize=13):
    """Extract one square patch per pixel selected by `mask`.

    Args:
        X: (rows, cols, bands) data cube.
        y: (rows, cols) label map; the stored label is `y[row, col] - 1`.
        mask: (rows, cols) array; every position with a value >= 1 yields a patch.
        mask_type: Human-readable mask name, used only in the printed diagnostics.
        windowSize: Side length of the square patch centred on each selected pixel.

    Returns:
        Tuple ``(patchesData, patchesLabels)``: a float32 array of shape
        (n_selected, windowSize, windowSize, bands) and a float array of n_selected labels.
    """
    margin = int((windowSize - 1) / 2)
    print(f"Margin: {margin}")
    padded = padWithZeros(X, margin=margin)
    print(f"Padded X shape: {padded.shape}")

    positions = np.argwhere(mask >= 1)
    n_patches = len(positions)
    patchesData = np.zeros((n_patches, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    patchesLabels = np.zeros(n_patches)

    values, occurrences = np.unique(mask, return_counts=True)
    mask_stats = dict(zip(values, occurrences))
    print(f"{mask_type} statistics: {mask_stats}")

    print(f"y shape: {y.shape}, type: {type(y)}")
    print(f"{mask_type} positions shape: {positions.shape}, type: {type(positions)}")
    print("_________________________________________________________________________\n")

    # In padded coordinates the window centred on (row, col) starts at (row, col) itself
    # and spans 2*margin + 1 pixels along each spatial axis.
    span = 2 * margin + 1
    for idx, (row, col) in enumerate(positions):
        window = padded[row:row + span, col:col + span, :].astype(np.float32)
        patchesData[idx, :, :, :] = window
        patchesLabels[idx] = y[row, col] - 1

    return patchesData, patchesLabels

def createImageCubes(X, y, windowSize=13, removeZeroLabels=True):
    """Extract a square patch around every pixel of the scene, in row-major order.

    Args:
        X: (rows, cols, bands) data cube.
        y: (rows, cols) label map.
        windowSize: Side length of the square patch centred on each pixel.
        removeZeroLabels: When true, patches whose label is not > 0 are dropped and the
            remaining labels are shifted down by one.

    Returns:
        Tuple ``(patchesData, patchesLabels)`` of patches (float32) and their labels.
    """
    margin = int((windowSize - 1) / 2)
    padded = padWithZeros(X, margin=margin)
    n_rows, n_cols = X.shape[0], X.shape[1]

    patchesData = np.zeros((n_rows * n_cols, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    patchesLabels = np.zeros((n_rows * n_cols))

    # In padded coordinates the window centred on (row, col) starts at (row, col).
    span = 2 * margin + 1
    for flat_index, (row, col) in enumerate(np.ndindex(n_rows, n_cols)):
        window = padded[row:row + span, col:col + span].astype(np.float32)
        patchesData[flat_index, :, :, :] = window
        patchesLabels[flat_index] = y[row, col]

    if removeZeroLabels:
        labelled = patchesLabels > 0
        patchesData = patchesData[labelled, :, :, :]
        patchesLabels = patchesLabels[labelled]
        patchesLabels -= 1

    return patchesData, patchesLabels

def create_data_loader(dataset, kaggle_json_path, patch_size, train_samples, validation_ratio=0.2):
    """Build (or load from cache) the train/val/test/full-scene DataLoaders for a dataset.

    On a cache miss the function loads the data and masks, cuts patches for the train
    and test masks, splits the training patches into train/validation (stratified),
    fits PCA on the training split only, and builds the four DataLoaders. The result is
    pickled under ``preprocessed_patches/{dataset}/`` (relative to the working directory).

    Args:
        dataset: Short dataset name understood by ``loadDataWrapper``.
        kaggle_json_path: Path to the Kaggle credentials file.
        patch_size: Side length of the square patches (window size).
        train_samples: Sample-count suffix of the Train/Test mask files to use.
        validation_ratio: Fraction of the training patches held out for validation.

    Returns:
        Tuple ``(train_loader, val_loader, test_loader, all_data_loader, y,
        pca_components)`` where ``y`` is the full label map and ``pca_components`` is
        the number of components actually kept by the fitted PCA.

    NOTE(review): the cache key is (dataset, patch_size, train_samples) only, so a
    cached entry is returned unchanged even when ``validation_ratio`` differs.
    NOTE(review): the cache is read with ``pickle.load``; only load cache files that
    this notebook produced itself (unpickling untrusted files can execute code).
    """
    # Define cache file name
    cache_dir = f'preprocessed_patches/{dataset}'
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = f'{cache_dir}/patch_size_{patch_size}_samples_{train_samples}.pkl'

    # If cache file exists, load the data from it
    if os.path.exists(cache_file):
        print(f"\nLoading cached data for patch size {patch_size} and train samples {train_samples}...")
        with open(cache_file, 'rb') as f:
            data = pickle.load(f)  # Ensure this is a dictionary
            return data['train_loader'], data['val_loader'], data['test_loader'], data['all_data_loader'], data['y'], data['pca_components']

    # If cache does not exist, process the data as usual
    X, y, train_mask, test_mask = loadDataWrapper(dataset, kaggle_json_path, train_samples)
    # Number of principal components requested from PCA (fixed for every run).
    pca_components = 30

    print('Hyperspectral data shape: ', X.shape)
    print('Label shape: ', y.shape)

    print(f'Train mask shape: {train_mask.shape}')
    print(f'Test mask shape: {test_mask.shape}')

    print(f"Patch (window) size: {patch_size}")

    # Patches are cut from the raw cube first so PCA can later be fitted on training
    # patches only.
    print('\n... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...')
    Xtrain, ytrain = createImageCubesWithMask(X, y, train_mask, mask_type="Training mask", windowSize=patch_size)
    Xtest, ytest = createImageCubesWithMask(X, y, test_mask, mask_type="Testing mask", windowSize=patch_size)
    print('Xtrain shape: ', Xtrain.shape)
    print('Xtest shape: ', Xtest.shape)
    print('ytrain shape: ', ytrain.shape)
    print('ytest shape: ', ytest.shape)

    gc.collect()

    # Stratified split of the training data to ensure balance
    # (fixed random_state so the split is the same on every run).
    train_indices, val_indices = train_test_split(
        np.arange(len(ytrain)),
        test_size=validation_ratio,
        stratify=ytrain,
        random_state=42
    )

    X_train_split = Xtrain[train_indices]
    y_train_split = ytrain[train_indices].astype(int)
    X_val_split = Xtrain[val_indices]
    y_val_split = ytrain[val_indices].astype(int)

    # Diagnostic only: per-class sample counts of both splits.
    unique_train, counts_train = np.unique(y_train_split, return_counts=True)
    train_distribution = dict(zip(unique_train, counts_train))
    print("\nTraining set class distribution after split", train_distribution)

    unique_val, counts_val = np.unique(y_val_split, return_counts=True)
    val_distribution = dict(zip(unique_val, counts_val))
    print("\nValidation set class distribution after split:", val_distribution)

    # applyPCA fits on the training split and only transforms validation/test.
    print('\n... ... PCA transformation ... ...')
    X_train_pca, X_val_pca, X_test_pca, pca_model = applyPCA(X_train_split, X_val_split, Xtest, pca_components)
    print('Xtrain shape after PCA: ', X_train_pca.shape)
    print('Xval shape after PCA: ', X_val_pca.shape)
    print('Xtest shape after PCA: ', X_test_pca.shape)

    # Add a singleton channel axis and reorder
    # (N, patch, patch, components, 1) -> (N, 1, components, patch, patch).
    X_train_pca = X_train_pca.reshape(-1, patch_size, patch_size, pca_model.n_components_, 1).transpose(0, 4, 3, 1, 2)
    X_val_pca = X_val_pca.reshape(-1, patch_size, patch_size, pca_model.n_components_, 1).transpose(0, 4, 3, 1, 2)
    X_test_pca = X_test_pca.reshape(-1, patch_size, patch_size, pca_model.n_components_, 1).transpose(0, 4, 3, 1, 2)

    trainset = TrainDS(X_train_pca, y_train_split)
    valset = TestDS(X_val_pca, y_val_split)
    testset = TestDS(X_test_pca, ytest)

    # Only the training loader shuffles; evaluation loaders keep a fixed order.
    train_loader = torch.utils.data.DataLoader(dataset=trainset, batch_size=BATCH_SIZE_TRAIN, shuffle=True, num_workers=2)
    val_loader = torch.utils.data.DataLoader(dataset=valset, batch_size=BATCH_SIZE_TRAIN, shuffle=False, num_workers=2)
    test_loader = torch.utils.data.DataLoader(dataset=testset, batch_size=BATCH_SIZE_TRAIN, shuffle=False, num_workers=2)

    gc.collect()

    # For full scene classification
    # The already-fitted PCA is applied to every pixel of the scene, then a patch is cut
    # around each pixel (createImageCubes drops zero-label pixels by default).
    newX = np.reshape(X, (-1, X.shape[-1])).astype(np.float32)
    X_pca_full = pca_model.transform(newX).astype(np.float32)
    X_pca_full = np.reshape(X_pca_full, (X.shape[0], X.shape[1], pca_model.n_components_))
    X_pca_full, y_all = createImageCubes(X_pca_full, y, windowSize=patch_size)
    X_pca_full = X_pca_full.reshape(-1, patch_size, patch_size, pca_model.n_components_, 1).transpose(0, 4, 3, 1, 2)
    all_data_loader = torch.utils.data.DataLoader(dataset=TestDS(X_pca_full, y_all), batch_size=BATCH_SIZE_TRAIN, shuffle=False, num_workers=2)

    # Save to cache as a dictionary
    # NOTE(review): this pickles whole DataLoader objects (including their tensors), so
    # cache files can be large; the fitted PCA model itself is not stored.
    with open(cache_file, 'wb') as f:
        pickle.dump({
            'train_loader': train_loader,
            'val_loader': val_loader,
            'test_loader': test_loader,
            'all_data_loader': all_data_loader,
            'y': y,
            'pca_components': pca_model.n_components_,
        }, f)

    return train_loader, val_loader, test_loader, all_data_loader, y, pca_model.n_components_

def train(train_loader, val_loader, num_classes, pca_components, lr, dropout, num_tokens, heads, epochs=100):
    """Train an SSFTT network on the XLA device and keep the best-validation weights.

    Args:
        train_loader: DataLoader yielding (data, target) training batches.
        val_loader: DataLoader yielding (data, target) validation batches.
        num_classes: Number of output classes.
        pca_components: Number of spectral components in the input patches.
        lr: Adam learning rate.
        dropout: Dropout rate passed to the network.
        num_tokens: Number of tokens passed to the network.
        heads: Number of attention heads passed to the network.
        epochs: Number of training epochs.

    Returns:
        Tuple ``(net, device, training_losses, validation_losses)``. ``net`` carries
        the weights of the epoch with the lowest average validation loss; the two
        lists hold the per-epoch average losses.

    Side effects:
        Writes the best weights to
        ``{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}_best.pth``
        (relies on the module-level ``main_dir`` and ``args``).
    """
    device = xm.xla_device()
    print(f"PCA components passed: {pca_components}")
    net = SSFTTnet.SSFTTnet(
        in_channels=1,
        num_classes=num_classes,
        pca_components=pca_components,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    best_val_loss = float('inf')
    best_state = None

    training_losses = []
    validation_losses = []

    for epoch in range(epochs):
        # Training pass
        net.train()
        total_loss = 0
        para_loader = pl.ParallelLoader(train_loader, [device])
        for data, target in para_loader.per_device_loader(device):
            data, target = data.to(device), target.to(device)
            outputs = net(data)
            loss = criterion(outputs, target)
            optimizer.zero_grad()
            loss.backward()
            xm.optimizer_step(optimizer)
            total_loss += loss.item()

        avg_training_loss = total_loss / len(train_loader)
        training_losses.append(avg_training_loss)

        # Validate
        para_loader_val = pl.ParallelLoader(val_loader, [device])
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for data, target in para_loader_val.per_device_loader(device):
                data, target = data.to(device), target.to(device)
                outputs = net(data)
                loss = criterion(outputs, target)
                val_loss += loss.item()

        avg_validation_loss = val_loss / len(val_loader)
        validation_losses.append(avg_validation_loss)

        print('[Epoch: %d] [training loss avg: %.4f] [validation loss avg: %.4f]' %
              (epoch + 1, avg_training_loss, avg_validation_loss))

        # Save the best model based on validation loss.
        # state_dict() returns references to the live parameters, which keep changing as
        # training continues, so the best weights must be copied (to CPU) right now.
        if avg_validation_loss < best_val_loss:
            best_val_loss = avg_validation_loss
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in net.state_dict().items()
            }

    if best_state is not None:
        # Hand back the network in its best-validation state rather than its last-epoch
        # state, so the returned model matches the checkpoint written below.
        net.load_state_dict(best_state)
        checkpoint_dir = f'{main_dir}/SSFTT/cls_params'
        os.makedirs(checkpoint_dir, exist_ok=True)
        torch.save(best_state, f'{checkpoint_dir}/SSFTTnet_params_{args.dataset}_best.pth')
        print('Best model saved with validation loss: {:.4f}'.format(best_val_loss))
    print("Finished training")

    return net, device, training_losses, validation_losses


def test(device, net, test_loader):
    """Run inference over a loader and collect predicted and true class indices.

    Args:
        device: XLA device the network lives on.
        net: Trained network; it is switched to eval mode.
        test_loader: DataLoader yielding (inputs, labels) batches.

    Returns:
        Tuple ``(y_pred_test, y_test)`` of 1-D numpy arrays in loader order
        (both empty when the loader yields no batches).
    """
    net.eval()
    predicted_batches = []
    label_batches = []
    para_loader = pl.ParallelLoader(test_loader, [device])

    # Inference only: no autograd bookkeeping is needed.
    with torch.no_grad():
        for inputs, labels in para_loader.per_device_loader(device):
            inputs = inputs.to(device)
            predictions = torch.argmax(net(inputs), dim=1)
            predicted_batches.append(predictions.cpu().numpy())
            label_batches.append(labels.cpu().numpy())

    if not predicted_batches:
        return np.array([], dtype=np.int64), np.array([], dtype=np.int64)

    # Concatenate once at the end instead of re-copying the growing array every batch.
    return np.concatenate(predicted_batches), np.concatenate(label_batches)

def objective(trial):
    """Optuna objective: train one configuration and return its best validation loss.

    Samples the learning rate, dropout, patch size, token count and head count from
    `trial`, builds the data loaders for the sampled patch size, and trains for 100
    epochs. Relies on the module-level ``args`` and ``num_classes``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The lowest per-epoch average validation loss observed during training.
    """
    # suggest_float(..., log=True) is the supported replacement for the deprecated
    # suggest_loguniform and samples from the same log-uniform range.
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    patch_size = trial.suggest_categorical('patch_size', [7,9,11,13,15])
    num_tokens = trial.suggest_categorical('num_tokens', [2, 4, 6, 8, 10, 12])
    heads = trial.suggest_categorical('heads', [1, 2, 4, 8, 16])

    # Only the train/validation loaders and the PCA component count are needed here.
    train_loader, val_loader, _, _, _, pca_components = create_data_loader(
        args.dataset, args.kaggle_json_path, patch_size, train_samples=300
    )

    _, _, _, validation_losses = train(
        train_loader,
        val_loader,
        num_classes=num_classes,
        pca_components=pca_components,
        lr=lr,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads,
        epochs=100
    )

    # Return the minimum validation loss observed during training rather than last one
    # since we are saving the model parameters for that one
    return min(validation_losses)

def AA_andEachClassAccuracy(confusion_matrix):
    """Compute per-class accuracy and their mean from a confusion matrix.

    Args:
        confusion_matrix: Square matrix with true classes on the rows.

    Returns:
        Tuple ``(each_acc, average_acc)``: the per-row ratio of the diagonal entry to
        the row sum (non-finite ratios, e.g. from empty rows, are replaced via
        ``np.nan_to_num``) and the mean of those ratios.
    """
    correct_per_class = np.diag(confusion_matrix)
    samples_per_class = np.sum(confusion_matrix, axis=1)
    per_class_ratio = correct_per_class / samples_per_class
    each_acc = np.nan_to_num(per_class_ratio)
    return each_acc, np.mean(each_acc)

def acc_reports(y_test, y_pred_test, dataset):
    """Build the classification report and summary accuracy metrics for one dataset.

    Args:
        y_test: True class indices.
        y_pred_test: Predicted class indices.
        dataset: Short dataset name selecting the class-name list
            ('HanChuan', 'HongHu' or 'LongKou').

    Returns:
        Tuple ``(classification, oa, confusion, each_acc, aa, kappa)``: the text report,
        overall accuracy, confusion matrix, per-class accuracies, average accuracy and
        Cohen's kappa. All metrics except the report and matrix are scaled to percent.
    """
    # Class names per dataset, in label-index order.
    hanchuan_classes = [
        'Strawberry', 'Cowpea', 'Soybean', 'Sorghum', 'Water spinach', 'Watermelon',
        'Greens', 'Trees', 'Grass', 'Red roof', 'Gray roof', 'Plastic', 'Bare soil',
        'Road', 'Bright object', 'Water',
    ]
    honghu_classes = [
        'Red roof', 'Road', 'Bare soil', 'Cotton', 'Cotton firewood', 'Rape',
        'Chinese cabbage', 'Pakchoi', 'Cabbage', 'Tuber mustard',
        'Brassica parachinensis', 'Brassica chinensis', 'Small Brassica chinensis',
        'Lactuca sativa', 'Celtuce', 'Film covered lettuce', 'Romaine lettuce',
        'Carrot', 'White radish', 'Garlic sprout', 'Broad bean', 'Tree',
    ]
    longkou_classes = [
        'Corn', 'Cotton', 'Sesame', 'Broad-leaf soybean', 'Narrow-leaf soybean',
        'Rice', 'Water', 'Roads and houses', 'Mixed weed',
    ]
    target_mapping = {
        'HanChuan': hanchuan_classes,
        'HongHu': honghu_classes,
        'LongKou': longkou_classes,
    }

    class_names = target_mapping[dataset]
    report = classification_report(y_test, y_pred_test, digits=4, target_names=class_names)
    overall = accuracy_score(y_test, y_pred_test)
    matrix = confusion_matrix(y_test, y_pred_test)
    per_class, average = AA_andEachClassAccuracy(matrix)
    kappa_score = cohen_kappa_score(y_test, y_pred_test)

    return report, overall*100, matrix, per_class*100, average*100, kappa_score*100

class TestDS():
    """Map-style dataset wrapping evaluation patches and their labels as tensors."""

    def __init__(self, Xtest, ytest):
        """Convert the patch array to a float tensor and the labels to a long tensor."""
        self.x_data = torch.FloatTensor(Xtest)
        self.y_data = torch.LongTensor(ytest)
        # Number of samples, taken from the leading axis of the patch array.
        self.len = Xtest.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(patch, label)`` pair stored at `index`."""
        patch = self.x_data[index]
        label = self.y_data[index]
        return patch, label

class TrainDS():
    """Map-style dataset wrapping training patches and their labels as tensors."""

    def __init__(self, Xtrain, ytrain):
        """Convert the patch array to a float tensor and the labels to a long tensor."""
        self.x_data = torch.FloatTensor(Xtrain)
        self.y_data = torch.LongTensor(ytrain)
        # Number of samples, taken from the leading axis of the patch array.
        self.len = Xtrain.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(patch, label)`` pair stored at `index`."""
        patch = self.x_data[index]
        label = self.y_data[index]
        return patch, label

class Args:
    """Plain holder for the run settings: dataset name and Kaggle credentials path."""

    def __init__(self, dataset, kaggle_json_path):
        """Store both settings unchanged as attributes of the same name."""
        self.dataset, self.kaggle_json_path = dataset, kaggle_json_path

# Run settings read as globals by train() and objective().
# NOTE(review): the '~' in the credentials path is not expanded here — presumably
# loadData handles it; verify.
args = Args(dataset='HongHu', kaggle_json_path='~/.kaggle/kaggle.json')
num_classes = NUM_CLASSES[args.dataset]

# Hyperparameter search minimising the validation loss returned by objective().
# NOTE(review): a MedianPruner is configured, but objective() as written in this notebook
# never calls trial.report()/trial.should_prune(), so no trial can actually be pruned.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best parameters: ", best_params)

# Rebuild (or load from cache) the loaders for the winning patch size.
train_loader, val_loader, test_loader, all_data_loader, y_all, pca_components = create_data_loader(
    args.dataset, args.kaggle_json_path, patch_size=best_params['patch_size'], train_samples=300
)

# Training model using the optimal parameters found
tic1 = time.perf_counter()
best_net, device, training_losses, validation_losses = train(train_loader, val_loader, num_classes, pca_components, lr=best_params['lr'], dropout=best_params['dropout'], num_tokens=best_params['num_tokens'], heads=best_params['heads'], epochs=100)
toc1 = time.perf_counter()

# Training/Validation loss plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(training_losses) + 1), training_losses, label='Training Loss')
plt.plot(range(1, len(validation_losses) + 1), validation_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.xticks(ticks=range(1, len(training_losses) + 1, 5))
plt.legend()
plt.grid(True)
plt.show()

# Persist the weights of the network returned by train().
torch.save(best_net.state_dict(), f'{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}.pth')

# Timed evaluation on the held-out test loader.
tic2 = time.perf_counter()
y_pred_test, y_test = test(device, best_net, test_loader)
toc2 = time.perf_counter()

classification, oa, confusion, each_acc, aa, kappa = acc_reports(y_test, y_pred_test, args.dataset)
print(f"{args.dataset} overall accuracy: {oa}")
classification = str(classification)
Training_Time = toc1 - tic1
Test_time = toc2 - tic2
# Write timings, summary metrics, the per-class report and the confusion matrix to a
# plain-text report file (the target directory must already exist).
report_file_name = f"{main_dir}/SSFTT/cls_results/classification_report_{args.dataset}.txt"
with open(report_file_name, 'w') as x_file:
    x_file.write('{} Training_Time (s)'.format(Training_Time))
    x_file.write('\n')
    x_file.write('{} Test_time (s)'.format(Test_time))
    x_file.write('\n')
    x_file.write('{} Kappa accuracy (%)'.format(kappa))
    x_file.write('\n')
    x_file.write('{} Overall accuracy (%)'.format(oa))
    x_file.write('\n')
    x_file.write('{} Average accuracy (%)'.format(aa))
    x_file.write('\n')
    x_file.write('{} Each accuracy (%)'.format(each_acc))
    x_file.write('\n')
    x_file.write('{}'.format(classification))
    x_file.write('\n')
    x_file.write('{}'.format(confusion))

# Produce the full-scene classification map from the all-pixel loader.
get_cls_map.get_cls_map(best_net, device, all_data_loader, y_all, args.dataset, model="SSFTT")

[I 2024-08-31 20:33:17,517] A new study created in memory with name: no-name-ae5eea13-e841-4fd3-92d1-3e40c15369ad
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)


Dataset URL: https://www.kaggle.com/datasets/rupeshkumaryadav/whu-hyperspectral-dataset
Downloaded and extracted the dataset WHU-Hi-HongHu to /content/Data
Loading data from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu.mat
Loading labels from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu_gt.mat
Keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Available keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Available keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Hyperspectral data shape:  (940, 475, 270)
Label shape:  (940, 475)
Train mask shape: (940, 475)
Test mask shape: (940, 475)
Patch (window) size: 7

... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...
Margin: 3
Padded X shape: (946, 481, 270)
Training mask statistics: {0: 439900, 1: 300, 2: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 3

[I 2024-08-31 20:46:51,557] Trial 0 finished with value: 0.2940286827229318 and parameters: {'lr': 0.00020271762685680864, 'dropout': 0.2650193079515236, 'patch_size': 7, 'num_tokens': 10, 'heads': 2}. Best is trial 0 with value: 0.2940286827229318.


[Epoch: 100] [training loss avg: 0.0183] [validation loss avg: 0.6255]
Best model saved with validation loss: 0.2940
Finished training
Dataset /content/Data/WHU-Hi-HongHu already exists, skipping download.
Loading data from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu.mat
Loading labels from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu_gt.mat


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)


Keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Available keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Available keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Hyperspectral data shape:  (940, 475, 270)
Label shape:  (940, 475)
Train mask shape: (940, 475)
Test mask shape: (940, 475)
Patch (window) size: 9

... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...
Margin: 4
Padded X shape: (948, 483, 270)
Training mask statistics: {0: 439900, 1: 300, 2: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 300, 8: 300, 9: 300, 10: 300, 11: 300, 12: 300, 13: 300, 14: 300, 15: 300, 16: 300, 17: 300, 18: 300, 19: 300, 20: 300, 21: 300, 22: 300}
y shape: (940, 475), type: <class 'numpy.ndarray'>
Training mask positions shape: (6600, 2), type: <class 'numpy.ndarray'>
_____________________________

[I 2024-08-31 21:00:48,672] Trial 1 finished with value: 0.2168125416196528 and parameters: {'lr': 5.109900453298649e-05, 'dropout': 0.22415287930017833, 'patch_size': 9, 'num_tokens': 2, 'heads': 1}. Best is trial 1 with value: 0.2168125416196528.


[Epoch: 100] [training loss avg: 0.0019] [validation loss avg: 0.3043]
Best model saved with validation loss: 0.2168
Finished training
Dataset /content/Data/WHU-Hi-HongHu already exists, skipping download.
Loading data from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu.mat
Loading labels from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu_gt.mat


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)


Keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Available keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Available keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Hyperspectral data shape:  (940, 475, 270)
Label shape:  (940, 475)
Train mask shape: (940, 475)
Test mask shape: (940, 475)
Patch (window) size: 15

... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...
Margin: 7
Padded X shape: (954, 489, 270)
Training mask statistics: {0: 439900, 1: 300, 2: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 300, 8: 300, 9: 300, 10: 300, 11: 300, 12: 300, 13: 300, 14: 300, 15: 300, 16: 300, 17: 300, 18: 300, 19: 300, 20: 300, 21: 300, 22: 300}
y shape: (940, 475), type: <class 'numpy.ndarray'>
Training mask positions shape: (6600, 2), type: <class 'numpy.ndarray'>
____________________________

[I 2024-08-31 21:21:02,190] Trial 2 finished with value: 0.05220957204610819 and parameters: {'lr': 0.00020733959650261712, 'dropout': 0.33669326643658815, 'patch_size': 15, 'num_tokens': 12, 'heads': 2}. Best is trial 2 with value: 0.05220957204610819.


[Epoch: 100] [training loss avg: 0.0005] [validation loss avg: 0.0732]
Best model saved with validation loss: 0.0522
Finished training

Loading cached data for patch size 9 and train samples 300...


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  return torch.load(io.BytesIO(b))


PCA components passed: 30
[Epoch: 1] [training loss avg: 3.1410] [validation loss avg: 2.8815]
[Epoch: 2] [training loss avg: 2.8525] [validation loss avg: 2.5979]
[Epoch: 3] [training loss avg: 2.6109] [validation loss avg: 2.3377]
[Epoch: 4] [training loss avg: 2.4298] [validation loss avg: 2.1163]
[Epoch: 5] [training loss avg: 2.2300] [validation loss avg: 1.9139]
[Epoch: 6] [training loss avg: 2.0869] [validation loss avg: 1.7561]
[Epoch: 7] [training loss avg: 1.9441] [validation loss avg: 1.6225]
[Epoch: 8] [training loss avg: 1.8389] [validation loss avg: 1.5112]
[Epoch: 9] [training loss avg: 1.7480] [validation loss avg: 1.4125]
[Epoch: 10] [training loss avg: 1.6599] [validation loss avg: 1.3285]
[Epoch: 11] [training loss avg: 1.5893] [validation loss avg: 1.2545]
[Epoch: 12] [training loss avg: 1.5193] [validation loss avg: 1.1820]
[Epoch: 13] [training loss avg: 1.4292] [validation loss avg: 1.1204]
[Epoch: 14] [training loss avg: 1.3736] [validation loss avg: 1.0593]
[Ep

[I 2024-08-31 21:34:40,389] Trial 3 finished with value: 0.2061551401302928 and parameters: {'lr': 1.4840493310704266e-05, 'dropout': 0.4288824841580011, 'patch_size': 9, 'num_tokens': 8, 'heads': 1}. Best is trial 2 with value: 0.05220957204610819.


Dataset /content/Data/WHU-Hi-HongHu already exists, skipping download.
Loading data from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu.mat
Loading labels from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu_gt.mat
Keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Available keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Available keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Hyperspectral data shape:  (940, 475, 270)
Label shape:  (940, 475)
Train mask shape: (940, 475)
Test mask shape: (940, 475)
Patch (window) size: 11

... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...
Margin: 5
Padded X shape: (950, 485, 270)
Training mask statistics: {0: 439900, 1: 300, 2: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 300, 8: 300, 9: 300, 10: 300, 11: 300, 12: 300, 13: 300, 14: 300, 15: 300, 16: 300, 1

[I 2024-08-31 21:51:00,364] Trial 4 finished with value: 0.1143284458223553 and parameters: {'lr': 4.107358714277235e-05, 'dropout': 0.19715250626162784, 'patch_size': 11, 'num_tokens': 12, 'heads': 1}. Best is trial 2 with value: 0.05220957204610819.


[Epoch: 100] [training loss avg: 0.0130] [validation loss avg: 0.2219]
Best model saved with validation loss: 0.1143
Finished training

Loading cached data for patch size 15 and train samples 300...


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  return torch.load(io.BytesIO(b))


PCA components passed: 30
[Epoch: 1] [training loss avg: 2.2054] [validation loss avg: 1.2482]
[Epoch: 2] [training loss avg: 1.0655] [validation loss avg: 0.7227]
[Epoch: 3] [training loss avg: 0.6122] [validation loss avg: 0.4024]
[Epoch: 4] [training loss avg: 0.3663] [validation loss avg: 0.2331]
[Epoch: 5] [training loss avg: 0.2389] [validation loss avg: 0.2029]
[Epoch: 6] [training loss avg: 0.1622] [validation loss avg: 0.1613]
[Epoch: 7] [training loss avg: 0.1171] [validation loss avg: 0.1553]
[Epoch: 8] [training loss avg: 0.0971] [validation loss avg: 0.1595]
[Epoch: 9] [training loss avg: 0.0680] [validation loss avg: 0.2342]
[Epoch: 10] [training loss avg: 0.0608] [validation loss avg: 0.1053]
[Epoch: 11] [training loss avg: 0.0422] [validation loss avg: 0.1158]
[Epoch: 12] [training loss avg: 0.0347] [validation loss avg: 0.0962]
[Epoch: 13] [training loss avg: 0.0274] [validation loss avg: 0.1115]
[Epoch: 14] [training loss avg: 0.0257] [validation loss avg: 0.1182]
[Ep

[I 2024-08-31 22:07:07,990] Trial 5 finished with value: 0.08710166426109416 and parameters: {'lr': 0.00015852979210026166, 'dropout': 0.29291528480378126, 'patch_size': 15, 'num_tokens': 4, 'heads': 1}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 7 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 3.1021] [validation loss avg: 2.8826]
[Epoch: 2] [training loss avg: 2.8429] [validation loss avg: 2.5981]
[Epoch: 3] [training loss avg: 2.5706] [validation loss avg: 2.3084]
[Epoch: 4] [training loss avg: 2.3181] [validation loss avg: 2.0464]
[Epoch: 5] [training loss avg: 2.0916] [validation loss avg: 1.8360]
[Epoch: 6] [training loss avg: 1.9196] [validation loss avg: 1.6722]
[Epoch: 7] [training loss avg: 1.7566] [validation loss avg: 1.5316]
[Epoch: 8] [training loss avg: 1.6301] [validation loss avg: 1.4109]
[Epoch: 9] [training loss avg: 1.5149] [validation loss avg: 1.3071]
[Epoch: 10] [training loss avg: 1.3985] [validation loss avg: 1.2101]
[Epoch: 11] [training loss avg: 1.3019] [validation loss avg: 1.1287]
[Epoch: 12] [training loss avg: 1.2229] [validation loss avg: 1.0502]
[Epoch: 13] [training loss avg: 1.1531] [validation loss avg: 0.9880]
[Epoch: 14

[I 2024-08-31 22:20:50,594] Trial 6 finished with value: 0.4110558990921293 and parameters: {'lr': 2.0920905133730838e-05, 'dropout': 0.25861340077099354, 'patch_size': 7, 'num_tokens': 2, 'heads': 2}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 9 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.4228] [validation loss avg: 0.6224]
[Epoch: 2] [training loss avg: 0.5283] [validation loss avg: 0.3112]
[Epoch: 3] [training loss avg: 0.3215] [validation loss avg: 0.2909]
[Epoch: 4] [training loss avg: 0.2418] [validation loss avg: 0.2511]
[Epoch: 5] [training loss avg: 0.1696] [validation loss avg: 0.2087]
[Epoch: 6] [training loss avg: 0.1237] [validation loss avg: 0.2553]
[Epoch: 7] [training loss avg: 0.0855] [validation loss avg: 0.1684]
[Epoch: 8] [training loss avg: 0.0706] [validation loss avg: 0.2039]
[Epoch: 9] [training loss avg: 0.0851] [validation loss avg: 0.2266]
[Epoch: 10] [training loss avg: 0.0621] [validation loss avg: 0.2708]
[Epoch: 11] [training loss avg: 0.0490] [validation loss avg: 0.2093]
[Epoch: 12] [training loss avg: 0.0492] [validation loss avg: 0.2031]
[Epoch: 13] [training loss avg: 0.0376] [validation loss avg: 0.1971]
[Epoch: 14

[I 2024-08-31 22:35:14,879] Trial 7 finished with value: 0.16838875297634376 and parameters: {'lr': 0.0008304975657036694, 'dropout': 0.3265176846844078, 'patch_size': 9, 'num_tokens': 10, 'heads': 8}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 11 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 3.0971] [validation loss avg: 2.6989]
[Epoch: 2] [training loss avg: 2.6332] [validation loss avg: 2.2253]
[Epoch: 3] [training loss avg: 2.2407] [validation loss avg: 1.8263]
[Epoch: 4] [training loss avg: 1.9256] [validation loss avg: 1.5227]
[Epoch: 5] [training loss avg: 1.6835] [validation loss avg: 1.3067]
[Epoch: 6] [training loss avg: 1.4891] [validation loss avg: 1.1364]
[Epoch: 7] [training loss avg: 1.3605] [validation loss avg: 1.0002]
[Epoch: 8] [training loss avg: 1.2234] [validation loss avg: 0.8839]
[Epoch: 9] [training loss avg: 1.1129] [validation loss avg: 0.7850]
[Epoch: 10] [training loss avg: 1.0074] [validation loss avg: 0.6964]
[Epoch: 11] [training loss avg: 0.9229] [validation loss avg: 0.6205]
[Epoch: 12] [training loss avg: 0.8381] [validation loss avg: 0.5551]
[Epoch: 13] [training loss avg: 0.7699] [validation loss avg: 0.5040]
[Epoch: 1

[I 2024-08-31 22:50:25,267] Trial 8 finished with value: 0.08853985723995027 and parameters: {'lr': 2.6467022576868203e-05, 'dropout': 0.3746670849528738, 'patch_size': 11, 'num_tokens': 2, 'heads': 2}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.4415] [validation loss avg: 0.5674]
[Epoch: 2] [training loss avg: 0.4643] [validation loss avg: 0.2962]
[Epoch: 3] [training loss avg: 0.2501] [validation loss avg: 0.1576]
[Epoch: 4] [training loss avg: 0.1648] [validation loss avg: 0.1281]
[Epoch: 5] [training loss avg: 0.1054] [validation loss avg: 0.1241]
[Epoch: 6] [training loss avg: 0.0843] [validation loss avg: 0.1361]
[Epoch: 7] [training loss avg: 0.0633] [validation loss avg: 0.1024]
[Epoch: 8] [training loss avg: 0.0533] [validation loss avg: 0.1233]
[Epoch: 9] [training loss avg: 0.0434] [validation loss avg: 0.1269]
[Epoch: 10] [training loss avg: 0.0329] [validation loss avg: 0.0863]
[Epoch: 11] [training loss avg: 0.0333] [validation loss avg: 0.0869]
[Epoch: 12] [training loss avg: 0.0225] [validation loss avg: 0.1373]
[Epoch: 13] [training loss avg: 0.0253] [validation loss avg: 0.0898]
[Epoch: 1

[I 2024-08-31 23:07:26,585] Trial 9 finished with value: 0.07213975915718558 and parameters: {'lr': 0.00043281193670123883, 'dropout': 0.24039115343107764, 'patch_size': 15, 'num_tokens': 12, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.


Dataset /content/Data/WHU-Hi-HongHu already exists, skipping download.
Loading data from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu.mat
Loading labels from /content/Data/WHU-Hi-HongHu/WHU_Hi_HongHu_gt.mat
Keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Available keys in data_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu']
Available keys in labels_mat: ['__header__', '__version__', '__globals__', 'WHU_Hi_HongHu_gt']
Hyperspectral data shape:  (940, 475, 270)
Label shape:  (940, 475)
Train mask shape: (940, 475)
Test mask shape: (940, 475)
Patch (window) size: 13

... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...
Margin: 6
Padded X shape: (952, 487, 270)
Training mask statistics: {0: 439900, 1: 300, 2: 300, 3: 300, 4: 300, 5: 300, 6: 300, 7: 300, 8: 300, 9: 300, 10: 300, 11: 300, 12: 300, 13: 300, 14: 300, 15: 300, 16: 300, 1

[I 2024-08-31 23:27:20,610] Trial 10 finished with value: 0.08985235476644621 and parameters: {'lr': 9.828845109236366e-05, 'dropout': 0.10287826139197079, 'patch_size': 13, 'num_tokens': 6, 'heads': 16}. Best is trial 2 with value: 0.05220957204610819.


[Epoch: 100] [training loss avg: 0.0004] [validation loss avg: 0.1204]
Best model saved with validation loss: 0.0899
Finished training

Loading cached data for patch size 15 and train samples 300...


  lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
  return torch.load(io.BytesIO(b))


PCA components passed: 30
[Epoch: 1] [training loss avg: 1.7012] [validation loss avg: 0.5936]
[Epoch: 2] [training loss avg: 0.5993] [validation loss avg: 0.3060]
[Epoch: 3] [training loss avg: 0.3285] [validation loss avg: 0.2512]
[Epoch: 4] [training loss avg: 0.2139] [validation loss avg: 0.2014]
[Epoch: 5] [training loss avg: 0.1701] [validation loss avg: 0.1033]
[Epoch: 6] [training loss avg: 0.1627] [validation loss avg: 0.1195]
[Epoch: 7] [training loss avg: 0.1019] [validation loss avg: 0.1172]
[Epoch: 8] [training loss avg: 0.1027] [validation loss avg: 0.1144]
[Epoch: 9] [training loss avg: 0.0781] [validation loss avg: 0.1501]
[Epoch: 10] [training loss avg: 0.0628] [validation loss avg: 0.1816]
[Epoch: 11] [training loss avg: 0.0576] [validation loss avg: 0.1469]
[Epoch: 12] [training loss avg: 0.0560] [validation loss avg: 0.1494]
[Epoch: 13] [training loss avg: 0.0505] [validation loss avg: 0.1144]
[Epoch: 14] [training loss avg: 0.0451] [validation loss avg: 0.0832]
[Ep

[I 2024-08-31 23:44:16,975] Trial 11 finished with value: 0.07733837210287761 and parameters: {'lr': 0.0004949369486494965, 'dropout': 0.46800062588259206, 'patch_size': 15, 'num_tokens': 12, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.6093] [validation loss avg: 0.6942]
[Epoch: 2] [training loss avg: 0.5305] [validation loss avg: 0.2977]
[Epoch: 3] [training loss avg: 0.2560] [validation loss avg: 0.2576]
[Epoch: 4] [training loss avg: 0.1489] [validation loss avg: 0.1648]
[Epoch: 5] [training loss avg: 0.1210] [validation loss avg: 0.1792]
[Epoch: 6] [training loss avg: 0.0877] [validation loss avg: 0.1181]
[Epoch: 7] [training loss avg: 0.0550] [validation loss avg: 0.1296]
[Epoch: 8] [training loss avg: 0.0546] [validation loss avg: 0.1438]
[Epoch: 9] [training loss avg: 0.0584] [validation loss avg: 0.1212]
[Epoch: 10] [training loss avg: 0.0219] [validation loss avg: 0.0959]
[Epoch: 11] [training loss avg: 0.0193] [validation loss avg: 0.1160]
[Epoch: 12] [training loss avg: 0.0229] [validation loss avg: 0.2017]
[Epoch: 13] [training loss avg: 0.0193] [validation loss avg: 0.1206]
[Epoch: 1

[I 2024-09-01 00:00:39,367] Trial 12 finished with value: 0.059219851007496584 and parameters: {'lr': 0.00030893457114765713, 'dropout': 0.15748518713784407, 'patch_size': 15, 'num_tokens': 12, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.8120] [validation loss avg: 0.8872]
[Epoch: 2] [training loss avg: 0.6720] [validation loss avg: 0.4296]
[Epoch: 3] [training loss avg: 0.3234] [validation loss avg: 0.2215]
[Epoch: 4] [training loss avg: 0.1975] [validation loss avg: 0.2480]
[Epoch: 5] [training loss avg: 0.1400] [validation loss avg: 0.1551]
[Epoch: 6] [training loss avg: 0.0972] [validation loss avg: 0.1409]
[Epoch: 7] [training loss avg: 0.0699] [validation loss avg: 0.1247]
[Epoch: 8] [training loss avg: 0.0496] [validation loss avg: 0.1214]
[Epoch: 9] [training loss avg: 0.0444] [validation loss avg: 0.0773]
[Epoch: 10] [training loss avg: 0.0352] [validation loss avg: 0.1361]
[Epoch: 11] [training loss avg: 0.0305] [validation loss avg: 0.1093]
[Epoch: 12] [training loss avg: 0.0212] [validation loss avg: 0.1119]
[Epoch: 13] [training loss avg: 0.0225] [validation loss avg: 0.0922]
[Epoch: 1

[I 2024-09-01 00:16:58,654] Trial 13 finished with value: 0.06409411447211391 and parameters: {'lr': 0.0002719755549895266, 'dropout': 0.141094442959751, 'patch_size': 15, 'num_tokens': 12, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.4796] [validation loss avg: 1.7022]
[Epoch: 2] [training loss avg: 1.6344] [validation loss avg: 1.0618]
[Epoch: 3] [training loss avg: 1.1301] [validation loss avg: 0.6905]
[Epoch: 4] [training loss avg: 0.8077] [validation loss avg: 0.4610]
[Epoch: 5] [training loss avg: 0.5983] [validation loss avg: 0.3402]
[Epoch: 6] [training loss avg: 0.4555] [validation loss avg: 0.2689]
[Epoch: 7] [training loss avg: 0.3594] [validation loss avg: 0.2047]
[Epoch: 8] [training loss avg: 0.2781] [validation loss avg: 0.1857]
[Epoch: 9] [training loss avg: 0.2301] [validation loss avg: 0.1698]
[Epoch: 10] [training loss avg: 0.1998] [validation loss avg: 0.1626]
[Epoch: 11] [training loss avg: 0.1700] [validation loss avg: 0.1600]
[Epoch: 12] [training loss avg: 0.1370] [validation loss avg: 0.1391]
[Epoch: 13] [training loss avg: 0.1191] [validation loss avg: 0.1055]
[Epoch: 1

[I 2024-09-01 00:34:23,882] Trial 14 finished with value: 0.06630058319119382 and parameters: {'lr': 9.154585326652438e-05, 'dropout': 0.35229481460929607, 'patch_size': 15, 'num_tokens': 12, 'heads': 16}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 13 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.1803] [validation loss avg: 0.5311]
[Epoch: 2] [training loss avg: 0.3423] [validation loss avg: 0.2038]
[Epoch: 3] [training loss avg: 0.1820] [validation loss avg: 0.1466]
[Epoch: 4] [training loss avg: 0.1228] [validation loss avg: 0.1707]
[Epoch: 5] [training loss avg: 0.0834] [validation loss avg: 0.1446]
[Epoch: 6] [training loss avg: 0.0630] [validation loss avg: 0.1532]
[Epoch: 7] [training loss avg: 0.0458] [validation loss avg: 0.0929]
[Epoch: 8] [training loss avg: 0.0363] [validation loss avg: 0.1256]
[Epoch: 9] [training loss avg: 0.0324] [validation loss avg: 0.1084]
[Epoch: 10] [training loss avg: 0.0336] [validation loss avg: 0.1676]
[Epoch: 11] [training loss avg: 0.0351] [validation loss avg: 0.1530]
[Epoch: 12] [training loss avg: 0.0437] [validation loss avg: 0.1483]
[Epoch: 13] [training loss avg: 0.0436] [validation loss avg: 0.1270]
[Epoch: 1

[I 2024-09-01 00:51:03,683] Trial 15 finished with value: 0.09285946429840156 and parameters: {'lr': 0.0008542959182487291, 'dropout': 0.16592303778589695, 'patch_size': 13, 'num_tokens': 4, 'heads': 8}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.6923] [validation loss avg: 0.6708]
[Epoch: 2] [training loss avg: 0.6089] [validation loss avg: 0.3928]
[Epoch: 3] [training loss avg: 0.3278] [validation loss avg: 0.2235]
[Epoch: 4] [training loss avg: 0.2166] [validation loss avg: 0.1847]
[Epoch: 5] [training loss avg: 0.1605] [validation loss avg: 0.2581]
[Epoch: 6] [training loss avg: 0.1296] [validation loss avg: 0.1901]
[Epoch: 7] [training loss avg: 0.0983] [validation loss avg: 0.1188]
[Epoch: 8] [training loss avg: 0.0719] [validation loss avg: 0.0980]
[Epoch: 9] [training loss avg: 0.0608] [validation loss avg: 0.0836]
[Epoch: 10] [training loss avg: 0.0511] [validation loss avg: 0.0956]
[Epoch: 11] [training loss avg: 0.0472] [validation loss avg: 0.0754]
[Epoch: 12] [training loss avg: 0.0341] [validation loss avg: 0.1282]
[Epoch: 13] [training loss avg: 0.0373] [validation loss avg: 0.1112]
[Epoch: 1

[I 2024-09-01 01:09:09,858] Trial 16 finished with value: 0.0753529003732616 and parameters: {'lr': 0.00038260820772143597, 'dropout': 0.3950177758400702, 'patch_size': 15, 'num_tokens': 6, 'heads': 2}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.2617] [validation loss avg: 1.5112]
[Epoch: 2] [training loss avg: 1.2482] [validation loss avg: 0.7605]
[Epoch: 3] [training loss avg: 0.7533] [validation loss avg: 0.4726]
[Epoch: 4] [training loss avg: 0.4802] [validation loss avg: 0.2897]
[Epoch: 5] [training loss avg: 0.3304] [validation loss avg: 0.2212]
[Epoch: 6] [training loss avg: 0.2390] [validation loss avg: 0.1595]
[Epoch: 7] [training loss avg: 0.1778] [validation loss avg: 0.1484]
[Epoch: 8] [training loss avg: 0.1513] [validation loss avg: 0.1472]
[Epoch: 9] [training loss avg: 0.1148] [validation loss avg: 0.1102]
[Epoch: 10] [training loss avg: 0.0895] [validation loss avg: 0.1137]
[Epoch: 11] [training loss avg: 0.0738] [validation loss avg: 0.0940]
[Epoch: 12] [training loss avg: 0.0593] [validation loss avg: 0.0933]
[Epoch: 13] [training loss avg: 0.0534] [validation loss avg: 0.0858]
[Epoch: 1

[I 2024-09-01 01:27:32,679] Trial 17 finished with value: 0.06189928890671581 and parameters: {'lr': 0.00013867481940012748, 'dropout': 0.32999320764455203, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.8328] [validation loss avg: 2.1378]
[Epoch: 2] [training loss avg: 2.0314] [validation loss avg: 1.4380]
[Epoch: 3] [training loss avg: 1.5670] [validation loss avg: 1.0596]
[Epoch: 4] [training loss avg: 1.2309] [validation loss avg: 0.7687]
[Epoch: 5] [training loss avg: 0.9951] [validation loss avg: 0.6052]
[Epoch: 6] [training loss avg: 0.7989] [validation loss avg: 0.4567]
[Epoch: 7] [training loss avg: 0.6567] [validation loss avg: 0.3636]
[Epoch: 8] [training loss avg: 0.5468] [validation loss avg: 0.3230]
[Epoch: 9] [training loss avg: 0.4599] [validation loss avg: 0.2480]
[Epoch: 10] [training loss avg: 0.3975] [validation loss avg: 0.2438]
[Epoch: 11] [training loss avg: 0.3381] [validation loss avg: 0.2075]
[Epoch: 12] [training loss avg: 0.2844] [validation loss avg: 0.1664]
[Epoch: 13] [training loss avg: 0.2469] [validation loss avg: 0.1524]
[Epoch: 1

[I 2024-09-01 01:45:01,905] Trial 18 finished with value: 0.0938031442208393 and parameters: {'lr': 6.903578511942849e-05, 'dropout': 0.49547822781810763, 'patch_size': 15, 'num_tokens': 12, 'heads': 2}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 13 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.9483] [validation loss avg: 0.9694]
[Epoch: 2] [training loss avg: 0.8226] [validation loss avg: 0.6015]
[Epoch: 3] [training loss avg: 0.4391] [validation loss avg: 0.3486]
[Epoch: 4] [training loss avg: 0.2697] [validation loss avg: 0.1952]
[Epoch: 5] [training loss avg: 0.1899] [validation loss avg: 0.1936]
[Epoch: 6] [training loss avg: 0.1285] [validation loss avg: 0.1817]
[Epoch: 7] [training loss avg: 0.0951] [validation loss avg: 0.1385]
[Epoch: 8] [training loss avg: 0.0795] [validation loss avg: 0.1046]
[Epoch: 9] [training loss avg: 0.0664] [validation loss avg: 0.0933]
[Epoch: 10] [training loss avg: 0.0459] [validation loss avg: 0.1014]
[Epoch: 11] [training loss avg: 0.0429] [validation loss avg: 0.1281]
[Epoch: 12] [training loss avg: 0.0442] [validation loss avg: 0.1476]
[Epoch: 13] [training loss avg: 0.0403] [validation loss avg: 0.1505]
[Epoch: 1

[I 2024-09-01 02:02:31,286] Trial 19 finished with value: 0.08542341089230918 and parameters: {'lr': 0.0002542587811514805, 'dropout': 0.29409523173955276, 'patch_size': 13, 'num_tokens': 12, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 7 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.7206] [validation loss avg: 0.7917]
[Epoch: 2] [training loss avg: 0.6809] [validation loss avg: 0.5003]
[Epoch: 3] [training loss avg: 0.4069] [validation loss avg: 0.4624]
[Epoch: 4] [training loss avg: 0.2595] [validation loss avg: 0.4088]
[Epoch: 5] [training loss avg: 0.1833] [validation loss avg: 0.3227]
[Epoch: 6] [training loss avg: 0.1112] [validation loss avg: 0.4310]
[Epoch: 7] [training loss avg: 0.1047] [validation loss avg: 0.3515]
[Epoch: 8] [training loss avg: 0.0511] [validation loss avg: 0.3124]
[Epoch: 9] [training loss avg: 0.0326] [validation loss avg: 0.3960]
[Epoch: 10] [training loss avg: 0.0306] [validation loss avg: 0.3324]
[Epoch: 11] [training loss avg: 0.0523] [validation loss avg: 0.3838]
[Epoch: 12] [training loss avg: 0.0511] [validation loss avg: 0.4175]
[Epoch: 13] [training loss avg: 0.0607] [validation loss avg: 0.4403]
[Epoch: 14

[I 2024-09-01 02:19:30,093] Trial 20 finished with value: 0.31244678014800664 and parameters: {'lr': 0.0004920736303493668, 'dropout': 0.17700074605616745, 'patch_size': 7, 'num_tokens': 12, 'heads': 16}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.0086] [validation loss avg: 1.1059]
[Epoch: 2] [training loss avg: 0.9519] [validation loss avg: 0.4808]
[Epoch: 3] [training loss avg: 0.5151] [validation loss avg: 0.3160]
[Epoch: 4] [training loss avg: 0.3175] [validation loss avg: 0.2261]
[Epoch: 5] [training loss avg: 0.2180] [validation loss avg: 0.1899]
[Epoch: 6] [training loss avg: 0.1477] [validation loss avg: 0.1376]
[Epoch: 7] [training loss avg: 0.1152] [validation loss avg: 0.1258]
[Epoch: 8] [training loss avg: 0.0905] [validation loss avg: 0.1091]
[Epoch: 9] [training loss avg: 0.0767] [validation loss avg: 0.1368]
[Epoch: 10] [training loss avg: 0.0703] [validation loss avg: 0.1189]
[Epoch: 11] [training loss avg: 0.0599] [validation loss avg: 0.1015]
[Epoch: 12] [training loss avg: 0.0452] [validation loss avg: 0.1511]
[Epoch: 13] [training loss avg: 0.0348] [validation loss avg: 0.0884]
[Epoch: 1

[I 2024-09-01 02:37:19,173] Trial 21 finished with value: 0.06639763792710644 and parameters: {'lr': 0.0001963823162889303, 'dropout': 0.3311224955818673, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 2 with value: 0.05220957204610819.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.3413] [validation loss avg: 1.4270]
[Epoch: 2] [training loss avg: 1.3072] [validation loss avg: 0.7431]
[Epoch: 3] [training loss avg: 0.8015] [validation loss avg: 0.4354]
[Epoch: 4] [training loss avg: 0.5190] [validation loss avg: 0.2892]
[Epoch: 5] [training loss avg: 0.3739] [validation loss avg: 0.2205]
[Epoch: 6] [training loss avg: 0.2772] [validation loss avg: 0.1785]
[Epoch: 7] [training loss avg: 0.2069] [validation loss avg: 0.1467]
[Epoch: 8] [training loss avg: 0.1688] [validation loss avg: 0.1326]
[Epoch: 9] [training loss avg: 0.1368] [validation loss avg: 0.1137]
[Epoch: 10] [training loss avg: 0.1180] [validation loss avg: 0.2264]
[Epoch: 11] [training loss avg: 0.1012] [validation loss avg: 0.0896]
[Epoch: 12] [training loss avg: 0.0813] [validation loss avg: 0.0832]
[Epoch: 13] [training loss avg: 0.0674] [validation loss avg: 0.0796]
[Epoch: 1

[I 2024-09-01 02:55:08,614] Trial 22 finished with value: 0.052084781462326646 and parameters: {'lr': 0.00013863489483455858, 'dropout': 0.410867796554047, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.7899] [validation loss avg: 0.8447]
[Epoch: 2] [training loss avg: 0.7008] [validation loss avg: 0.3476]
[Epoch: 3] [training loss avg: 0.3627] [validation loss avg: 0.2467]
[Epoch: 4] [training loss avg: 0.2391] [validation loss avg: 0.2200]
[Epoch: 5] [training loss avg: 0.1694] [validation loss avg: 0.1344]
[Epoch: 6] [training loss avg: 0.1243] [validation loss avg: 0.1547]
[Epoch: 7] [training loss avg: 0.1066] [validation loss avg: 0.1038]
[Epoch: 8] [training loss avg: 0.0863] [validation loss avg: 0.1342]
[Epoch: 9] [training loss avg: 0.0684] [validation loss avg: 0.1314]
[Epoch: 10] [training loss avg: 0.0545] [validation loss avg: 0.0968]
[Epoch: 11] [training loss avg: 0.0506] [validation loss avg: 0.1302]
[Epoch: 12] [training loss avg: 0.0482] [validation loss avg: 0.0800]
[Epoch: 13] [training loss avg: 0.0366] [validation loss avg: 0.0990]
[Epoch: 1

[I 2024-09-01 03:13:02,605] Trial 23 finished with value: 0.0799847212841823 and parameters: {'lr': 0.00030833858419376965, 'dropout': 0.3944969226782589, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.4104] [validation loss avg: 1.5168]
[Epoch: 2] [training loss avg: 1.4320] [validation loss avg: 0.8707]
[Epoch: 3] [training loss avg: 0.9379] [validation loss avg: 0.5137]
[Epoch: 4] [training loss avg: 0.6350] [validation loss avg: 0.3412]
[Epoch: 5] [training loss avg: 0.4442] [validation loss avg: 0.2485]
[Epoch: 6] [training loss avg: 0.3310] [validation loss avg: 0.1994]
[Epoch: 7] [training loss avg: 0.2529] [validation loss avg: 0.1681]
[Epoch: 8] [training loss avg: 0.2089] [validation loss avg: 0.1376]
[Epoch: 9] [training loss avg: 0.1675] [validation loss avg: 0.1499]
[Epoch: 10] [training loss avg: 0.1477] [validation loss avg: 0.1398]
[Epoch: 11] [training loss avg: 0.1194] [validation loss avg: 0.0995]
[Epoch: 12] [training loss avg: 0.1088] [validation loss avg: 0.1226]
[Epoch: 13] [training loss avg: 0.0829] [validation loss avg: 0.0882]
[Epoch: 1

[I 2024-09-01 03:30:54,088] Trial 24 finished with value: 0.07862667017339152 and parameters: {'lr': 0.0001259236282400165, 'dropout': 0.4397728936880088, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 11 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.7757] [validation loss avg: 2.1762]
[Epoch: 2] [training loss avg: 2.1108] [validation loss avg: 1.6114]
[Epoch: 3] [training loss avg: 1.6718] [validation loss avg: 1.2008]
[Epoch: 4] [training loss avg: 1.3674] [validation loss avg: 0.9458]
[Epoch: 5] [training loss avg: 1.0990] [validation loss avg: 0.7375]
[Epoch: 6] [training loss avg: 0.8982] [validation loss avg: 0.5934]
[Epoch: 7] [training loss avg: 0.7398] [validation loss avg: 0.4820]
[Epoch: 8] [training loss avg: 0.6088] [validation loss avg: 0.3999]
[Epoch: 9] [training loss avg: 0.5094] [validation loss avg: 0.3354]
[Epoch: 10] [training loss avg: 0.4289] [validation loss avg: 0.2878]
[Epoch: 11] [training loss avg: 0.3636] [validation loss avg: 0.2697]
[Epoch: 12] [training loss avg: 0.3098] [validation loss avg: 0.2450]
[Epoch: 13] [training loss avg: 0.2599] [validation loss avg: 0.2295]
[Epoch: 1

[I 2024-09-01 03:49:42,257] Trial 25 finished with value: 0.10823729443585589 and parameters: {'lr': 6.525363494995978e-05, 'dropout': 0.40914996307864854, 'patch_size': 11, 'num_tokens': 10, 'heads': 8}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.3908] [validation loss avg: 0.5014]
[Epoch: 2] [training loss avg: 0.4208] [validation loss avg: 0.2240]
[Epoch: 3] [training loss avg: 0.2195] [validation loss avg: 0.1682]
[Epoch: 4] [training loss avg: 0.1501] [validation loss avg: 0.1455]
[Epoch: 5] [training loss avg: 0.1317] [validation loss avg: 0.1599]
[Epoch: 6] [training loss avg: 0.1071] [validation loss avg: 0.1182]
[Epoch: 7] [training loss avg: 0.0621] [validation loss avg: 0.0827]
[Epoch: 8] [training loss avg: 0.0500] [validation loss avg: 0.1037]
[Epoch: 9] [training loss avg: 0.0389] [validation loss avg: 0.0896]
[Epoch: 10] [training loss avg: 0.0462] [validation loss avg: 0.1159]
[Epoch: 11] [training loss avg: 0.0335] [validation loss avg: 0.1032]
[Epoch: 12] [training loss avg: 0.0266] [validation loss avg: 0.0671]
[Epoch: 13] [training loss avg: 0.0263] [validation loss avg: 0.0707]
[Epoch: 1

[I 2024-09-01 04:08:57,341] Trial 26 finished with value: 0.0670763265640874 and parameters: {'lr': 0.0006098959284469515, 'dropout': 0.3580418758833757, 'patch_size': 15, 'num_tokens': 4, 'heads': 2}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.8305] [validation loss avg: 0.8991]
[Epoch: 2] [training loss avg: 0.6645] [validation loss avg: 0.3673]
[Epoch: 3] [training loss avg: 0.3199] [validation loss avg: 0.3131]
[Epoch: 4] [training loss avg: 0.1911] [validation loss avg: 0.1585]
[Epoch: 5] [training loss avg: 0.1232] [validation loss avg: 0.1045]
[Epoch: 6] [training loss avg: 0.0863] [validation loss avg: 0.1558]
[Epoch: 7] [training loss avg: 0.0674] [validation loss avg: 0.1089]
[Epoch: 8] [training loss avg: 0.0497] [validation loss avg: 0.0806]
[Epoch: 9] [training loss avg: 0.0464] [validation loss avg: 0.2257]
[Epoch: 10] [training loss avg: 0.0348] [validation loss avg: 0.0885]
[Epoch: 11] [training loss avg: 0.0215] [validation loss avg: 0.1198]
[Epoch: 12] [training loss avg: 0.0199] [validation loss avg: 0.0759]
[Epoch: 13] [training loss avg: 0.0157] [validation loss avg: 0.1062]
[Epoch: 1

[I 2024-09-01 04:28:27,040] Trial 27 finished with value: 0.0573258196125694 and parameters: {'lr': 0.0002069452147442219, 'dropout': 0.1005788050257457, 'patch_size': 15, 'num_tokens': 6, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.8502] [validation loss avg: 0.9500]
[Epoch: 2] [training loss avg: 0.7096] [validation loss avg: 0.4326]
[Epoch: 3] [training loss avg: 0.3444] [validation loss avg: 0.2989]
[Epoch: 4] [training loss avg: 0.2130] [validation loss avg: 0.1944]
[Epoch: 5] [training loss avg: 0.1293] [validation loss avg: 0.1370]
[Epoch: 6] [training loss avg: 0.0945] [validation loss avg: 0.2660]
[Epoch: 7] [training loss avg: 0.0754] [validation loss avg: 0.1284]
[Epoch: 8] [training loss avg: 0.0523] [validation loss avg: 0.1682]
[Epoch: 9] [training loss avg: 0.0402] [validation loss avg: 0.1102]
[Epoch: 10] [training loss avg: 0.0265] [validation loss avg: 0.0774]
[Epoch: 11] [training loss avg: 0.0209] [validation loss avg: 0.0873]
[Epoch: 12] [training loss avg: 0.0156] [validation loss avg: 0.0806]
[Epoch: 13] [training loss avg: 0.0133] [validation loss avg: 0.1038]
[Epoch: 1

[I 2024-09-01 04:47:01,439] Trial 28 finished with value: 0.07623453197967527 and parameters: {'lr': 0.00018945411875925786, 'dropout': 0.10111569334983526, 'patch_size': 15, 'num_tokens': 6, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 7 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.5902] [validation loss avg: 1.8990]
[Epoch: 2] [training loss avg: 1.7469] [validation loss avg: 1.3012]
[Epoch: 3] [training loss avg: 1.2664] [validation loss avg: 0.9401]
[Epoch: 4] [training loss avg: 0.9415] [validation loss avg: 0.7224]
[Epoch: 5] [training loss avg: 0.7207] [validation loss avg: 0.5990]
[Epoch: 6] [training loss avg: 0.5509] [validation loss avg: 0.5023]
[Epoch: 7] [training loss avg: 0.4188] [validation loss avg: 0.4509]
[Epoch: 8] [training loss avg: 0.3214] [validation loss avg: 0.4427]
[Epoch: 9] [training loss avg: 0.2420] [validation loss avg: 0.4662]
[Epoch: 10] [training loss avg: 0.1982] [validation loss avg: 0.4072]
[Epoch: 11] [training loss avg: 0.1513] [validation loss avg: 0.4091]
[Epoch: 12] [training loss avg: 0.1218] [validation loss avg: 0.3814]
[Epoch: 13] [training loss avg: 0.0955] [validation loss avg: 0.4207]
[Epoch: 14

[I 2024-09-01 05:04:03,482] Trial 29 finished with value: 0.3813765694697698 and parameters: {'lr': 0.0001106017289703167, 'dropout': 0.26928845971767645, 'patch_size': 7, 'num_tokens': 6, 'heads': 2}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 9 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.3287] [validation loss avg: 1.4141]
[Epoch: 2] [training loss avg: 1.3638] [validation loss avg: 0.7713]
[Epoch: 3] [training loss avg: 0.8738] [validation loss avg: 0.5123]
[Epoch: 4] [training loss avg: 0.6229] [validation loss avg: 0.3954]
[Epoch: 5] [training loss avg: 0.4762] [validation loss avg: 0.2828]
[Epoch: 6] [training loss avg: 0.3595] [validation loss avg: 0.2629]
[Epoch: 7] [training loss avg: 0.2823] [validation loss avg: 0.2553]
[Epoch: 8] [training loss avg: 0.2270] [validation loss avg: 0.3048]
[Epoch: 9] [training loss avg: 0.1741] [validation loss avg: 0.2295]
[Epoch: 10] [training loss avg: 0.1385] [validation loss avg: 0.1966]
[Epoch: 11] [training loss avg: 0.1116] [validation loss avg: 0.2224]
[Epoch: 12] [training loss avg: 0.1021] [validation loss avg: 0.1809]
[Epoch: 13] [training loss avg: 0.0802] [validation loss avg: 0.2312]
[Epoch: 14

[I 2024-09-01 05:22:03,978] Trial 30 finished with value: 0.18089896937211355 and parameters: {'lr': 0.0001910205937113783, 'dropout': 0.4497807200450212, 'patch_size': 9, 'num_tokens': 6, 'heads': 2}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.6301] [validation loss avg: 0.7384]
[Epoch: 2] [training loss avg: 0.5191] [validation loss avg: 0.2957]
[Epoch: 3] [training loss avg: 0.2396] [validation loss avg: 0.1896]
[Epoch: 4] [training loss avg: 0.1463] [validation loss avg: 0.2052]
[Epoch: 5] [training loss avg: 0.1003] [validation loss avg: 0.3556]
[Epoch: 6] [training loss avg: 0.0630] [validation loss avg: 0.1190]
[Epoch: 7] [training loss avg: 0.0487] [validation loss avg: 0.1677]
[Epoch: 8] [training loss avg: 0.0421] [validation loss avg: 0.1499]
[Epoch: 9] [training loss avg: 0.0286] [validation loss avg: 0.0951]
[Epoch: 10] [training loss avg: 0.0318] [validation loss avg: 0.1370]
[Epoch: 11] [training loss avg: 0.0240] [validation loss avg: 0.1603]
[Epoch: 12] [training loss avg: 0.0213] [validation loss avg: 0.1133]
[Epoch: 13] [training loss avg: 0.0169] [validation loss avg: 0.1183]
[Epoch: 1

[I 2024-09-01 05:42:11,756] Trial 31 finished with value: 0.06520425532722757 and parameters: {'lr': 0.00029239638496607366, 'dropout': 0.13294955963831795, 'patch_size': 15, 'num_tokens': 10, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.8646] [validation loss avg: 0.9306]
[Epoch: 2] [training loss avg: 0.7062] [validation loss avg: 0.4249]
[Epoch: 3] [training loss avg: 0.3563] [validation loss avg: 0.2632]
[Epoch: 4] [training loss avg: 0.2115] [validation loss avg: 0.1808]
[Epoch: 5] [training loss avg: 0.1452] [validation loss avg: 0.1268]
[Epoch: 6] [training loss avg: 0.1033] [validation loss avg: 0.1461]
[Epoch: 7] [training loss avg: 0.0901] [validation loss avg: 0.1596]
[Epoch: 8] [training loss avg: 0.0712] [validation loss avg: 0.0964]
[Epoch: 9] [training loss avg: 0.0450] [validation loss avg: 0.1197]
[Epoch: 10] [training loss avg: 0.0485] [validation loss avg: 0.1050]
[Epoch: 11] [training loss avg: 0.0368] [validation loss avg: 0.1015]
[Epoch: 12] [training loss avg: 0.0315] [validation loss avg: 0.0926]
[Epoch: 13] [training loss avg: 0.0301] [validation loss avg: 0.0910]
[Epoch: 1

[I 2024-09-01 06:01:21,072] Trial 32 finished with value: 0.06887818133414146 and parameters: {'lr': 0.00021400553162710892, 'dropout': 0.20702423681853613, 'patch_size': 15, 'num_tokens': 6, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.4509] [validation loss avg: 0.5301]
[Epoch: 2] [training loss avg: 0.4227] [validation loss avg: 0.2413]
[Epoch: 3] [training loss avg: 0.2187] [validation loss avg: 0.1738]
[Epoch: 4] [training loss avg: 0.1447] [validation loss avg: 0.1519]
[Epoch: 5] [training loss avg: 0.0969] [validation loss avg: 0.1922]
[Epoch: 6] [training loss avg: 0.0723] [validation loss avg: 0.1206]
[Epoch: 7] [training loss avg: 0.0563] [validation loss avg: 0.1383]
[Epoch: 8] [training loss avg: 0.0547] [validation loss avg: 0.1136]
[Epoch: 9] [training loss avg: 0.0470] [validation loss avg: 0.1200]
[Epoch: 10] [training loss avg: 0.0333] [validation loss avg: 0.1117]
[Epoch: 11] [training loss avg: 0.0268] [validation loss avg: 0.1043]
[Epoch: 12] [training loss avg: 0.0188] [validation loss avg: 0.1336]
[Epoch: 13] [training loss avg: 0.0224] [validation loss avg: 0.0874]
[Epoch: 1

[I 2024-09-01 06:20:30,294] Trial 33 finished with value: 0.062273999970784644 and parameters: {'lr': 0.0003829043282894163, 'dropout': 0.12717717218706545, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 11 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.4411] [validation loss avg: 1.7204]
[Epoch: 2] [training loss avg: 1.5282] [validation loss avg: 1.1367]
[Epoch: 3] [training loss avg: 1.0805] [validation loss avg: 0.7585]
[Epoch: 4] [training loss avg: 0.7757] [validation loss avg: 0.5455]
[Epoch: 5] [training loss avg: 0.5674] [validation loss avg: 0.4190]
[Epoch: 6] [training loss avg: 0.4263] [validation loss avg: 0.3186]
[Epoch: 7] [training loss avg: 0.3237] [validation loss avg: 0.2586]
[Epoch: 8] [training loss avg: 0.2489] [validation loss avg: 0.2337]
[Epoch: 9] [training loss avg: 0.1934] [validation loss avg: 0.1885]
[Epoch: 10] [training loss avg: 0.1580] [validation loss avg: 0.1752]
[Epoch: 11] [training loss avg: 0.1226] [validation loss avg: 0.1662]
[Epoch: 12] [training loss avg: 0.1080] [validation loss avg: 0.1463]
[Epoch: 13] [training loss avg: 0.0815] [validation loss avg: 0.1452]
[Epoch: 1

[I 2024-09-01 06:38:31,978] Trial 34 finished with value: 0.10242783331445285 and parameters: {'lr': 7.769720370769947e-05, 'dropout': 0.16230535166815857, 'patch_size': 11, 'num_tokens': 12, 'heads': 1}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 9 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.8085] [validation loss avg: 2.3266]
[Epoch: 2] [training loss avg: 2.1894] [validation loss avg: 1.7628]
[Epoch: 3] [training loss avg: 1.7458] [validation loss avg: 1.3654]
[Epoch: 4] [training loss avg: 1.4086] [validation loss avg: 1.0871]
[Epoch: 5] [training loss avg: 1.1347] [validation loss avg: 0.8737]
[Epoch: 6] [training loss avg: 0.9465] [validation loss avg: 0.7245]
[Epoch: 7] [training loss avg: 0.7900] [validation loss avg: 0.6132]
[Epoch: 8] [training loss avg: 0.6561] [validation loss avg: 0.5216]
[Epoch: 9] [training loss avg: 0.5511] [validation loss avg: 0.4656]
[Epoch: 10] [training loss avg: 0.4631] [validation loss avg: 0.4152]
[Epoch: 11] [training loss avg: 0.4036] [validation loss avg: 0.3817]
[Epoch: 12] [training loss avg: 0.3417] [validation loss avg: 0.3394]
[Epoch: 13] [training loss avg: 0.2903] [validation loss avg: 0.3139]
[Epoch: 14

[I 2024-09-01 06:56:32,234] Trial 35 finished with value: 0.2286813638749577 and parameters: {'lr': 4.8725465062010256e-05, 'dropout': 0.20089208135601272, 'patch_size': 9, 'num_tokens': 2, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.0040] [validation loss avg: 1.2323]
[Epoch: 2] [training loss avg: 0.9724] [validation loss avg: 0.5584]
[Epoch: 3] [training loss avg: 0.5235] [validation loss avg: 0.3200]
[Epoch: 4] [training loss avg: 0.3364] [validation loss avg: 0.2559]
[Epoch: 5] [training loss avg: 0.2177] [validation loss avg: 0.1633]
[Epoch: 6] [training loss avg: 0.1581] [validation loss avg: 0.1461]
[Epoch: 7] [training loss avg: 0.1170] [validation loss avg: 0.1171]
[Epoch: 8] [training loss avg: 0.0931] [validation loss avg: 0.1121]
[Epoch: 9] [training loss avg: 0.0733] [validation loss avg: 0.1453]
[Epoch: 10] [training loss avg: 0.0591] [validation loss avg: 0.1310]
[Epoch: 11] [training loss avg: 0.0518] [validation loss avg: 0.0947]
[Epoch: 12] [training loss avg: 0.0384] [validation loss avg: 0.0975]
[Epoch: 13] [training loss avg: 0.0297] [validation loss avg: 0.0751]
[Epoch: 1

[I 2024-09-01 07:17:02,892] Trial 36 finished with value: 0.07511792358543191 and parameters: {'lr': 0.00015649158958092727, 'dropout': 0.23157987400497693, 'patch_size': 15, 'num_tokens': 8, 'heads': 1}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 7 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 3.1516] [validation loss avg: 2.9833]
[Epoch: 2] [training loss avg: 2.9745] [validation loss avg: 2.8142]
[Epoch: 3] [training loss avg: 2.8154] [validation loss avg: 2.6536]
[Epoch: 4] [training loss avg: 2.6751] [validation loss avg: 2.5028]
[Epoch: 5] [training loss avg: 2.5420] [validation loss avg: 2.3561]
[Epoch: 6] [training loss avg: 2.3991] [validation loss avg: 2.2234]
[Epoch: 7] [training loss avg: 2.2883] [validation loss avg: 2.1030]
[Epoch: 8] [training loss avg: 2.1849] [validation loss avg: 2.0007]
[Epoch: 9] [training loss avg: 2.0867] [validation loss avg: 1.9109]
[Epoch: 10] [training loss avg: 2.0100] [validation loss avg: 1.8305]
[Epoch: 11] [training loss avg: 1.9478] [validation loss avg: 1.7535]
[Epoch: 12] [training loss avg: 1.8610] [validation loss avg: 1.6863]
[Epoch: 13] [training loss avg: 1.7908] [validation loss avg: 1.6231]
[Epoch: 14

[I 2024-09-01 07:34:56,801] Trial 37 finished with value: 0.3868661317087355 and parameters: {'lr': 1.0045289037968794e-05, 'dropout': 0.311202716418981, 'patch_size': 7, 'num_tokens': 12, 'heads': 8}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.7073] [validation loss avg: 0.6996]
[Epoch: 2] [training loss avg: 0.5504] [validation loss avg: 0.2986]
[Epoch: 3] [training loss avg: 0.2605] [validation loss avg: 0.2552]
[Epoch: 4] [training loss avg: 0.1697] [validation loss avg: 0.1859]
[Epoch: 5] [training loss avg: 0.1093] [validation loss avg: 0.1157]
[Epoch: 6] [training loss avg: 0.0844] [validation loss avg: 0.1358]
[Epoch: 7] [training loss avg: 0.0678] [validation loss avg: 0.1229]
[Epoch: 8] [training loss avg: 0.0477] [validation loss avg: 0.1022]
[Epoch: 9] [training loss avg: 0.0390] [validation loss avg: 0.0990]
[Epoch: 10] [training loss avg: 0.0253] [validation loss avg: 0.0766]
[Epoch: 11] [training loss avg: 0.0230] [validation loss avg: 0.1243]
[Epoch: 12] [training loss avg: 0.0257] [validation loss avg: 0.1268]
[Epoch: 13] [training loss avg: 0.0288] [validation loss avg: 0.1070]
[Epoch: 1

[I 2024-09-01 07:55:44,369] Trial 38 finished with value: 0.07354241725434327 and parameters: {'lr': 0.0003355188229977712, 'dropout': 0.2758152679300525, 'patch_size': 15, 'num_tokens': 4, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 13 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.1326] [validation loss avg: 1.0522]
[Epoch: 2] [training loss avg: 0.9777] [validation loss avg: 0.4870]
[Epoch: 3] [training loss avg: 0.5221] [validation loss avg: 0.3692]
[Epoch: 4] [training loss avg: 0.3132] [validation loss avg: 0.1867]
[Epoch: 5] [training loss avg: 0.2036] [validation loss avg: 0.1813]
[Epoch: 6] [training loss avg: 0.1548] [validation loss avg: 0.1391]
[Epoch: 7] [training loss avg: 0.1061] [validation loss avg: 0.1310]
[Epoch: 8] [training loss avg: 0.0843] [validation loss avg: 0.0966]
[Epoch: 9] [training loss avg: 0.0632] [validation loss avg: 0.1470]
[Epoch: 10] [training loss avg: 0.0447] [validation loss avg: 0.1340]
[Epoch: 11] [training loss avg: 0.0381] [validation loss avg: 0.1258]
[Epoch: 12] [training loss avg: 0.0339] [validation loss avg: 0.1248]
[Epoch: 13] [training loss avg: 0.0253] [validation loss avg: 0.1104]
[Epoch: 1

[I 2024-09-01 08:15:28,372] Trial 39 finished with value: 0.0966025467607237 and parameters: {'lr': 0.0002361941263457105, 'dropout': 0.4120311543955987, 'patch_size': 13, 'num_tokens': 2, 'heads': 2}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 9 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.4025] [validation loss avg: 0.5669]
[Epoch: 2] [training loss avg: 0.4642] [validation loss avg: 0.4238]
[Epoch: 3] [training loss avg: 0.2574] [validation loss avg: 0.2929]
[Epoch: 4] [training loss avg: 0.1491] [validation loss avg: 0.2677]
[Epoch: 5] [training loss avg: 0.0919] [validation loss avg: 0.2768]
[Epoch: 6] [training loss avg: 0.0701] [validation loss avg: 0.2636]
[Epoch: 7] [training loss avg: 0.0508] [validation loss avg: 0.2340]
[Epoch: 8] [training loss avg: 0.0370] [validation loss avg: 0.1823]
[Epoch: 9] [training loss avg: 0.0332] [validation loss avg: 0.3124]
[Epoch: 10] [training loss avg: 0.0370] [validation loss avg: 0.3024]
[Epoch: 11] [training loss avg: 0.0331] [validation loss avg: 0.2493]
[Epoch: 12] [training loss avg: 0.0231] [validation loss avg: 0.2191]
[Epoch: 13] [training loss avg: 0.0172] [validation loss avg: 0.2406]
[Epoch: 14

[I 2024-09-01 08:34:07,325] Trial 40 finished with value: 0.18232147413350286 and parameters: {'lr': 0.0006198153903162699, 'dropout': 0.11685899350719298, 'patch_size': 9, 'num_tokens': 10, 'heads': 16}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.2826] [validation loss avg: 1.4356]
[Epoch: 2] [training loss avg: 1.3071] [validation loss avg: 0.7543]
[Epoch: 3] [training loss avg: 0.8060] [validation loss avg: 0.4440]
[Epoch: 4] [training loss avg: 0.5384] [validation loss avg: 0.3251]
[Epoch: 5] [training loss avg: 0.3758] [validation loss avg: 0.2179]
[Epoch: 6] [training loss avg: 0.2800] [validation loss avg: 0.2373]
[Epoch: 7] [training loss avg: 0.2028] [validation loss avg: 0.1424]
[Epoch: 8] [training loss avg: 0.1641] [validation loss avg: 0.2842]
[Epoch: 9] [training loss avg: 0.1328] [validation loss avg: 0.1487]
[Epoch: 10] [training loss avg: 0.1048] [validation loss avg: 0.1170]
[Epoch: 11] [training loss avg: 0.0841] [validation loss avg: 0.1040]
[Epoch: 12] [training loss avg: 0.0714] [validation loss avg: 0.1650]
[Epoch: 13] [training loss avg: 0.0614] [validation loss avg: 0.1067]
[Epoch: 1

[I 2024-09-01 08:54:06,766] Trial 41 finished with value: 0.06988504271984788 and parameters: {'lr': 0.00013796847235599053, 'dropout': 0.3585186491991206, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.2461] [validation loss avg: 1.4677]
[Epoch: 2] [training loss avg: 1.1764] [validation loss avg: 0.7559]
[Epoch: 3] [training loss avg: 0.7035] [validation loss avg: 0.4066]
[Epoch: 4] [training loss avg: 0.4421] [validation loss avg: 0.3168]
[Epoch: 5] [training loss avg: 0.3075] [validation loss avg: 0.2046]
[Epoch: 6] [training loss avg: 0.2160] [validation loss avg: 0.1974]
[Epoch: 7] [training loss avg: 0.1642] [validation loss avg: 0.2284]
[Epoch: 8] [training loss avg: 0.1296] [validation loss avg: 0.1678]
[Epoch: 9] [training loss avg: 0.1025] [validation loss avg: 0.1307]
[Epoch: 10] [training loss avg: 0.0809] [validation loss avg: 0.0988]
[Epoch: 11] [training loss avg: 0.0650] [validation loss avg: 0.0886]
[Epoch: 12] [training loss avg: 0.0596] [validation loss avg: 0.0886]
[Epoch: 13] [training loss avg: 0.0506] [validation loss avg: 0.1294]
[Epoch: 1

[I 2024-09-01 09:14:11,040] Trial 42 finished with value: 0.06779212367144369 and parameters: {'lr': 0.0001583156083026963, 'dropout': 0.32389397570997985, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.4194] [validation loss avg: 1.6697]
[Epoch: 2] [training loss avg: 1.4070] [validation loss avg: 0.9548]
[Epoch: 3] [training loss avg: 0.8666] [validation loss avg: 0.5263]
[Epoch: 4] [training loss avg: 0.5345] [validation loss avg: 0.4317]
[Epoch: 5] [training loss avg: 0.3537] [validation loss avg: 0.2479]
[Epoch: 6] [training loss avg: 0.2537] [validation loss avg: 0.2142]
[Epoch: 7] [training loss avg: 0.1943] [validation loss avg: 0.1577]
[Epoch: 8] [training loss avg: 0.1493] [validation loss avg: 0.1363]
[Epoch: 9] [training loss avg: 0.1258] [validation loss avg: 0.1401]
[Epoch: 10] [training loss avg: 0.0953] [validation loss avg: 0.1067]
[Epoch: 11] [training loss avg: 0.0788] [validation loss avg: 0.1133]
[Epoch: 12] [training loss avg: 0.0642] [validation loss avg: 0.1174]
[Epoch: 13] [training loss avg: 0.0540] [validation loss avg: 0.0942]
[Epoch: 1

[I 2024-09-01 09:34:16,702] Trial 43 finished with value: 0.06893139341402621 and parameters: {'lr': 0.00011964805037554106, 'dropout': 0.25087483046016557, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.3195] [validation loss avg: 1.5448]
[Epoch: 2] [training loss avg: 1.3308] [validation loss avg: 0.9446]
[Epoch: 3] [training loss avg: 0.8486] [validation loss avg: 0.6111]
[Epoch: 4] [training loss avg: 0.5628] [validation loss avg: 0.3811]
[Epoch: 5] [training loss avg: 0.3834] [validation loss avg: 0.3058]
[Epoch: 6] [training loss avg: 0.2742] [validation loss avg: 0.2665]
[Epoch: 7] [training loss avg: 0.2047] [validation loss avg: 0.2192]
[Epoch: 8] [training loss avg: 0.1470] [validation loss avg: 0.1421]
[Epoch: 9] [training loss avg: 0.1215] [validation loss avg: 0.2044]
[Epoch: 10] [training loss avg: 0.0918] [validation loss avg: 0.1316]
[Epoch: 11] [training loss avg: 0.0752] [validation loss avg: 0.1000]
[Epoch: 12] [training loss avg: 0.0573] [validation loss avg: 0.2213]
[Epoch: 13] [training loss avg: 0.0465] [validation loss avg: 0.1272]
[Epoch: 1

[I 2024-09-01 09:54:29,733] Trial 44 finished with value: 0.06980475302719112 and parameters: {'lr': 8.58896297919662e-05, 'dropout': 0.15391947695910893, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 11 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.2945] [validation loss avg: 1.4190]
[Epoch: 2] [training loss avg: 1.2900] [validation loss avg: 0.7513]
[Epoch: 3] [training loss avg: 0.7701] [validation loss avg: 0.4374]
[Epoch: 4] [training loss avg: 0.4931] [validation loss avg: 0.2577]
[Epoch: 5] [training loss avg: 0.3483] [validation loss avg: 0.2281]
[Epoch: 6] [training loss avg: 0.2436] [validation loss avg: 0.1957]
[Epoch: 7] [training loss avg: 0.1882] [validation loss avg: 0.1715]
[Epoch: 8] [training loss avg: 0.1463] [validation loss avg: 0.1687]
[Epoch: 9] [training loss avg: 0.1184] [validation loss avg: 0.1515]
[Epoch: 10] [training loss avg: 0.0906] [validation loss avg: 0.1235]
[Epoch: 11] [training loss avg: 0.0818] [validation loss avg: 0.1221]
[Epoch: 12] [training loss avg: 0.0627] [validation loss avg: 0.1240]
[Epoch: 13] [training loss avg: 0.0551] [validation loss avg: 0.1338]
[Epoch: 1

[I 2024-09-01 10:13:24,761] Trial 45 finished with value: 0.11938031102042823 and parameters: {'lr': 0.00016365402396805964, 'dropout': 0.3769712987421313, 'patch_size': 11, 'num_tokens': 12, 'heads': 1}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 1.9848] [validation loss avg: 0.9988]
[Epoch: 2] [training loss avg: 0.7710] [validation loss avg: 0.4032]
[Epoch: 3] [training loss avg: 0.3999] [validation loss avg: 0.2467]
[Epoch: 4] [training loss avg: 0.2464] [validation loss avg: 0.2052]
[Epoch: 5] [training loss avg: 0.1634] [validation loss avg: 0.1480]
[Epoch: 6] [training loss avg: 0.1139] [validation loss avg: 0.1552]
[Epoch: 7] [training loss avg: 0.0933] [validation loss avg: 0.1560]
[Epoch: 8] [training loss avg: 0.0752] [validation loss avg: 0.1054]
[Epoch: 9] [training loss avg: 0.0594] [validation loss avg: 0.1359]
[Epoch: 10] [training loss avg: 0.0439] [validation loss avg: 0.0955]
[Epoch: 11] [training loss avg: 0.0453] [validation loss avg: 0.1295]
[Epoch: 12] [training loss avg: 0.0362] [validation loss avg: 0.1229]
[Epoch: 13] [training loss avg: 0.0316] [validation loss avg: 0.1160]
[Epoch: 1

[I 2024-09-01 10:33:33,634] Trial 46 finished with value: 0.07863371205028324 and parameters: {'lr': 0.0002412771023799147, 'dropout': 0.2846648858220955, 'patch_size': 15, 'num_tokens': 6, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 3.0208] [validation loss avg: 2.6338]
[Epoch: 2] [training loss avg: 2.5502] [validation loss avg: 2.1672]
[Epoch: 3] [training loss avg: 2.1686] [validation loss avg: 1.7610]
[Epoch: 4] [training loss avg: 1.8409] [validation loss avg: 1.4582]
[Epoch: 5] [training loss avg: 1.6001] [validation loss avg: 1.2421]
[Epoch: 6] [training loss avg: 1.4221] [validation loss avg: 1.0610]
[Epoch: 7] [training loss avg: 1.2671] [validation loss avg: 0.9237]
[Epoch: 8] [training loss avg: 1.1210] [validation loss avg: 0.8059]
[Epoch: 9] [training loss avg: 1.0006] [validation loss avg: 0.6907]
[Epoch: 10] [training loss avg: 0.9038] [validation loss avg: 0.6062]
[Epoch: 11] [training loss avg: 0.8009] [validation loss avg: 0.5277]
[Epoch: 12] [training loss avg: 0.7186] [validation loss avg: 0.4744]
[Epoch: 13] [training loss avg: 0.6566] [validation loss avg: 0.4237]
[Epoch: 1

[I 2024-09-01 10:53:47,109] Trial 47 finished with value: 0.06889345143766452 and parameters: {'lr': 2.538602661618843e-05, 'dropout': 0.34764753433553686, 'patch_size': 15, 'num_tokens': 12, 'heads': 4}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.8319] [validation loss avg: 2.2973]
[Epoch: 2] [training loss avg: 2.1039] [validation loss avg: 1.6381]
[Epoch: 3] [training loss avg: 1.6366] [validation loss avg: 1.2648]
[Epoch: 4] [training loss avg: 1.3396] [validation loss avg: 1.0127]
[Epoch: 5] [training loss avg: 1.1241] [validation loss avg: 0.8068]
[Epoch: 6] [training loss avg: 0.9519] [validation loss avg: 0.6659]
[Epoch: 7] [training loss avg: 0.8112] [validation loss avg: 0.5540]
[Epoch: 8] [training loss avg: 0.6943] [validation loss avg: 0.4791]
[Epoch: 9] [training loss avg: 0.6105] [validation loss avg: 0.4003]
[Epoch: 10] [training loss avg: 0.5235] [validation loss avg: 0.3361]
[Epoch: 11] [training loss avg: 0.4509] [validation loss avg: 0.2986]
[Epoch: 12] [training loss avg: 0.3909] [validation loss avg: 0.2685]
[Epoch: 13] [training loss avg: 0.3499] [validation loss avg: 0.2260]
[Epoch: 1

[I 2024-09-01 11:15:10,637] Trial 48 finished with value: 0.07748409324059528 and parameters: {'lr': 3.735961603635548e-05, 'dropout': 0.31040957241360007, 'patch_size': 15, 'num_tokens': 8, 'heads': 2}. Best is trial 22 with value: 0.052084781462326646.



Loading cached data for patch size 13 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.6958] [validation loss avg: 1.9111]
[Epoch: 2] [training loss avg: 1.8115] [validation loss avg: 1.1094]
[Epoch: 3] [training loss avg: 1.2258] [validation loss avg: 0.6761]
[Epoch: 4] [training loss avg: 0.8697] [validation loss avg: 0.4427]
[Epoch: 5] [training loss avg: 0.6364] [validation loss avg: 0.3296]
[Epoch: 6] [training loss avg: 0.4875] [validation loss avg: 0.2456]
[Epoch: 7] [training loss avg: 0.3937] [validation loss avg: 0.2113]
[Epoch: 8] [training loss avg: 0.3047] [validation loss avg: 0.1714]
[Epoch: 9] [training loss avg: 0.2484] [validation loss avg: 0.1565]
[Epoch: 10] [training loss avg: 0.2126] [validation loss avg: 0.1556]
[Epoch: 11] [training loss avg: 0.1765] [validation loss avg: 0.1375]
[Epoch: 12] [training loss avg: 0.1501] [validation loss avg: 0.1267]
[Epoch: 13] [training loss avg: 0.1202] [validation loss avg: 0.1314]
[Epoch: 1

[I 2024-09-01 11:35:35,857] Trial 49 finished with value: 0.11699580724927641 and parameters: {'lr': 0.00010826066946916244, 'dropout': 0.48956532852700513, 'patch_size': 13, 'num_tokens': 2, 'heads': 8}. Best is trial 22 with value: 0.052084781462326646.


Best parameters:  {'lr': 0.00013863489483455858, 'dropout': 0.410867796554047, 'patch_size': 15, 'num_tokens': 8, 'heads': 4}

Loading cached data for patch size 15 and train samples 300...
PCA components passed: 30
[Epoch: 1] [training loss avg: 2.3836] [validation loss avg: 1.4990]
[Epoch: 2] [training loss avg: 1.4025] [validation loss avg: 0.8203]
[Epoch: 3] [training loss avg: 0.8839] [validation loss avg: 0.5208]
[Epoch: 4] [training loss avg: 0.5812] [validation loss avg: 0.3031]
[Epoch: 5] [training loss avg: 0.4008] [validation loss avg: 0.2077]
[Epoch: 6] [training loss avg: 0.2951] [validation loss avg: 0.1654]
[Epoch: 7] [training loss avg: 0.2126] [validation loss avg: 0.1463]
[Epoch: 8] [training loss avg: 0.1783] [validation loss avg: 0.1194]
[Epoch: 9] [training loss avg: 0.1401] [validation loss avg: 0.0942]
[Epoch: 10] [training loss avg: 0.1200] [validation loss avg: 0.1071]
[Epoch: 11] [training loss avg: 0.0943] [validation loss avg: 0.1047]
[Epoch: 12] [training l

In [None]:
# Show Optuna's hyperparameter-importance plot for the finished `study`.
optuna.visualization.plot_param_importances(study)

In [None]:
# Show the objective value of each trial over the course of the `study`.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Show the parallel-coordinate view of the sampled hyperparameters in `study`.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Contour plot of the hyperparameters in `study`; the figure is resized to
# 1250x850 before being displayed.
fig = optuna.visualization.plot_contour(study)
fig.update_layout(width=1250, height=850)
fig.show()

In [None]:
# Slice plot of the hyperparameters in `study`; the figure is resized to
# 1250x650 before being displayed.
fig = optuna.visualization.plot_slice(study)
fig.update_layout(width=1250, height=650)
fig.show()

In [None]:
# Rank plot of the hyperparameters in `study`; the figure is resized to
# 1250x650 before being displayed.
fig = optuna.visualization.plot_rank(study)
fig.update_layout(width=1250, height=650)
fig.show()

## HanChuan


---



In [None]:
# Put the cloned repository on sys.path before the project imports below.
# NOTE(review): presumably data_fetcher, SSFTT and get_cls_map live in this
# repository -- confirm.
main_dir = '/content/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images'
sys.path.append(main_dir)

# Root folder under which loadDataWrapper looks for the per-dataset files.
# It is also appended to sys.path.
data_dir = '/content/Data'
sys.path.append(data_dir)

from data_fetcher import loadData
from SSFTT import SSFTTnet
import get_cls_map

# NOTE(review): presumably the mini-batch size of the training loader; it is
# not used in the part of the cell shown here.
BATCH_SIZE_TRAIN = 64

# Class count for each dataset key.
NUM_CLASSES = {
    'HanChuan': 16,
    'HongHu': 22,
    'LongKou': 9
}

# Short dataset key -> dataset name passed to loadData and used as the folder
# name under data_dir.
dataset_mapping = {
    'HanChuan': 'WHU-Hi-HanChuan',
    'HongHu': 'WHU-Hi-HongHu',
    'LongKou': 'WHU-Hi-LongKou'
}

def loadDataWrapper(dataset, kaggle_json_path, train_samples):
    """Load a dataset's cube and labels plus its predefined train/test masks.

    Args:
        dataset: Short dataset key; must be a key of ``dataset_mapping``
            ('HanChuan', 'HongHu' or 'LongKou').
        kaggle_json_path: Passed through unchanged to ``loadData``.
        train_samples: Suffix that selects the ``Train{n}.mat`` /
            ``Test{n}.mat`` files and the variable stored inside them.

    Returns:
        Tuple ``(data, labels, train_mask, test_mask)``. ``data`` and
        ``labels`` come from ``loadData``; the masks are read from the .mat
        files with ``sio.loadmat``.

    Raises:
        KeyError: If ``dataset`` is not a key of ``dataset_mapping``.
        ValueError: If ``dataset`` is mapped but has no known mask variable
            names (previously this surfaced as an UnboundLocalError only
            after the data had already been loaded).
    """
    # Variable-name prefixes used inside the Train/Test .mat files per dataset.
    mask_key_prefixes = {
        'HongHu': ('HHCYtrain', 'HHCYtest'),
        'HanChuan': ('Train', 'Test'),
        'LongKou': ('LKtrain', 'LKtest'),
    }

    dataset_name = dataset_mapping[dataset]
    if dataset not in mask_key_prefixes:
        # Fail before the (potentially slow) data load instead of at `return`.
        raise ValueError(f"No train/test mask variable names known for dataset '{dataset}'")
    train_prefix, test_prefix = mask_key_prefixes[dataset]

    split_dir = f'{data_dir}/{dataset_name}/Training samples and test samples'
    train_file = f'{split_dir}/Train{train_samples}.mat'
    test_file = f'{split_dir}/Test{train_samples}.mat'

    # Load the cube first: the mask files are read only afterwards, exactly as
    # before, in case loadData is what places the files under data_dir.
    data, labels = loadData(dataset_name, kaggle_json_path)

    train_mask = sio.loadmat(train_file)[f'{train_prefix}{train_samples}']
    test_mask = sio.loadmat(test_file)[f'{test_prefix}{train_samples}']

    return data, labels, train_mask, test_mask

def applyPCA(X_train, X_val, X_test, pca_components=30):
    """Project the last axis of three 4-D patch arrays with one whitened PCA.

    The PCA is fitted on ``X_train`` only; ``X_val`` and ``X_test`` are merely
    transformed with the fitted model.

    Args:
        X_train, X_val, X_test: 4-D arrays whose last axis is the feature axis
            to be reduced.
        pca_components: ``n_components`` handed to ``PCA``.

    Returns:
        ``(train, val, test, pca)``: the three arrays with their first three
        axes unchanged and the last axis replaced by ``pca.n_components_``
        float32 components, plus the fitted ``PCA`` object.
    """
    def _to_rows(cubes):
        # One row per pixel, one column per original feature.
        return np.reshape(cubes, (-1, cubes.shape[-1])).astype(np.float32)

    def _to_cubes(rows, reference, depth):
        # Restore the leading three axes of `reference` around the new depth.
        return np.reshape(
            rows,
            (reference.shape[0], reference.shape[1], reference.shape[2], depth),
        )

    train_rows = _to_rows(X_train)
    val_rows = _to_rows(X_val)
    test_rows = _to_rows(X_test)

    pca = PCA(n_components=pca_components, whiten=True)
    train_rows = pca.fit_transform(train_rows).astype(np.float32)
    val_rows = pca.transform(val_rows).astype(np.float32)
    test_rows = pca.transform(test_rows).astype(np.float32)

    valid_components = pca.n_components_
    print(f"Valid components: {valid_components}")

    return (
        _to_cubes(train_rows, X_train, valid_components),
        _to_cubes(val_rows, X_val, valid_components),
        _to_cubes(test_rows, X_test, valid_components),
        pca,
    )

def padWithZeros(X, margin=2):
    """Return a float32 copy of ``X`` with a zero border on its first two axes.

    Args:
        X: 3-D array; the third axis is left unpadded.
        margin: Number of zero rows/columns added on every side.

    Returns:
        float32 array of shape
        ``(X.shape[0] + 2 * margin, X.shape[1] + 2 * margin, X.shape[2])``
        with ``X`` placed in the centre.
    """
    rows, cols, depth = X.shape[0], X.shape[1], X.shape[2]
    padded = np.zeros((rows + 2 * margin, cols + 2 * margin, depth), dtype=np.float32)
    padded[margin:margin + rows, margin:margin + cols, :] = X.astype(np.float32)
    return padded

def createImageCubesWithMask(X, y, mask, mask_type, windowSize=13):
    """Cut one square patch around every pixel selected by ``mask``.

    Args:
        X: 3-D data array; patches are taken over its first two axes.
        y: 2-D label array indexed with the same (row, col) as ``X``.
        mask: 2-D array; every position with a value >= 1 yields one patch.
        mask_type: Text used only to label the printed diagnostics.
        windowSize: Side length of each patch.

    Returns:
        ``(patchesData, patchesLabels)``: a float32 array of shape
        ``(n_selected, windowSize, windowSize, X.shape[2])`` and a float array
        holding ``y[row, col] - 1`` for each selected position.
    """
    margin = int((windowSize - 1) / 2)
    print(f"Margin: {margin}")
    zeroPaddedX = padWithZeros(X, margin=margin)
    print(f"Padded X shape: {zeroPaddedX.shape}")

    positions = np.argwhere(mask >= 1)
    n_patches = len(positions)
    patchesData = np.zeros((n_patches, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    patchesLabels = np.zeros(n_patches)

    values, occurrences = np.unique(mask, return_counts=True)
    mask_stats = dict(zip(values, occurrences))
    print(f"{mask_type} statistics: {mask_stats}")

    print(f"y shape: {y.shape}, type: {type(y)}")
    print(f"{mask_type} positions shape: {positions.shape}, type: {type(positions)}")
    print("_________________________________________________________________________\n")

    span = 2 * margin + 1
    for idx, (row, col) in enumerate(positions):
        # Pixel (row, col) sits at (row + margin, col + margin) in the padded
        # array, so the window centred on it starts at (row, col) there.
        window = zeroPaddedX[row:row + span, col:col + span, :]
        patchesData[idx, :, :, :] = window.astype(np.float32)
        # Labels are shifted down by one.
        patchesLabels[idx] = y[row, col] - 1

    return patchesData, patchesLabels

def createImageCubes(X, y, windowSize=13, removeZeroLabels=True):
    """Cut one square patch around every pixel of ``X`` in row-major order.

    Args:
        X: 3-D data array; patches are taken over its first two axes.
        y: 2-D label array indexed with the same (row, col) as ``X``.
        windowSize: Side length of each patch.
        removeZeroLabels: When true, patches whose label is not > 0 are
            dropped and the remaining labels are shifted down by one.

    Returns:
        ``(patchesData, patchesLabels)``: float32 patches of shape
        ``(n, windowSize, windowSize, X.shape[2])`` and their float labels.
    """
    margin = int((windowSize - 1) / 2)
    zeroPaddedX = padWithZeros(X, margin=margin)

    height, width = X.shape[0], X.shape[1]
    # NOTE: one patch is allocated for every pixel up front, so memory grows
    # with height * width * windowSize**2 * X.shape[2].
    patchesData = np.zeros((height * width, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    patchesLabels = np.zeros((height * width))

    span = 2 * margin + 1
    for patchIndex, (row, col) in enumerate(np.ndindex(height, width)):
        # In the padded array the window centred on (row, col) starts at
        # (row, col) and covers `span` rows and columns.
        window = zeroPaddedX[row:row + span, col:col + span].astype(np.float32)
        patchesData[patchIndex, :, :, :] = window
        patchesLabels[patchIndex] = y[row, col]

    if removeZeroLabels:
        keep = patchesLabels > 0
        patchesData = patchesData[keep, :, :, :]
        patchesLabels = patchesLabels[keep] - 1

    return patchesData, patchesLabels

def create_data_loader(dataset, kaggle_json_path, patch_size, train_samples, validation_ratio=0.2):
    """Build the train/validation/test/full-scene DataLoaders, with on-disk caching.

    Patches are cut around the pixels selected by the train/test masks, the
    training patches are split (stratified) into train/validation, and PCA is
    fitted on the training split only before being applied to the validation
    set, the test set and the full scene.

    Args:
        dataset: Dataset key forwarded to loadDataWrapper (e.g. 'HanChuan').
        kaggle_json_path: Forwarded to loadDataWrapper.
        patch_size: Spatial window size of each patch.
        train_samples: Forwarded to loadDataWrapper; part of the cache key.
        validation_ratio: Fraction of the training patches held out for
            validation; part of the cache key.

    Returns:
        (train_loader, val_loader, test_loader, all_data_loader, y, pca_components)
        where y is the label map returned by loadDataWrapper and pca_components
        is the number of components kept by the fitted PCA.
    """
    cache_dir = f'preprocessed_patches/{dataset}'
    os.makedirs(cache_dir, exist_ok=True)
    # The split ratio is part of the key: without it, a call with a different
    # validation_ratio would silently get loaders built for another split.
    cache_file = f'{cache_dir}/patch_size_{patch_size}_samples_{train_samples}_val_{validation_ratio}.pkl'

    # If cache file exists, load the data from it
    if os.path.exists(cache_file):
        print(f"\nLoading cached data for patch size {patch_size} and train samples {train_samples}...")
        # NOTE(security): pickle.load can execute arbitrary code. Only load cache
        # files written by this notebook; never point cache_dir at untrusted data.
        with open(cache_file, 'rb') as f:
            data = pickle.load(f)
        return data['train_loader'], data['val_loader'], data['test_loader'], data['all_data_loader'], data['y'], data['pca_components']

    # If cache does not exist, process the data as usual
    X, y, train_mask, test_mask = loadDataWrapper(dataset, kaggle_json_path, train_samples)
    pca_components = 30

    print('Hyperspectral data shape: ', X.shape)
    print('Label shape: ', y.shape)

    print(f'Train mask shape: {train_mask.shape}')
    print(f'Test mask shape: {test_mask.shape}')

    print(f"Patch (window) size: {patch_size}")

    print('\n... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...')
    Xtrain, ytrain = createImageCubesWithMask(X, y, train_mask, mask_type="Training mask", windowSize=patch_size)
    Xtest, ytest = createImageCubesWithMask(X, y, test_mask, mask_type="Testing mask", windowSize=patch_size)
    print('Xtrain shape: ', Xtrain.shape)
    print('Xtest shape: ', Xtest.shape)
    print('ytrain shape: ', ytrain.shape)
    print('ytest shape: ', ytest.shape)

    gc.collect()

    # Stratified split of the training data to ensure balance
    train_indices, val_indices = train_test_split(
        np.arange(len(ytrain)),
        test_size=validation_ratio,
        stratify=ytrain,
        random_state=42
    )

    X_train_split = Xtrain[train_indices]
    y_train_split = ytrain[train_indices].astype(int)
    X_val_split = Xtrain[val_indices]
    y_val_split = ytrain[val_indices].astype(int)

    unique_train, counts_train = np.unique(y_train_split, return_counts=True)
    train_distribution = dict(zip(unique_train, counts_train))
    print("\nTraining set class distribution after split", train_distribution)

    unique_val, counts_val = np.unique(y_val_split, return_counts=True)
    val_distribution = dict(zip(unique_val, counts_val))
    print("\nValidation set class distribution after split:", val_distribution)

    print('\n... ... PCA transformation ... ...')
    X_train_pca, X_val_pca, X_test_pca, pca_model = applyPCA(X_train_split, X_val_split, Xtest, pca_components)
    print('Xtrain shape after PCA: ', X_train_pca.shape)
    print('Xval shape after PCA: ', X_val_pca.shape)
    print('Xtest shape after PCA: ', X_test_pca.shape)

    n_components = pca_model.n_components_

    def to_model_layout(cubes):
        # (N, patch, patch, components) -> (N, 1, components, patch, patch)
        return cubes.reshape(-1, patch_size, patch_size, n_components, 1).transpose(0, 4, 3, 1, 2)

    def make_loader(ds, shuffle):
        return torch.utils.data.DataLoader(dataset=ds, batch_size=BATCH_SIZE_TRAIN, shuffle=shuffle, num_workers=2)

    train_loader = make_loader(TrainDS(to_model_layout(X_train_pca), y_train_split), shuffle=True)
    val_loader = make_loader(TestDS(to_model_layout(X_val_pca), y_val_split), shuffle=False)
    test_loader = make_loader(TestDS(to_model_layout(X_test_pca), ytest), shuffle=False)

    gc.collect()

    # For full scene classification: project every pixel with the PCA fitted on
    # the training split, then cut a patch around each labelled pixel.
    newX = np.reshape(X, (-1, X.shape[-1])).astype(np.float32)
    X_pca_full = pca_model.transform(newX).astype(np.float32)
    X_pca_full = np.reshape(X_pca_full, (X.shape[0], X.shape[1], n_components))
    X_pca_full, y_all = createImageCubes(X_pca_full, y, windowSize=patch_size)
    all_data_loader = make_loader(TestDS(to_model_layout(X_pca_full), y_all), shuffle=False)

    # Save to cache as a dictionary
    with open(cache_file, 'wb') as f:
        pickle.dump({
            'train_loader': train_loader,
            'val_loader': val_loader,
            'test_loader': test_loader,
            'all_data_loader': all_data_loader,
            'y': y,
            'pca_components': n_components,
        }, f)

    return train_loader, val_loader, test_loader, all_data_loader, y, n_components

def train(train_loader, val_loader, num_classes, pca_components, lr, dropout, num_tokens, heads, epochs=100):
    """Train an SSFTTnet on the XLA device and checkpoint the best-validation weights.

    Args:
        train_loader: DataLoader yielding (data, target) training batches.
        val_loader: DataLoader yielding (data, target) validation batches.
        num_classes, pca_components, dropout, num_tokens, heads: Forwarded to
            the SSFTTnet constructor.
        lr: Learning rate for Adam.
        epochs: Number of training epochs.

    Returns:
        (net, device, training_losses, validation_losses). net holds the weights
        as they are after the final epoch; the weights with the lowest average
        validation loss are written to
        '{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}_best.pth'.
        The loss lists contain one per-batch average per epoch.
    """
    device = xm.xla_device()
    print(f"PCA components passed: {pca_components}")
    net = SSFTTnet.SSFTTnet(
        in_channels=1,
        num_classes=num_classes,
        pca_components=pca_components,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    def snapshot_weights():
        # state_dict() hands back references to the live parameter tensors, so
        # keeping it as-is would track every later optimizer step. Detach and
        # clone onto the CPU to freeze the values at this point in training.
        return {name: tensor.detach().cpu().clone() for name, tensor in net.state_dict().items()}

    best_val_loss = float('inf')
    best_state = None

    training_losses = []
    validation_losses = []

    for epoch in range(epochs):
        total_loss = 0
        para_loader = pl.ParallelLoader(train_loader, [device])
        for data, target in para_loader.per_device_loader(device):
            data, target = data.to(device), target.to(device)
            outputs = net(data)
            loss = criterion(outputs, target)
            optimizer.zero_grad()
            loss.backward()
            xm.optimizer_step(optimizer)
            total_loss += loss.item()

        avg_training_loss = total_loss / len(train_loader)
        training_losses.append(avg_training_loss)

        # Validate
        para_loader_val = pl.ParallelLoader(val_loader, [device])
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for data, target in para_loader_val.per_device_loader(device):
                data, target = data.to(device), target.to(device)
                outputs = net(data)
                loss = criterion(outputs, target)
                val_loss += loss.item()

        avg_validation_loss = val_loss / len(val_loader)
        validation_losses.append(avg_validation_loss)

        print('[Epoch: %d] [training loss avg: %.4f] [validation loss avg: %.4f]' %
              (epoch + 1, avg_training_loss, avg_validation_loss))

        # Keep a frozen copy of the weights whenever validation loss improves
        if avg_validation_loss < best_val_loss:
            best_val_loss = avg_validation_loss
            best_state = snapshot_weights()

        net.train()

    if best_state is None:
        # No epoch improved on +inf (e.g. epochs=0 or NaN losses): save the
        # current weights rather than writing None to disk.
        best_state = snapshot_weights()

    torch.save(best_state, f'{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}_best.pth')
    print('Best model saved with validation loss: {:.4f}'.format(best_val_loss))
    print("Finished training")

    return net, device, training_losses, validation_losses


def test(device, net, test_loader):
    """Run net over test_loader and collect predicted and true labels.

    Args:
        device: Device the network lives on; batches are moved to it.
        net: Trained network; switched to eval mode here.
        test_loader: DataLoader yielding (inputs, labels) batches.

    Returns:
        (y_pred_test, y_test): 1-D NumPy arrays of predicted class indices and
        the corresponding labels, in loader order. Two empty lists are
        returned when the loader yields no batches.
    """
    net.eval()
    batch_predictions = []
    batch_labels = []
    para_loader = pl.ParallelLoader(test_loader, [device])

    # Inference only: no_grad stops autograd from recording a graph per batch.
    with torch.no_grad():
        for inputs, labels in para_loader.per_device_loader(device):
            inputs = inputs.to(device)
            predicted = torch.argmax(net(inputs), dim=1)
            batch_predictions.append(predicted.cpu().numpy())
            batch_labels.append(labels.cpu().numpy())

    if not batch_predictions:
        return [], []

    # Concatenate once instead of re-copying the growing array on every batch.
    return np.concatenate(batch_predictions), np.concatenate(batch_labels)

def objective(trial):
    """Optuna objective: train one configuration and return its best validation loss.

    Samples the learning rate (log scale), dropout, patch size, number of
    tokens and number of attention heads from the trial, builds the loaders
    for the sampled patch size, and trains for 100 epochs.

    No intermediate values are reported to the trial, so every trial runs to
    completion regardless of the pruner configured on the study.

    Args:
        trial: The optuna trial used to sample hyperparameters.

    Returns:
        The minimum per-epoch average validation loss seen during training.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    patch_size = trial.suggest_categorical('patch_size', [7,9,11,13,15])
    num_tokens = trial.suggest_categorical('num_tokens', [2, 4, 6, 8, 10, 12])
    heads = trial.suggest_categorical('heads', [1, 2, 4, 8, 16])

    # Only the train/validation loaders and the component count are needed here.
    train_loader, val_loader, _, _, _, pca_components = create_data_loader(
        args.dataset, args.kaggle_json_path, patch_size, train_samples=300
    )

    _, _, _, validation_losses = train(
        train_loader,
        val_loader,
        num_classes=num_classes,
        pca_components=pca_components,
        lr=lr,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads,
        epochs=100
    )

    # Return the minimum validation loss observed during training rather than last one
    # since we are saving the model parameters for that one
    return min(validation_losses)

def AA_andEachClassAccuracy(confusion_matrix):
    """Return per-class accuracy and its mean from a confusion matrix.

    Each class accuracy is the diagonal entry divided by its row sum; a NaN
    produced by an all-zero row is replaced with 0 by nan_to_num.

    Args:
        confusion_matrix: Square matrix with true classes along the rows.

    Returns:
        (each_acc, average_acc): the per-class accuracy array and its mean.
    """
    correct_per_class = np.diag(confusion_matrix)
    samples_per_class = np.sum(confusion_matrix, axis=1)
    ratios = correct_per_class / samples_per_class
    each_acc = np.nan_to_num(ratios)
    return each_acc, each_acc.mean()

def acc_reports(y_test, y_pred_test, dataset):
    """Compute the classification report and summary accuracy metrics.

    Args:
        y_test: True class indices (0-based).
        y_pred_test: Predicted class indices (0-based).
        dataset: 'HanChuan', 'HongHu' or 'LongKou'; selects the class names.

    Returns:
        (classification, oa, confusion, each_acc, aa, kappa): the text report,
        overall accuracy, confusion matrix, per-class accuracies, average
        accuracy and Cohen's kappa. All scalar/array metrics are percentages.

    Raises:
        KeyError: If dataset has no entry in the class-name table.
    """
    target_mapping = {
    'HanChuan': ['Strawberry', 'Cowpea', 'Soybean', 'Sorghum', 'Water spinach', 'Watermelon', 'Greens', 'Trees', 'Grass', 'Red roof',
                 'Gray roof', 'Plastic', 'Bare soil', 'Road', 'Bright object', 'Water'],
    'HongHu': ['Red roof', 'Road', 'Bare soil', 'Cotton', 'Cotton firewood', 'Rape', 'Chinese cabbage', 'Pakchoi', 'Cabbage', 'Tuber mustard',
               'Brassica parachinensis', 'Brassica chinensis', 'Small Brassica chinensis', 'Lactuca sativa', 'Celtuce', 'Film covered lettuce',
               'Romaine lettuce', 'Carrot', 'White radish', 'Garlic sprout', 'Broad bean', 'Tree'],
    'LongKou': ['Corn', 'Cotton', 'Sesame', 'Broad-leaf soybean', 'Narrow-leaf soybean', 'Rice', 'Water', 'Roads and houses', 'Mixed weed']
    }
    target_names = target_mapping[dataset]
    # Pin the label set to every class index. Without it sklearn infers the
    # labels from the data, so a class missing from both y_test and y_pred_test
    # makes classification_report raise (names/labels length mismatch) and
    # shifts the confusion-matrix rows away from target_names.
    class_indices = np.arange(len(target_names))
    classification = classification_report(y_test, y_pred_test, labels=class_indices, digits=4, target_names=target_names)
    oa = accuracy_score(y_test, y_pred_test)
    confusion = confusion_matrix(y_test, y_pred_test, labels=class_indices)
    each_acc, aa = AA_andEachClassAccuracy(confusion)
    kappa = cohen_kappa_score(y_test, y_pred_test)

    return classification, oa*100, confusion, each_acc*100, aa*100, kappa*100

class TestDS():
    """Map-style dataset wrapping evaluation patches and their labels.

    Features are stored as a float tensor and labels as a long tensor, both
    created once at construction time.
    """

    def __init__(self, Xtest, ytest):
        self.len = Xtest.shape[0]
        self.x_data, self.y_data = torch.FloatTensor(Xtest), torch.LongTensor(ytest)

    def __len__(self):
        # Number of samples, taken from the first axis of Xtest.
        return self.len

    def __getitem__(self, index):
        # Return the (features, label) pair stored at index.
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

class TrainDS():
    """Map-style dataset wrapping training patches and their labels.

    Features are stored as a float tensor and labels as a long tensor, both
    created once at construction time.
    """

    def __init__(self, Xtrain, ytrain):
        self.len = Xtrain.shape[0]
        self.x_data, self.y_data = torch.FloatTensor(Xtrain), torch.LongTensor(ytrain)

    def __len__(self):
        # Number of samples, taken from the first axis of Xtrain.
        return self.len

    def __getitem__(self, index):
        # Return the (features, label) pair stored at index.
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

class Args:
    """Plain holder for the run settings: dataset key and Kaggle credentials path."""

    def __init__(self, dataset, kaggle_json_path):
        self.dataset, self.kaggle_json_path = dataset, kaggle_json_path

# Run configuration for this cell: dataset key and the number of classes for it.
args = Args(dataset='HanChuan', kaggle_json_path='~/.kaggle/kaggle.json')
num_classes = NUM_CLASSES[args.dataset]

# Hyperparameter search: 50 trials minimising the value returned by objective().
# NOTE(review): objective() in this cell never calls trial.report() or
# trial.should_prune(), so the MedianPruner appears to have no effect — confirm.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best parameters: ", best_params)

# Rebuild (or load from cache) the loaders for the winning patch size.
train_loader, val_loader, test_loader, all_data_loader, y_all, pca_components = create_data_loader(
    args.dataset, args.kaggle_json_path, patch_size=best_params['patch_size'], train_samples=300
)

# Retrain with the best hyperparameters found, timing the run.
# NOTE: train() returns the network as it is after the final epoch; the
# best-validation weights are saved separately inside train().
tic1 = time.perf_counter()
best_net, device, training_losses, validation_losses = train(train_loader, val_loader, num_classes, pca_components, lr=best_params['lr'], dropout=best_params['dropout'], num_tokens=best_params['num_tokens'], heads=best_params['heads'], epochs=100)
toc1 = time.perf_counter()

# Training/validation loss curves, one point per epoch.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(training_losses) + 1), training_losses, label='Training Loss')
plt.plot(range(1, len(validation_losses) + 1), validation_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.xticks(ticks=range(1, len(training_losses) + 1, 5))
plt.legend()
plt.grid(True)
plt.show()

# Persist the weights of the returned network.
torch.save(best_net.state_dict(), f'{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}.pth')

# Evaluate on the held-out test loader, timing inference.
tic2 = time.perf_counter()
y_pred_test, y_test = test(device, best_net, test_loader)
toc2 = time.perf_counter()

# acc_reports returns its numeric metrics already scaled to percentages.
classification, oa, confusion, each_acc, aa, kappa = acc_reports(y_test, y_pred_test, args.dataset)
print(f"{args.dataset} overall accuracy: {oa}")
classification = str(classification)
Training_Time = toc1 - tic1
Test_time = toc2 - tic2
# Write timings, summary metrics, the per-class report and the confusion matrix to a text file.
report_file_name = f"{main_dir}/SSFTT/cls_results/classification_report_{args.dataset}.txt"
with open(report_file_name, 'w') as x_file:
    x_file.write('{} Training_Time (s)'.format(Training_Time))
    x_file.write('\n')
    x_file.write('{} Test_time (s)'.format(Test_time))
    x_file.write('\n')
    x_file.write('{} Kappa accuracy (%)'.format(kappa))
    x_file.write('\n')
    x_file.write('{} Overall accuracy (%)'.format(oa))
    x_file.write('\n')
    x_file.write('{} Average accuracy (%)'.format(aa))
    x_file.write('\n')
    x_file.write('{} Each accuracy (%)'.format(each_acc))
    x_file.write('\n')
    x_file.write('{}'.format(classification))
    x_file.write('\n')
    x_file.write('{}'.format(confusion))

# Classification map over the full-scene loader (implemented in the get_cls_map module).
get_cls_map.get_cls_map(best_net, device, all_data_loader, y_all, args.dataset, model="SSFTT")

In [None]:
# Optuna's hyperparameter-importance plot for the finished study.
optuna.visualization.plot_param_importances(study)

In [None]:
# Optuna's optimization-history plot for the finished study.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Optuna's parallel-coordinate plot for the finished study.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Optuna's contour plot for the study, enlarged before display.
fig = optuna.visualization.plot_contour(study)
fig.update_layout(width=1250, height=850)
fig.show()

In [None]:
# Optuna's slice plot for the study, enlarged before display.
fig = optuna.visualization.plot_slice(study)
fig.update_layout(width=1250, height=650)
fig.show()

In [None]:
# Optuna's rank plot for the study, enlarged before display.
fig = optuna.visualization.plot_rank(study)
fig.update_layout(width=1250, height=650)
fig.show()

## LongKou


---



In [None]:
# Repository checkout; added to sys.path so the project modules below can be imported.
main_dir = '/content/Spectral-Spatial-Transformers-for-Precise-Crop-Classification-from-UAV-borne-Hyperspectral-Images'
sys.path.append(main_dir)

# Dataset root; loadDataWrapper builds the train/test mask file paths from it.
data_dir = '/content/Data'
sys.path.append(data_dir)

from data_fetcher import loadData
from SSFTT import SSFTTnet
import get_cls_map

# Batch size used by every DataLoader built in create_data_loader.
BATCH_SIZE_TRAIN = 64

# Number of classes for each dataset key.
NUM_CLASSES = {
    'HanChuan': 16,
    'HongHu': 22,
    'LongKou': 9
}

# Dataset key -> dataset folder name under data_dir (see loadDataWrapper).
dataset_mapping = {
    'HanChuan': 'WHU-Hi-HanChuan',
    'HongHu': 'WHU-Hi-HongHu',
    'LongKou': 'WHU-Hi-LongKou'
}

def loadDataWrapper(dataset, kaggle_json_path, train_samples):
    """Load the hyperspectral cube, its labels and the fixed train/test masks.

    Args:
        dataset: 'HanChuan', 'HongHu' or 'LongKou'.
        kaggle_json_path: Forwarded to loadData.
        train_samples: Sample-count suffix of the mask files and of the
            variable names stored inside them (e.g. 300 -> 'Train300.mat').

    Returns:
        (data, labels, train_mask, test_mask): the first two come from
        loadData, the masks are read from the dataset's .mat files.

    Raises:
        KeyError: If dataset is missing from dataset_mapping.
        ValueError: If dataset has no known mask variable names.
    """
    # Prefixes of the variable names stored inside the train/test .mat files.
    mask_key_prefixes = {
        'HongHu': ('HHCYtrain', 'HHCYtest'),
        'HanChuan': ('Train', 'Test'),
        'LongKou': ('LKtrain', 'LKtest'),
    }

    dataset_name = dataset_mapping[dataset]
    if dataset not in mask_key_prefixes:
        # Fail before the expensive data load instead of hitting an unbound
        # train_mask at the return statement.
        raise ValueError(f"No train/test mask keys known for dataset '{dataset}'")

    split_dir = f'{data_dir}/{dataset_name}/Training samples and test samples'
    train_file = f'{split_dir}/Train{train_samples}.mat'
    test_file = f'{split_dir}/Test{train_samples}.mat'

    data, labels = loadData(dataset_name, kaggle_json_path)

    train_prefix, test_prefix = mask_key_prefixes[dataset]
    train_mask = sio.loadmat(train_file)[f'{train_prefix}{train_samples}']
    test_mask = sio.loadmat(test_file)[f'{test_prefix}{train_samples}']

    return data, labels, train_mask, test_mask

def applyPCA(X_train, X_val, X_test, pca_components=30):
    """Fit a whitening PCA on the training patches and project all three sets.

    Each input is flattened along its last axis into rows of band values, the
    PCA is fitted on the training rows only, and every set is reshaped back to
    its original leading three dimensions with the reduced last axis.

    Args:
        X_train, X_val, X_test: 4-D patch arrays whose last axis is reduced.
        pca_components: n_components passed to PCA.

    Returns:
        (X_train_reduced, X_val_reduced, X_test_reduced, pca) where pca is the
        fitted model.
    """
    def as_rows(cubes):
        # Collapse everything but the last axis.
        return np.reshape(cubes, (-1, cubes.shape[-1])).astype(np.float32)

    train_rows = as_rows(X_train)
    val_rows = as_rows(X_val)
    test_rows = as_rows(X_test)

    pca = PCA(n_components=pca_components, whiten=True)
    train_rows = pca.fit_transform(train_rows).astype(np.float32)
    val_rows = pca.transform(val_rows).astype(np.float32)
    test_rows = pca.transform(test_rows).astype(np.float32)

    valid_components = pca.n_components_
    print(f"Valid components: {valid_components}")

    def as_cubes(rows, like):
        # Restore the leading three dimensions of the matching input array.
        return np.reshape(rows, (like.shape[0], like.shape[1], like.shape[2], valid_components))

    return as_cubes(train_rows, X_train), as_cubes(val_rows, X_val), as_cubes(test_rows, X_test), pca

def padWithZeros(X, margin=2):
    """Return a float32 copy of X with a zero border of `margin` cells on its first two axes.

    Args:
        X: 3-D array; the third axis is left unpadded.
        margin: Border width added on every side of the first two axes.

    Returns:
        float32 array of shape (X.shape[0] + 2*margin, X.shape[1] + 2*margin, X.shape[2]).
    """
    rows, cols, depth = X.shape[0], X.shape[1], X.shape[2]
    padded = np.zeros((rows + 2 * margin, cols + 2 * margin, depth), dtype=np.float32)
    # Drop the original data into the centre of the zero canvas.
    interior = (slice(margin, rows + margin), slice(margin, cols + margin), slice(None))
    padded[interior] = X.astype(np.float32)
    return padded

def createImageCubesWithMask(X, y, mask, mask_type, windowSize=13):
    """Cut a windowSize x windowSize patch around every pixel selected by mask.

    A pixel is selected when its mask value is >= 1. The label stored for each
    patch is the value of y at that pixel minus one.

    Args:
        X: Image array indexed as X[row, col, band].
        y: Label map indexed as y[row, col].
        mask: 2-D selection mask.
        mask_type: Text used to prefix the diagnostic printouts.
        windowSize: Side length of each square patch.

    Returns:
        (patches, labels): float32 patches of shape
        (n_selected, windowSize, windowSize, X.shape[2]) and the label vector.
    """
    half = int((windowSize - 1) / 2)
    print(f"Margin: {half}")
    padded = padWithZeros(X, margin=half)
    print(f"Padded X shape: {padded.shape}")

    positions = np.argwhere(mask >= 1)
    n_selected = len(positions)
    cubes = np.zeros((n_selected, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    labels = np.zeros(n_selected)

    values, occurrences = np.unique(mask, return_counts=True)
    mask_stats = dict(zip(values, occurrences))
    print(f"{mask_type} statistics: {mask_stats}")

    print(f"y shape: {y.shape}, type: {type(y)}")
    print(f"{mask_type} positions shape: {positions.shape}, type: {type(positions)}")
    print("_________________________________________________________________________\n")

    # The window centred on (row, col) starts at (row, col) in the padded
    # array and spans 2 * half + 1 cells on each spatial axis.
    span = 2 * half + 1
    for idx, (row, col) in enumerate(positions):
        cubes[idx, :, :, :] = padded[row:row + span, col:col + span, :].astype(np.float32)
        labels[idx] = y[row, col] - 1

    return cubes, labels

def createImageCubes(X, y, windowSize=13, removeZeroLabels=True):
    """Cut one windowSize x windowSize patch around every pixel of X.

    Args:
        X: Image array indexed as X[row, col, band].
        y: Label map indexed as y[row, col].
        windowSize: Side length of each square patch.
        removeZeroLabels: When True, patches whose label is not > 0 are dropped
            and the remaining labels are shifted down by one.

    Returns:
        (patches, labels): float32 patches of shape
        (n, windowSize, windowSize, X.shape[2]) and the matching label vector.
    """
    half = int((windowSize - 1) / 2)
    padded = padWithZeros(X, margin=half)

    pixel_count = X.shape[0] * X.shape[1]
    cubes = np.zeros((pixel_count, windowSize, windowSize, X.shape[2]), dtype=np.float32)
    labels = np.zeros((pixel_count))

    # Work in un-padded image coordinates: the window centred on (row, col)
    # starts at (row, col) in the padded array and spans 2 * half + 1 cells.
    n_rows = padded.shape[0] - 2 * half
    n_cols = padded.shape[1] - 2 * half
    span = 2 * half + 1
    for row in range(n_rows):
        for col in range(n_cols):
            flat = row * n_cols + col
            cubes[flat, :, :, :] = padded[row:row + span, col:col + span].astype(np.float32)
            labels[flat] = y[row, col]

    if removeZeroLabels:
        keep = labels > 0
        cubes = cubes[keep, :, :, :]
        labels = labels[keep]
        labels -= 1

    return cubes, labels

def create_data_loader(dataset, kaggle_json_path, patch_size, train_samples, validation_ratio=0.2):
    """Build the train/validation/test/full-scene DataLoaders, with on-disk caching.

    Patches are cut around the pixels selected by the train/test masks, the
    training patches are split (stratified) into train/validation, and PCA is
    fitted on the training split only before being applied to the validation
    set, the test set and the full scene.

    Args:
        dataset: Dataset key forwarded to loadDataWrapper (e.g. 'LongKou').
        kaggle_json_path: Forwarded to loadDataWrapper.
        patch_size: Spatial window size of each patch.
        train_samples: Forwarded to loadDataWrapper; part of the cache key.
        validation_ratio: Fraction of the training patches held out for
            validation; part of the cache key.

    Returns:
        (train_loader, val_loader, test_loader, all_data_loader, y, pca_components)
        where y is the label map returned by loadDataWrapper and pca_components
        is the number of components kept by the fitted PCA.
    """
    cache_dir = f'preprocessed_patches/{dataset}'
    os.makedirs(cache_dir, exist_ok=True)
    # The split ratio is part of the key: without it, a call with a different
    # validation_ratio would silently get loaders built for another split.
    cache_file = f'{cache_dir}/patch_size_{patch_size}_samples_{train_samples}_val_{validation_ratio}.pkl'

    # If cache file exists, load the data from it
    if os.path.exists(cache_file):
        print(f"\nLoading cached data for patch size {patch_size} and train samples {train_samples}...")
        # NOTE(security): pickle.load can execute arbitrary code. Only load cache
        # files written by this notebook; never point cache_dir at untrusted data.
        with open(cache_file, 'rb') as f:
            data = pickle.load(f)
        return data['train_loader'], data['val_loader'], data['test_loader'], data['all_data_loader'], data['y'], data['pca_components']

    # If cache does not exist, process the data as usual
    X, y, train_mask, test_mask = loadDataWrapper(dataset, kaggle_json_path, train_samples)
    pca_components = 30

    print('Hyperspectral data shape: ', X.shape)
    print('Label shape: ', y.shape)

    print(f'Train mask shape: {train_mask.shape}')
    print(f'Test mask shape: {test_mask.shape}')

    print(f"Patch (window) size: {patch_size}")

    print('\n... ... Create data cubes with masks (Before PCA to avoid data leakage) ... ...')
    Xtrain, ytrain = createImageCubesWithMask(X, y, train_mask, mask_type="Training mask", windowSize=patch_size)
    Xtest, ytest = createImageCubesWithMask(X, y, test_mask, mask_type="Testing mask", windowSize=patch_size)
    print('Xtrain shape: ', Xtrain.shape)
    print('Xtest shape: ', Xtest.shape)
    print('ytrain shape: ', ytrain.shape)
    print('ytest shape: ', ytest.shape)

    gc.collect()

    # Stratified split of the training data to ensure balance
    train_indices, val_indices = train_test_split(
        np.arange(len(ytrain)),
        test_size=validation_ratio,
        stratify=ytrain,
        random_state=42
    )

    X_train_split = Xtrain[train_indices]
    y_train_split = ytrain[train_indices].astype(int)
    X_val_split = Xtrain[val_indices]
    y_val_split = ytrain[val_indices].astype(int)

    unique_train, counts_train = np.unique(y_train_split, return_counts=True)
    train_distribution = dict(zip(unique_train, counts_train))
    print("\nTraining set class distribution after split", train_distribution)

    unique_val, counts_val = np.unique(y_val_split, return_counts=True)
    val_distribution = dict(zip(unique_val, counts_val))
    print("\nValidation set class distribution after split:", val_distribution)

    print('\n... ... PCA transformation ... ...')
    X_train_pca, X_val_pca, X_test_pca, pca_model = applyPCA(X_train_split, X_val_split, Xtest, pca_components)
    print('Xtrain shape after PCA: ', X_train_pca.shape)
    print('Xval shape after PCA: ', X_val_pca.shape)
    print('Xtest shape after PCA: ', X_test_pca.shape)

    n_components = pca_model.n_components_

    def to_model_layout(cubes):
        # (N, patch, patch, components) -> (N, 1, components, patch, patch)
        return cubes.reshape(-1, patch_size, patch_size, n_components, 1).transpose(0, 4, 3, 1, 2)

    def make_loader(ds, shuffle):
        return torch.utils.data.DataLoader(dataset=ds, batch_size=BATCH_SIZE_TRAIN, shuffle=shuffle, num_workers=2)

    train_loader = make_loader(TrainDS(to_model_layout(X_train_pca), y_train_split), shuffle=True)
    val_loader = make_loader(TestDS(to_model_layout(X_val_pca), y_val_split), shuffle=False)
    test_loader = make_loader(TestDS(to_model_layout(X_test_pca), ytest), shuffle=False)

    gc.collect()

    # For full scene classification: project every pixel with the PCA fitted on
    # the training split, then cut a patch around each labelled pixel.
    newX = np.reshape(X, (-1, X.shape[-1])).astype(np.float32)
    X_pca_full = pca_model.transform(newX).astype(np.float32)
    X_pca_full = np.reshape(X_pca_full, (X.shape[0], X.shape[1], n_components))
    X_pca_full, y_all = createImageCubes(X_pca_full, y, windowSize=patch_size)
    all_data_loader = make_loader(TestDS(to_model_layout(X_pca_full), y_all), shuffle=False)

    # Save to cache as a dictionary
    with open(cache_file, 'wb') as f:
        pickle.dump({
            'train_loader': train_loader,
            'val_loader': val_loader,
            'test_loader': test_loader,
            'all_data_loader': all_data_loader,
            'y': y,
            'pca_components': n_components,
        }, f)

    return train_loader, val_loader, test_loader, all_data_loader, y, n_components

def train(train_loader, val_loader, num_classes, pca_components, lr, dropout, num_tokens, heads, epochs=100):
    """Train an SSFTTnet on the XLA device and checkpoint the best-validation weights.

    Args:
        train_loader: DataLoader yielding (data, target) training batches.
        val_loader: DataLoader yielding (data, target) validation batches.
        num_classes, pca_components, dropout, num_tokens, heads: Forwarded to
            the SSFTTnet constructor.
        lr: Learning rate for Adam.
        epochs: Number of training epochs.

    Returns:
        (net, device, training_losses, validation_losses). net holds the weights
        as they are after the final epoch; the weights with the lowest average
        validation loss are written to
        '{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}_best.pth'.
        The loss lists contain one per-batch average per epoch.
    """
    device = xm.xla_device()
    print(f"PCA components passed: {pca_components}")
    net = SSFTTnet.SSFTTnet(
        in_channels=1,
        num_classes=num_classes,
        pca_components=pca_components,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads).to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)

    def snapshot_weights():
        # state_dict() hands back references to the live parameter tensors, so
        # keeping it as-is would track every later optimizer step. Detach and
        # clone onto the CPU to freeze the values at this point in training.
        return {name: tensor.detach().cpu().clone() for name, tensor in net.state_dict().items()}

    best_val_loss = float('inf')
    best_state = None

    training_losses = []
    validation_losses = []

    for epoch in range(epochs):
        total_loss = 0
        para_loader = pl.ParallelLoader(train_loader, [device])
        for data, target in para_loader.per_device_loader(device):
            data, target = data.to(device), target.to(device)
            outputs = net(data)
            loss = criterion(outputs, target)
            optimizer.zero_grad()
            loss.backward()
            xm.optimizer_step(optimizer)
            total_loss += loss.item()

        avg_training_loss = total_loss / len(train_loader)
        training_losses.append(avg_training_loss)

        # Validate
        para_loader_val = pl.ParallelLoader(val_loader, [device])
        net.eval()
        val_loss = 0
        with torch.no_grad():
            for data, target in para_loader_val.per_device_loader(device):
                data, target = data.to(device), target.to(device)
                outputs = net(data)
                loss = criterion(outputs, target)
                val_loss += loss.item()

        avg_validation_loss = val_loss / len(val_loader)
        validation_losses.append(avg_validation_loss)

        print('[Epoch: %d] [training loss avg: %.4f] [validation loss avg: %.4f]' %
              (epoch + 1, avg_training_loss, avg_validation_loss))

        # Keep a frozen copy of the weights whenever validation loss improves
        if avg_validation_loss < best_val_loss:
            best_val_loss = avg_validation_loss
            best_state = snapshot_weights()

        net.train()

    if best_state is None:
        # No epoch improved on +inf (e.g. epochs=0 or NaN losses): save the
        # current weights rather than writing None to disk.
        best_state = snapshot_weights()

    torch.save(best_state, f'{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}_best.pth')
    print('Best model saved with validation loss: {:.4f}'.format(best_val_loss))
    print("Finished training")

    return net, device, training_losses, validation_losses


def test(device, net, test_loader):
    """Run net over test_loader and collect predicted and true labels.

    Args:
        device: Device the network lives on; batches are moved to it.
        net: Trained network; switched to eval mode here.
        test_loader: DataLoader yielding (inputs, labels) batches.

    Returns:
        (y_pred_test, y_test): 1-D NumPy arrays of predicted class indices and
        the corresponding labels, in loader order. Two empty lists are
        returned when the loader yields no batches.
    """
    net.eval()
    batch_predictions = []
    batch_labels = []
    para_loader = pl.ParallelLoader(test_loader, [device])

    # Inference only: no_grad stops autograd from recording a graph per batch.
    with torch.no_grad():
        for inputs, labels in para_loader.per_device_loader(device):
            inputs = inputs.to(device)
            predicted = torch.argmax(net(inputs), dim=1)
            batch_predictions.append(predicted.cpu().numpy())
            batch_labels.append(labels.cpu().numpy())

    if not batch_predictions:
        return [], []

    # Concatenate once instead of re-copying the growing array on every batch.
    return np.concatenate(batch_predictions), np.concatenate(batch_labels)

def objective(trial):
    """Optuna objective: train one configuration and return its best validation loss.

    Samples the learning rate (log scale), dropout, patch size, number of
    tokens and number of attention heads from the trial, builds the loaders
    for the sampled patch size, and trains for 100 epochs.

    No intermediate values are reported to the trial, so every trial runs to
    completion regardless of the pruner configured on the study.

    Args:
        trial: The optuna trial used to sample hyperparameters.

    Returns:
        The minimum per-epoch average validation loss seen during training.
    """
    # suggest_float(..., log=True) is the supported replacement for the
    # deprecated suggest_loguniform and samples from the same distribution.
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    dropout = trial.suggest_float('dropout', 0.1, 0.5)
    patch_size = trial.suggest_categorical('patch_size', [7,9,11,13,15])
    num_tokens = trial.suggest_categorical('num_tokens', [2, 4, 6, 8, 10, 12])
    heads = trial.suggest_categorical('heads', [1, 2, 4, 8, 16])

    # Only the train/validation loaders and the component count are needed here.
    train_loader, val_loader, _, _, _, pca_components = create_data_loader(
        args.dataset, args.kaggle_json_path, patch_size, train_samples=300
    )

    _, _, _, validation_losses = train(
        train_loader,
        val_loader,
        num_classes=num_classes,
        pca_components=pca_components,
        lr=lr,
        dropout=dropout,
        num_tokens=num_tokens,
        heads=heads,
        epochs=100
    )

    # Return the minimum validation loss observed during training rather than last one
    # since we are saving the model parameters for that one
    return min(validation_losses)

def AA_andEachClassAccuracy(confusion_matrix):
    """Return per-class accuracy and its mean from a confusion matrix.

    Each class accuracy is the diagonal entry divided by its row sum; a NaN
    produced by an all-zero row is replaced with 0 by nan_to_num.

    Args:
        confusion_matrix: Square matrix with true classes along the rows.

    Returns:
        (each_acc, average_acc): the per-class accuracy array and its mean.
    """
    correct_per_class = np.diag(confusion_matrix)
    samples_per_class = np.sum(confusion_matrix, axis=1)
    ratios = correct_per_class / samples_per_class
    each_acc = np.nan_to_num(ratios)
    return each_acc, each_acc.mean()

def acc_reports(y_test, y_pred_test, dataset):
    """Compute the classification report and summary accuracy metrics.

    Args:
        y_test: True class indices (0-based).
        y_pred_test: Predicted class indices (0-based).
        dataset: 'HanChuan', 'HongHu' or 'LongKou'; selects the class names.

    Returns:
        (classification, oa, confusion, each_acc, aa, kappa): the text report,
        overall accuracy, confusion matrix, per-class accuracies, average
        accuracy and Cohen's kappa. All scalar/array metrics are percentages.

    Raises:
        KeyError: If dataset has no entry in the class-name table.
    """
    target_mapping = {
    'HanChuan': ['Strawberry', 'Cowpea', 'Soybean', 'Sorghum', 'Water spinach', 'Watermelon', 'Greens', 'Trees', 'Grass', 'Red roof',
                 'Gray roof', 'Plastic', 'Bare soil', 'Road', 'Bright object', 'Water'],
    'HongHu': ['Red roof', 'Road', 'Bare soil', 'Cotton', 'Cotton firewood', 'Rape', 'Chinese cabbage', 'Pakchoi', 'Cabbage', 'Tuber mustard',
               'Brassica parachinensis', 'Brassica chinensis', 'Small Brassica chinensis', 'Lactuca sativa', 'Celtuce', 'Film covered lettuce',
               'Romaine lettuce', 'Carrot', 'White radish', 'Garlic sprout', 'Broad bean', 'Tree'],
    'LongKou': ['Corn', 'Cotton', 'Sesame', 'Broad-leaf soybean', 'Narrow-leaf soybean', 'Rice', 'Water', 'Roads and houses', 'Mixed weed']
    }
    target_names = target_mapping[dataset]
    # Pin the label set to every class index. Without it sklearn infers the
    # labels from the data, so a class missing from both y_test and y_pred_test
    # makes classification_report raise (names/labels length mismatch) and
    # shifts the confusion-matrix rows away from target_names.
    class_indices = np.arange(len(target_names))
    classification = classification_report(y_test, y_pred_test, labels=class_indices, digits=4, target_names=target_names)
    oa = accuracy_score(y_test, y_pred_test)
    confusion = confusion_matrix(y_test, y_pred_test, labels=class_indices)
    each_acc, aa = AA_andEachClassAccuracy(confusion)
    kappa = cohen_kappa_score(y_test, y_pred_test)

    return classification, oa*100, confusion, each_acc*100, aa*100, kappa*100

class TestDS():
    """Map-style dataset wrapping evaluation patches and their labels.

    Features are stored as a float tensor and labels as a long tensor, both
    created once at construction time.
    """

    def __init__(self, Xtest, ytest):
        self.len = Xtest.shape[0]
        self.x_data, self.y_data = torch.FloatTensor(Xtest), torch.LongTensor(ytest)

    def __len__(self):
        # Number of samples, taken from the first axis of Xtest.
        return self.len

    def __getitem__(self, index):
        # Return the (features, label) pair stored at index.
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

class TrainDS():
    """Map-style dataset wrapping training patches and their labels.

    Features are stored as a float tensor and labels as a long tensor, both
    created once at construction time.
    """

    def __init__(self, Xtrain, ytrain):
        self.len = Xtrain.shape[0]
        self.x_data, self.y_data = torch.FloatTensor(Xtrain), torch.LongTensor(ytrain)

    def __len__(self):
        # Number of samples, taken from the first axis of Xtrain.
        return self.len

    def __getitem__(self, index):
        # Return the (features, label) pair stored at index.
        features = self.x_data[index]
        label = self.y_data[index]
        return features, label

class Args:
    """Minimal container for the settings that drive a run.

    Both constructor arguments are stored unchanged as attributes of the
    same name.
    """

    def __init__(self, dataset, kaggle_json_path):
        # kaggle_json_path: location of the Kaggle credentials file.
        # dataset: key naming the dataset to work on.
        self.kaggle_json_path, self.dataset = kaggle_json_path, dataset

import os  # used only to make sure the output folders exist before writing to them

# Run configuration: dataset key and location of the Kaggle credentials file.
args = Args(dataset='LongKou', kaggle_json_path='~/.kaggle/kaggle.json')
num_classes = NUM_CLASSES[args.dataset]

# Output locations. The folders are created up front so that a missing
# directory cannot make the run fail only after the search and training
# have already completed.
params_file_name = f'{main_dir}/SSFTT/cls_params/SSFTTnet_params_{args.dataset}.pth'
report_file_name = f"{main_dir}/SSFTT/cls_results/classification_report_{args.dataset}.txt"
for output_file in (params_file_name, report_file_name):
    os.makedirs(os.path.dirname(output_file), exist_ok=True)

# Hyperparameter search. `objective` is defined elsewhere in the notebook;
# NOTE(review): direction='minimize' presumes it returns a loss-like value — confirm.
study = optuna.create_study(direction='minimize', pruner=optuna.pruners.MedianPruner(n_warmup_steps=10))
study.optimize(objective, n_trials=50)

best_params = study.best_params
print("Best parameters: ", best_params)

# Rebuild the data loaders with the patch size selected by the search.
train_loader, val_loader, test_loader, all_data_loader, y_all, pca_components = create_data_loader(
    args.dataset, args.kaggle_json_path, patch_size=best_params['patch_size'], train_samples=300
)

# Training model using the optimal parameters found
tic1 = time.perf_counter()
best_net, device, training_losses, validation_losses = train(
    train_loader,
    val_loader,
    num_classes,
    pca_components,
    lr=best_params['lr'],
    dropout=best_params['dropout'],
    num_tokens=best_params['num_tokens'],
    heads=best_params['heads'],
    epochs=100,
)
toc1 = time.perf_counter()

# Training/Validation loss plot
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(training_losses) + 1), training_losses, label='Training Loss')
plt.plot(range(1, len(validation_losses) + 1), validation_losses, label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss Curves')
plt.xticks(ticks=range(1, len(training_losses) + 1, 5))
plt.legend()
plt.grid(True)
plt.show()

# Persist the trained weights before evaluation.
torch.save(best_net.state_dict(), params_file_name)

# Timed inference on the held-out test loader.
tic2 = time.perf_counter()
y_pred_test, y_test = test(device, best_net, test_loader)
toc2 = time.perf_counter()

classification, oa, confusion, each_acc, aa, kappa = acc_reports(y_test, y_pred_test, args.dataset)
print(f"{args.dataset} overall accuracy: {oa}")
classification = str(classification)
Training_Time = toc1 - tic1
Test_time = toc2 - tic2

# One entry per report line; joined with '\n' so the file has no trailing newline.
report_lines = [
    '{} Training_Time (s)'.format(Training_Time),
    '{} Test_time (s)'.format(Test_time),
    '{} Kappa accuracy (%)'.format(kappa),
    '{} Overall accuracy (%)'.format(oa),
    '{} Average accuracy (%)'.format(aa),
    '{} Each accuracy (%)'.format(each_acc),
    '{}'.format(classification),
    '{}'.format(confusion),
]
with open(report_file_name, 'w') as x_file:
    x_file.write('\n'.join(report_lines))

# Produce the classification map over the full scene with the trained network.
get_cls_map.get_cls_map(best_net, device, all_data_loader, y_all, args.dataset, model="SSFTT")

In [None]:
# Display Optuna's hyperparameter-importance plot for the finished study.
# The bare call is the cell's last expression, so its result is rendered as output.
optuna.visualization.plot_param_importances(study)

In [None]:
# Display Optuna's optimization-history plot for the finished study.
# The bare call is the cell's last expression, so its result is rendered as output.
optuna.visualization.plot_optimization_history(study)

In [None]:
# Display Optuna's parallel-coordinate plot for the finished study.
# The bare call is the cell's last expression, so its result is rendered as output.
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
# Contour plot of the study, resized to 1250x850 before being shown.
fig = optuna.visualization.plot_contour(study)
fig.update_layout(width=1250, height=850)
fig.show()

In [None]:
# Slice plot of the study, resized to 1250x650 before being shown.
fig = optuna.visualization.plot_slice(study)
fig.update_layout(width=1250, height=650)
fig.show()

In [None]:
# Rank plot of the study, resized to 1250x650 before being shown.
fig = optuna.visualization.plot_rank(study)
fig.update_layout(width=1250, height=650)
fig.show()