In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
import torch
import timm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch import nn, optim
from torch.nn import functional as F
from torchvision import transforms
import matplotlib.pyplot as plt
from math import ceil
from tqdm.notebook import tqdm
import wandb
from torch_lr_finder import LRFinder
from utils import EarlyStopping
from pickle import load
import utils
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from scipy import ndimage

In [2]:
from models.Hang2020 import Hang2020, spectral_network, spatial_network

In [3]:
# run = wandb.init(project='HyperView-initial', reinit=True)
wandb.init(mode="disabled") # in case of testing code

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




### Paths definition

In [4]:
train_data_directory = r'../../train_data'
test_data_directory = r'../../test_data'
saved_models_directory = r'../../saved_models'
submissions_directory = r'../../submissions'

### Load mean and std values of training dataset

In [5]:
means_stds_arrays = np.load(os.path.join(
    train_data_directory, 'means_stds_values_training_data.npz'))
means = np.array(means_stds_arrays['means'])
stds = np.array(means_stds_arrays['stds'])

### Load scalers for parameters

In [6]:
parameters_scalers = dict()
parameters_scalers['P'] = load((open(os.path.join(train_data_directory, 'standard_scaler_P.pkl'), 'rb')))
parameters_scalers['K'] = load((open(os.path.join(train_data_directory, 'standard_scaler_K.pkl'), 'rb')))
parameters_scalers['Mg'] = load((open(os.path.join(train_data_directory, 'standard_scaler_Mg.pkl'), 'rb')))
parameters_scalers['pH'] = load((open(os.path.join(train_data_directory, 'standard_scaler_pH.pkl'), 'rb')))

In [7]:
parameters_scalers['P'].mean_

array([70.30265589])

### Dataset definition

In [8]:
class HyperViewDataset(Dataset):
    def __init__(self, gt_file, img_dir, transform=True, means=None, stds=None, train_transforms=False, parameters_scalers=None):
        self.img_dir = img_dir
        if gt_file is not None:
            self.gt = pd.read_csv(gt_file)[['sample_index', 'P', 'K', 'Mg', 'pH']]
        else:
            self.gt = None
        self.transform = transform  # whether to perform transformation of input data
        # whether to perform transformations like on training data
        self.train_transforms = train_transforms

        if self.train_transforms:
            self.training_transforms_composition = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.4), transforms.RandomVerticalFlip(p=0.4)])

        self.means = means  # mean values for every band used to normalize data
        self.stds = stds  # std values for every band used to normalize data

        self.img_files = np.array(
            sorted(
                glob(os.path.join(img_dir, "*.npz")),
                key=lambda x: int(os.path.basename(x).replace(".npz", "")),
            )
        )

        if parameters_scalers is not None:
            self.scaler_P = parameters_scalers['P']
            self.scaler_K = parameters_scalers['K']
            self.scaler_Mg = parameters_scalers['Mg']
            self.scaler_pH = parameters_scalers['pH']
        else:
            self.scaler_P = None
            self.scaler_K = None
            self.scaler_Mg = None
            self.scaler_pH = None

    def pad_to_minimum_size(self, min_size, image):
        # pads numpy array to defined in `min_size` minimum size of array (w, h)
        c, h, w = image.shape
        h_diff = (min_size - h)/2
        w_diff = (min_size - w)/2

        if not h_diff.is_integer():
            h_pad1 = int(h_diff - 0.5)
            h_pad2 = int(h_diff + 0.5)
        else:
            h_pad1 = h_pad2 = int(h_diff)

        if not w_diff.is_integer():
            w_pad1 = int(w_diff - 0.5)
            w_pad2 = int(w_diff + 0.5)
        else:
            w_pad1 = w_pad2 = int(w_diff)

        # check if any padding is bigger than zero
        if h_pad1+h_pad2+w_pad1+h_pad2 == 0:
            return image
        else:
            return transforms.functional.pad(image, (w_pad1, h_pad1, w_pad2, h_pad2), fill=0)

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        # load hyperspectral image to array
        img_arr = np.ma.MaskedArray(**np.load(self.img_files[idx]))
        img_tensor = torch.as_tensor(img_arr.data, dtype=torch.float)
        # Inverting mask is necessary due to masking method in numpy
        img_tensor_mask = ~torch.as_tensor(img_arr.mask)
        img_masked_tensor = torch.mul(img_tensor, img_tensor_mask)

        if self.transform:
            img_masked_tensor_transformed = transforms.functional.normalize(
                img_masked_tensor, mean=self.means.tolist(), std=self.stds.tolist())
            img_masked_tensor_transformed = self.pad_to_minimum_size(
                300, img_masked_tensor_transformed)
            if self.train_transforms:
                img_masked_tensor_transformed = self.training_transforms_composition(
                    img_masked_tensor_transformed)
            img_result_tensor = img_masked_tensor_transformed
        else:
            img_result_tensor = img_masked_tensor
        # load labels
        if self.gt is not None:
            P_gt = self.gt.loc[idx, 'P']
            K_gt = self.gt.loc[idx, 'K']
            Mg_gt = self.gt.loc[idx, 'Mg']
            pH_gt = self.gt.loc[idx, 'pH']
            sample_index = self.gt.loc[idx, 'sample_index']
            
            if self.scaler_P is not None:
                assert self.scaler_K is not None
                assert self.scaler_Mg is not None
                assert self.scaler_pH is not None
                
                P_gt = self.scaler_P.transform(np.array(P_gt).reshape(1, -1))[0][0]
                K_gt = self.scaler_K.transform(np.array(K_gt).reshape(1, -1))[0][0]
                Mg_gt = self.scaler_Mg.transform(np.array(Mg_gt).reshape(1, -1))[0][0]
                pH_gt = self.scaler_pH.transform(np.array(pH_gt).reshape(1, -1))[0][0]
                

            sample = {'image': img_result_tensor, 'P': P_gt,
                      'K': K_gt, 'Mg': Mg_gt, 'pH': pH_gt, 'sample_index': sample_index}
        else:
            sample = {'image': img_result_tensor}

        return sample

In [9]:
BATCH_SIZE = 12

In [10]:
test_dataset = HyperViewDataset(None, os.path.join(
    test_data_directory), True, means, stds, train_transforms=False, parameters_scalers=parameters_scalers)

In [11]:
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, pin_memory=True, drop_last=False, num_workers=4, shuffle=False)

### Model Definition

In [12]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [13]:
model = Hang2020(bands=150, classes=1).to(device)

### TEST: Loading saved weights of single parameters training

In [14]:
weights_filename_base = 'checkpoint_state_dict_Hang2020_22_05_2022_Single_Parameter_Scaled__'

In [15]:
predict_parameter = 'P'

In [16]:
saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_parameter + '.pth')
saved_weights

'../../saved_models/checkpoint_state_dict_Hang2020_22_05_2022_Single_Parameter_Scaled__P.pth'

In [17]:
model.load_state_dict(torch.load(saved_weights))

<All keys matched successfully>

In [18]:
output_list = []
bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
for test_data in bar:  # for each test step
    img_data_device = test_data['image'].to(device, dtype=torch.float)

    output = model(img_data_device)
    output_cpu = output.detach().cpu().numpy()
    output_list.append(output_cpu)

test data:   0%|          | 0/97 [00:00<?, ?it/s]



In [19]:
output_list

[array([[-0.0698475 ],
        [-0.20007445],
        [-0.04594931],
        [-0.08181936],
        [-0.30630815],
        [-0.22588164],
        [-0.20057458],
        [ 0.07253776],
        [-0.46689594],
        [-0.24954824],
        [-0.226569  ],
        [-0.1377998 ]], dtype=float32),
 array([[-0.15758865],
        [-0.34280503],
        [-0.19356109],
        [-0.1573303 ],
        [-0.03501627],
        [-0.11360831],
        [-0.4231033 ],
        [-0.20765315],
        [-0.11382517],
        [-0.09884457],
        [-0.10772184],
        [-0.18870284]], dtype=float32),
 array([[-0.47194794],
        [-0.5236665 ],
        [-0.12871604],
        [-0.21469519],
        [-0.01996941],
        [-0.094427  ],
        [-0.26324928],
        [-0.4467948 ],
        [-0.1905087 ],
        [-0.05906172],
        [-0.0439859 ],
        [-0.15195438]], dtype=float32),
 array([[ 0.17605539],
        [-0.17444198],
        [-0.15766034],
        [-0.20900235],
        [-0.3558914 ],
      

In [20]:
single_param_temp_df = pd.DataFrame([item for sublist in output_list for item in sublist], columns=[predict_parameter])
single_param_temp_df

Unnamed: 0,P
0,-0.069848
1,-0.200074
2,-0.045949
3,-0.081819
4,-0.306308
...,...
1149,-7.703190
1150,-6.953344
1151,-2.209797
1152,-5.833663


### Predict in loop

In [23]:
weights_filename_base = 'checkpoint_state_dict_Hang2020_22_05_2022_Single_Parameter_Scaled__'

In [24]:
predict_parameters = ['P', 'K', 'Mg', 'pH'] # change to dict with: param -> weights_filename

In [26]:
param_df_list_unscaled = []
param_df_list_scaled = []

for predict_param in predict_parameters:
    saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_param + '.pth')
    scaler = getattr(test_dataset, f'scaler_{predict_param}')
    
    model = Hang2020(bands=150, classes=1).to(device)
    model.load_state_dict(torch.load(saved_weights))
    model.eval()
    
    
    single_param_scaled_output_list = []
    single_param_unscaled_output_list = []
    bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
    for test_data in bar:  # for each test step
        img_data_device = test_data['image'].to(device, dtype=torch.float)

        output = model(img_data_device)
        output_cpu = output.detach().cpu().numpy()
        output_unscaled_cpu = scaler.inverse_transform(output_cpu)
        single_param_scaled_output_list.append(output_cpu)
        single_param_unscaled_output_list.append(output_unscaled_cpu)
    
    single_param_scaled_temp_df = pd.DataFrame([item for sublist in single_param_scaled_output_list for item in sublist], columns=[predict_param])
    single_param_unscaled_temp_df = pd.DataFrame([item for sublist in single_param_unscaled_output_list for item in sublist], columns=[predict_param])

    param_df_list_scaled.append(single_param_scaled_temp_df)
    param_df_list_unscaled.append(single_param_unscaled_temp_df)

test data:   0%|          | 0/97 [00:00<?, ?it/s]



test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [27]:
param_df_list_scaled

[             P
 0     0.052434
 1     0.036308
 2     0.058605
 3     0.054545
 4     0.008949
 ...        ...
 1149 -0.393884
 1150 -0.276559
 1151 -0.467401
 1152 -0.340061
 1153 -0.230357
 
 [1154 rows x 1 columns],
              K
 0    -0.010171
 1     0.073452
 2    -0.014445
 3    -0.003352
 4    -0.013944
 ...        ...
 1149 -1.451440
 1150 -1.458352
 1151  0.042335
 1152 -0.931045
 1153  0.175643
 
 [1154 rows x 1 columns],
             Mg
 0     0.019945
 1     0.051004
 2     0.017754
 3     0.036723
 4     0.020117
 ...        ...
 1149  0.237638
 1150 -0.045160
 1151  0.231424
 1152  0.098109
 1153  0.138830
 
 [1154 rows x 1 columns],
             pH
 0     0.367946
 1     0.368333
 2     0.367577
 3     0.367878
 4     0.479848
 ...        ...
 1149 -0.508581
 1150 -0.425118
 1151 -1.811569
 1152 -0.593585
 1153 -0.700174
 
 [1154 rows x 1 columns]]

In [28]:
param_df_list_unscaled

[              P
 0     71.849251
 1     71.373604
 2     72.031296
 3     71.911537
 4     70.566612
 ...         ...
 1149  58.684547
 1150  62.145214
 1151  56.516075
 1152  60.272141
 1153  63.507980
 
 [1154 rows x 1 columns],
                K
 0     227.359177
 1     232.533264
 2     227.094742
 3     227.781113
 4     227.125732
 ...          ...
 1149  138.181961
 1150  137.754257
 1151  230.607941
 1152  170.380905
 1153  238.856262
 
 [1154 rows x 1 columns],
               Mg
 0     160.076233
 1     161.314270
 2     159.988907
 3     160.745041
 4     160.083099
 ...          ...
 1149  168.753586
 1150  157.481140
 1151  168.505859
 1152  163.191910
 1153  164.815048
 
 [1154 rows x 1 columns],
             pH
 0     6.878468
 1     6.878568
 2     6.878371
 3     6.878450
 4     6.907587
 ...        ...
 1149  6.650375
 1150  6.672094
 1151  6.311307
 1152  6.628255
 1153  6.600518
 
 [1154 rows x 1 columns]]

#### Merge DFs and convert to CSV

In [29]:
params_df = pd.concat(param_df_list_unscaled, axis=1)

In [30]:
params_df

Unnamed: 0,P,K,Mg,pH
0,71.849251,227.359177,160.076233,6.878468
1,71.373604,232.533264,161.314270,6.878568
2,72.031296,227.094742,159.988907,6.878371
3,71.911537,227.781113,160.745041,6.878450
4,70.566612,227.125732,160.083099,6.907587
...,...,...,...,...
1149,58.684547,138.181961,168.753586,6.650375
1150,62.145214,137.754257,157.481140,6.672094
1151,56.516075,230.607941,168.505859,6.311307
1152,60.272141,170.380905,163.191910,6.628255


In [31]:
params_df.to_csv(os.path.join(submissions_directory,
                  "checkpoint_state_dict_Hang2020_22_05_2022_Single_Parameter_Scaled.csv"), index_label="sample_index")