In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
import torch
import timm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch import nn, optim
from torch.nn import functional as F
from torchvision import transforms
import matplotlib.pyplot as plt
from math import ceil
from tqdm.notebook import tqdm
import wandb
from torch_lr_finder import LRFinder
from utils import EarlyStopping
from pickle import load
import utils
from sklearn.preprocessing import StandardScaler
import plotly.express as px

In [2]:
from scipy import ndimage

In [3]:
# run = wandb.init(project='HyperView-initial', reinit=True)
wandb.init(mode="disabled") # in case of testing code

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




### Paths definition

In [4]:
train_data_directory = r'../../train_data'
test_data_directory = r'../../test_data'
saved_models_directory = r'../../saved_models'
submissions_directory = r'../../submissions'

### Load mean and std values of training dataset

In [5]:
means_stds_arrays = np.load(os.path.join(
    train_data_directory, 'means_stds_values_training_data.npz'))
means = np.array(means_stds_arrays['means'])
stds = np.array(means_stds_arrays['stds'])

### Load scalers for parameters

In [6]:
parameters_scalers = dict()
parameters_scalers['P'] = load((open(os.path.join(train_data_directory, 'standard_scaler_P.pkl'), 'rb')))
parameters_scalers['K'] = load((open(os.path.join(train_data_directory, 'standard_scaler_K.pkl'), 'rb')))
parameters_scalers['Mg'] = load((open(os.path.join(train_data_directory, 'standard_scaler_Mg.pkl'), 'rb')))
parameters_scalers['pH'] = load((open(os.path.join(train_data_directory, 'standard_scaler_pH.pkl'), 'rb')))

In [7]:
parameters_scalers['P'].mean_

array([70.30265589])

### Dataset definition

In [8]:
class HyperViewDataset(Dataset):
    def __init__(self, gt_file, img_dir, transform=True, means=None, stds=None, train_transforms=False, parameters_scalers=None):
        self.img_dir = img_dir
        if gt_file is not None:
            self.gt = pd.read_csv(gt_file)[['sample_index', 'P', 'K', 'Mg', 'pH']]
        else:
            self.gt = None
        self.transform = transform  # whether to perform transformation of input data
        # whether to perform transformations like on training data
        self.train_transforms = train_transforms

        if self.train_transforms:
            self.training_transforms_composition = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.4), transforms.RandomVerticalFlip(p=0.4)])

        self.means = means  # mean values for every band used to normalize data
        self.stds = stds  # std values for every band used to normalize data

        self.img_files = np.array(
            sorted(
                glob(os.path.join(img_dir, "*.npz")),
                key=lambda x: int(os.path.basename(x).replace(".npz", "")),
            )
        )

        if parameters_scalers is not None:
            self.scaler_P = parameters_scalers['P']
            self.scaler_K = parameters_scalers['K']
            self.scaler_Mg = parameters_scalers['Mg']
            self.scaler_pH = parameters_scalers['pH']
        else:
            self.scaler_P = None
            self.scaler_K = None
            self.scaler_Mg = None
            self.scaler_pH = None

    def pad_to_minimum_size(self, min_size, image):
        # pads numpy array to defined in `min_size` minimum size of array (w, h)
        c, h, w = image.shape
        h_diff = (min_size - h)/2
        w_diff = (min_size - w)/2

        if not h_diff.is_integer():
            h_pad1 = int(h_diff - 0.5)
            h_pad2 = int(h_diff + 0.5)
        else:
            h_pad1 = h_pad2 = int(h_diff)

        if not w_diff.is_integer():
            w_pad1 = int(w_diff - 0.5)
            w_pad2 = int(w_diff + 0.5)
        else:
            w_pad1 = w_pad2 = int(w_diff)

        # check if any padding is bigger than zero
        if h_pad1+h_pad2+w_pad1+h_pad2 == 0:
            return image
        else:
            return transforms.functional.pad(image, (w_pad1, h_pad1, w_pad2, h_pad2), fill=0)

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        # load hyperspectral image to array
        img_arr = np.ma.MaskedArray(**np.load(self.img_files[idx]))
        img_tensor = torch.as_tensor(img_arr.data, dtype=torch.float)
        # Inverting mask is necessary due to masking method in numpy
        img_tensor_mask = ~torch.as_tensor(img_arr.mask)
        img_masked_tensor = torch.mul(img_tensor, img_tensor_mask)

        if self.transform:
            img_masked_tensor_transformed = transforms.functional.normalize(
                img_masked_tensor, mean=self.means.tolist(), std=self.stds.tolist())
            img_masked_tensor_transformed = self.pad_to_minimum_size(
                300, img_masked_tensor_transformed)
            if self.train_transforms:
                img_masked_tensor_transformed = self.training_transforms_composition(
                    img_masked_tensor_transformed)
            img_result_tensor = img_masked_tensor_transformed
        else:
            img_result_tensor = img_masked_tensor
        # load labels
        if self.gt is not None:
            P_gt = self.gt.loc[idx, 'P']
            K_gt = self.gt.loc[idx, 'K']
            Mg_gt = self.gt.loc[idx, 'Mg']
            pH_gt = self.gt.loc[idx, 'pH']
            sample_index = self.gt.loc[idx, 'sample_index']
            
            if self.scaler_P is not None:
                assert self.scaler_K is not None
                assert self.scaler_Mg is not None
                assert self.scaler_pH is not None
                
                P_gt = self.scaler_P.transform(np.array(P_gt).reshape(1, -1))[0][0]
                K_gt = self.scaler_K.transform(np.array(K_gt).reshape(1, -1))[0][0]
                Mg_gt = self.scaler_Mg.transform(np.array(Mg_gt).reshape(1, -1))[0][0]
                pH_gt = self.scaler_pH.transform(np.array(pH_gt).reshape(1, -1))[0][0]
                

            sample = {'image': img_result_tensor, 'P': P_gt,
                      'K': K_gt, 'Mg': Mg_gt, 'pH': pH_gt, 'sample_index': sample_index}
        else:
            sample = {'image': img_result_tensor}

        return sample

In [9]:
BATCH_SIZE = 12

In [10]:
test_dataset = HyperViewDataset(None, os.path.join(
    test_data_directory), True, means, stds, train_transforms=False, parameters_scalers=parameters_scalers)

In [11]:
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, pin_memory=True, drop_last=False, num_workers=4, shuffle=False)

### Model Definition

In [12]:
import torchvision
from torchvision import datasets, models, transforms

In [13]:
class ResNetCNN(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        self.resnet_model = models.resnet50(pretrained=False)

        self.resnet_model.conv1 = nn.Conv2d(150, 64, kernel_size=(
            7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet_model.fc = nn.Linear(
            in_features=2048, out_features=1, bias=True)

    def forward(self, x):
        x = self.resnet_model(x)
#         return {'P': x[0], 'K': x[1], 'Mg': x[2], 'pH': x[3]}
        return x

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [15]:
model = ResNetCNN(pretrained=False).to(device)

### TEST: Loading saved weights of single parameters training

In [17]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_16_05_2022_Single_Parameter_Scaled_PretrainedModel_'

In [18]:
predict_parameter = 'P'

In [19]:
saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_parameter + '.pth')
saved_weights

'../../saved_models/checkpoint_state_dict_ResNet50_16_05_2022_Single_Parameter_Scaled_PretrainedModel_P.pth'

In [20]:
model.load_state_dict(torch.load(saved_weights))

<All keys matched successfully>

In [21]:
output_list = []
bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
for test_data in bar:  # for each test step
    img_data_device = test_data['image'].to(device, dtype=torch.float)

    output = model(img_data_device)
    output_cpu = output.detach().cpu().numpy()
    output_list.append(output_cpu)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [22]:
output_list

[array([[-0.07942121],
        [-0.07610013],
        [-0.09160368],
        [-0.08796418],
        [-0.07087678],
        [-0.0635623 ],
        [-0.05667718],
        [-0.08550307],
        [-0.08735105],
        [-0.0403284 ],
        [-0.0794038 ],
        [-0.09448231]], dtype=float32),
 array([[-0.08732539],
        [-0.08300471],
        [-0.07762612],
        [-0.0381611 ],
        [-0.08889561],
        [-0.08931116],
        [-0.07317815],
        [-0.01739733],
        [-0.09048146],
        [-0.09832066],
        [-0.08889659],
        [-0.06176452]], dtype=float32),
 array([[-0.08799025],
        [-0.09139728],
        [-0.07873696],
        [-0.05848919],
        [-0.03010822],
        [-0.08965241],
        [-0.08536447],
        [-0.09293584],
        [-0.03542522],
        [-0.08352081],
        [-0.09644251],
        [-0.07052288]], dtype=float32),
 array([[-0.08279754],
        [-0.10178246],
        [-0.06757417],
        [-0.05925556],
        [-0.09107632],
      

In [23]:
single_param_temp_df = pd.DataFrame([item for sublist in output_list for item in sublist], columns=[predict_parameter])
single_param_temp_df

Unnamed: 0,P
0,-0.079421
1,-0.076100
2,-0.091604
3,-0.087964
4,-0.070877
...,...
1149,0.002470
1150,-0.002142
1151,-0.088323
1152,-0.051912


### Predict in loop

In [16]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_19_05_2022_Single_Parameter_Scaled_PretrainedModel_AdaptiveLoss_'

In [17]:
predict_parameters = ['P', 'K', 'Mg', 'pH'] # change to dict with: param -> weights_filename

In [18]:
param_df_list_unscaled = []
param_df_list_scaled = []

for predict_param in predict_parameters:
    saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_param + '.pth')
    scaler = getattr(test_dataset, f'scaler_{predict_param}')
    
    model = ResNetCNN(pretrained=False)
    model.to(device)
    model.load_state_dict(torch.load(saved_weights))
    model.eval()
    
    
    single_param_scaled_output_list = []
    single_param_unscaled_output_list = []
    bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
    for test_data in bar:  # for each test step
        img_data_device = test_data['image'].to(device, dtype=torch.float)

        output = model(img_data_device)
        output_cpu = output.detach().cpu().numpy()
        output_unscaled_cpu = scaler.inverse_transform(output_cpu)
        single_param_scaled_output_list.append(output_cpu)
        single_param_unscaled_output_list.append(output_unscaled_cpu)
    
    single_param_scaled_temp_df = pd.DataFrame([item for sublist in single_param_scaled_output_list for item in sublist], columns=[predict_param])
    single_param_unscaled_temp_df = pd.DataFrame([item for sublist in single_param_unscaled_output_list for item in sublist], columns=[predict_param])

    param_df_list_scaled.append(single_param_scaled_temp_df)
    param_df_list_unscaled.append(single_param_unscaled_temp_df)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [19]:
param_df_list_scaled

[             P
 0    -0.156474
 1    -0.156474
 2    -0.156474
 3    -0.156474
 4    -0.156474
 ...        ...
 1149 -0.174306
 1150 -0.175263
 1151 -0.156473
 1152 -0.162766
 1153 -0.156473
 
 [1154 rows x 1 columns],
              K
 0    -0.102215
 1    -0.102215
 2    -0.102215
 3    -0.102215
 4    -0.102215
 ...        ...
 1149 -0.102215
 1150 -0.102215
 1151 -0.102215
 1152 -0.102215
 1153 -0.102215
 
 [1154 rows x 1 columns],
             Mg
 0     0.032029
 1     0.044065
 2     0.042634
 3     0.047944
 4     0.029526
 ...        ...
 1149 -0.155653
 1150 -0.190738
 1151  0.551598
 1152 -0.018784
 1153  0.395221
 
 [1154 rows x 1 columns],
             pH
 0     0.318580
 1     0.318547
 2     0.318541
 3     0.318494
 4     0.318736
 ...        ...
 1149 -0.358029
 1150 -0.425797
 1151 -0.964099
 1152 -0.410841
 1153 -0.479370
 
 [1154 rows x 1 columns]]

In [20]:
param_df_list_unscaled

[              P
 0     65.687256
 1     65.687256
 2     65.687256
 3     65.687256
 4     65.687256
 ...         ...
 1149  65.161293
 1150  65.133049
 1151  65.687279
 1152  65.501671
 1153  65.687302
 
 [1154 rows x 1 columns],
                K
 0     221.664017
 1     221.664017
 2     221.664017
 3     221.664017
 4     221.664017
 ...          ...
 1149  221.664017
 1150  221.664017
 1151  221.664017
 1152  221.664017
 1153  221.664017
 
 [1154 rows x 1 columns],
               Mg
 0     160.557922
 1     161.037689
 2     160.980652
 3     161.192307
 4     160.458160
 ...          ...
 1149  153.076843
 1150  151.678345
 1151  181.268097
 1152  158.532486
 1153  175.034866
 
 [1154 rows x 1 columns],
             pH
 0     6.865621
 1     6.865613
 2     6.865611
 3     6.865599
 4     6.865662
 ...        ...
 1149  6.689552
 1150  6.671917
 1151  6.531838
 1152  6.675809
 1153  6.657976
 
 [1154 rows x 1 columns]]

#### Merge DFs and convert to CSV

In [21]:
params_df = pd.concat(param_df_list_unscaled, axis=1)

In [22]:
params_df

Unnamed: 0,P,K,Mg,pH
0,65.687256,221.664017,160.557922,6.865621
1,65.687256,221.664017,161.037689,6.865613
2,65.687256,221.664017,160.980652,6.865611
3,65.687256,221.664017,161.192307,6.865599
4,65.687256,221.664017,160.458160,6.865662
...,...,...,...,...
1149,65.161293,221.664017,153.076843,6.689552
1150,65.133049,221.664017,151.678345,6.671917
1151,65.687279,221.664017,181.268097,6.531838
1152,65.501671,221.664017,158.532486,6.675809


In [23]:
params_df.to_csv(os.path.join(submissions_directory,
                  "checkpoint_state_dict_ResNet50_19_05_2022_Single_Parameter_Scaled_PretrainedModel_AdaptiveLoss.csv"), index_label="sample_index")