In [21]:
import os
from glob import glob
import random
import pandas as pd
import numpy as np
import torch
import timm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch import nn, optim
from torch.nn import functional as F
from torchvision import transforms
import matplotlib.pyplot as plt
from math import ceil
from tqdm.notebook import tqdm
import wandb
from torch_lr_finder import LRFinder
from utils import EarlyStopping
from pickle import load
import utils
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from albumentations.augmentations.crops.functional import random_crop as functional_random_crop

In [2]:
from scipy import ndimage

In [3]:
# run = wandb.init(project='HyperView-initial', reinit=True)
wandb.init(mode="disabled") # in case of testing code

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




### Paths definition

In [4]:
train_data_directory = r'../../train_data'
test_data_directory = r'../../test_data'
saved_models_directory = r'../../saved_models'
submissions_directory = r'../../submissions'

### Load mean and std values of training dataset

In [5]:
means_stds_arrays = np.load(os.path.join(
    train_data_directory, 'means_stds_values_training_data.npz'))
means = np.array(means_stds_arrays['means'])
stds = np.array(means_stds_arrays['stds'])

### Load scalers for parameters

In [6]:
parameters_scalers = dict()
parameters_scalers['P'] = load((open(os.path.join(train_data_directory, 'standard_scaler_P.pkl'), 'rb')))
parameters_scalers['K'] = load((open(os.path.join(train_data_directory, 'standard_scaler_K.pkl'), 'rb')))
parameters_scalers['Mg'] = load((open(os.path.join(train_data_directory, 'standard_scaler_Mg.pkl'), 'rb')))
parameters_scalers['pH'] = load((open(os.path.join(train_data_directory, 'standard_scaler_pH.pkl'), 'rb')))

In [7]:
parameters_scalers['P'].mean_

array([70.30265589])

### Dataset definition

In [22]:
class HyperViewDatasetRandomCrop(Dataset):
    def __init__(self, gt_file, img_dir, normalize=True, means=None, stds=None, train_transforms=False, parameters_scalers=None, crop_size=(11,11)):
        self.img_dir = img_dir
        if gt_file is not None:
            self.gt = pd.read_csv(gt_file)[['sample_index', 'P', 'K', 'Mg', 'pH']]
        else:
            self.gt = None
        self.normalize = normalize  # whether to perform transformation of input data
        # whether to perform transformations like on training data
        self.train_transforms = train_transforms

        if self.train_transforms:
            self.training_transforms_composition = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.4), transforms.RandomVerticalFlip(p=0.4)])

        self.means = means  # mean values for every band used to normalize data
        self.stds = stds  # std values for every band used to normalize data

        self.img_files = np.array(
            sorted(
                glob(os.path.join(img_dir, "*.npz")),
                key=lambda x: int(os.path.basename(x).replace(".npz", "")),
            )
        )

        if parameters_scalers is not None:
            self.scaler_P = parameters_scalers['P']
            self.scaler_K = parameters_scalers['K']
            self.scaler_Mg = parameters_scalers['Mg']
            self.scaler_pH = parameters_scalers['pH']
        else:
            self.scaler_P = None
            self.scaler_K = None
            self.scaler_Mg = None
            self.scaler_pH = None
        

        self.h_crop = crop_size[0]
        self.w_crop = crop_size[1]

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        # load hyperspectral image to array
        img_arr = np.ma.MaskedArray(**np.load(self.img_files[idx]))
        img_tensor = torch.as_tensor(img_arr.data, dtype=torch.float)
        # Inverting mask is necessary due to masking method in numpy
        img_tensor_mask = ~torch.as_tensor(img_arr.mask)
        img_masked_tensor = torch.mul(img_tensor, img_tensor_mask)
        
        # Crop randomly from image with given sizes
        not_sufficient_pixels = True
        while not_sufficient_pixels:
            img_masked_array = img_masked_tensor.cpu().numpy()
            img_masked_array_cropped = np.transpose(functional_random_crop(np.transpose(img_masked_array, (1,2,0)), self.h_crop, self.w_crop, 
                                                           random.random(), random.random()), (2,0,1))
            # Test if number of "informative" (nonzero) pixels is sufficient.
            # 60 is chosen as minimum number of nonzero-pixels in dataset is 73 and sometimes mask may be on border with zero pixels
            if np.count_nonzero(img_masked_array_cropped[0]) > 60:
                not_sufficient_pixels = False
        img_masked_tensor = torch.as_tensor(img_masked_array_cropped, dtype=torch.float)

        if self.normalize:
            img_masked_tensor_transformed = transforms.functional.normalize(
                img_masked_tensor, mean=self.means.tolist(), std=self.stds.tolist())
            if self.train_transforms:
                img_masked_tensor_transformed = self.training_transforms_composition(
                    img_masked_tensor_transformed)
            img_result_tensor = img_masked_tensor_transformed
        else:
            img_result_tensor = img_masked_tensor
        
        # load labels
        if self.gt is not None:
            P_gt = self.gt.loc[idx, 'P']
            K_gt = self.gt.loc[idx, 'K']
            Mg_gt = self.gt.loc[idx, 'Mg']
            pH_gt = self.gt.loc[idx, 'pH']
            sample_index = self.gt.loc[idx, 'sample_index']
            
            if self.scaler_P is not None:
                assert self.scaler_K is not None
                assert self.scaler_Mg is not None
                assert self.scaler_pH is not None
                
                P_gt = self.scaler_P.transform(np.array(P_gt).reshape(1, -1))[0][0]
                K_gt = self.scaler_K.transform(np.array(K_gt).reshape(1, -1))[0][0]
                Mg_gt = self.scaler_Mg.transform(np.array(Mg_gt).reshape(1, -1))[0][0]
                pH_gt = self.scaler_pH.transform(np.array(pH_gt).reshape(1, -1))[0][0]
                

            sample = {'image': img_result_tensor, 'P': P_gt,
                      'K': K_gt, 'Mg': Mg_gt, 'pH': pH_gt, 'sample_index': sample_index}
        else:
            sample = {'image': img_result_tensor}

        return sample

In [23]:
BATCH_SIZE = 12

In [24]:
test_dataset = HyperViewDatasetRandomCrop(None, os.path.join(
    test_data_directory), normalize=True, means=means, stds=stds, train_transforms=False, parameters_scalers=parameters_scalers, crop_size=(11,11))

In [25]:
test_dataloader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, pin_memory=True, drop_last=False, num_workers=4, shuffle=False)

### Model Definition

In [12]:
import torchvision
from torchvision import datasets, models, transforms

In [13]:
class ResNetCNN(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        self.resnet_model = models.resnet50(pretrained=False)

        self.resnet_model.conv1 = nn.Conv2d(150, 64, kernel_size=(
            7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet_model.fc = nn.Linear(
            in_features=2048, out_features=1, bias=True)

    def forward(self, x):
        x = self.resnet_model(x)
#         return {'P': x[0], 'K': x[1], 'Mg': x[2], 'pH': x[3]}
        return x

In [14]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [15]:
model = ResNetCNN(pretrained=False)
model.to(device)

ResNetCNN(
  (resnet_model): ResNet(
    (conv1): Conv2d(150, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
 

### TEST: Loading saved weights of single parameters training

In [16]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_15_05_2022_Single_Parameter_Scaled_RandomCrop_'

In [17]:
predict_parameter = 'K'

In [18]:
saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_parameter + '.pth')
saved_weights

'../../saved_models/checkpoint_state_dict_ResNet50_15_05_2022_Single_Parameter_Scaled_RandomCrop_K.pth'

In [19]:
model.load_state_dict(torch.load(saved_weights))

<All keys matched successfully>

In [26]:
output_list = []
bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
for test_data in bar:  # for each test step
    img_data_device = test_data['image'].to(device, dtype=torch.float)

    output = model(img_data_device)
    output_cpu = output.detach().cpu().numpy()
    output_list.append(output_cpu)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [27]:
output_list

[array([[ 0.06606149],
        [-0.02244276],
        [ 0.13453151],
        [ 0.0332446 ],
        [-0.03166998],
        [ 0.04276955],
        [ 0.16000548],
        [-0.5832029 ],
        [-0.09626211],
        [ 0.16405378],
        [-0.04109843],
        [ 0.19714142]], dtype=float32),
 array([[ 0.2020391 ],
        [-0.07624876],
        [ 0.10207331],
        [ 0.14956367],
        [-0.08818438],
        [-0.5723578 ],
        [-0.02428131],
        [ 0.24660951],
        [ 0.0708935 ],
        [-0.05119639],
        [ 0.04253118],
        [-0.04511033]], dtype=float32),
 array([[-0.01401436],
        [-0.09043369],
        [ 0.16584738],
        [ 0.0103658 ],
        [ 0.10816946],
        [-0.01870666],
        [-0.02661506],
        [-0.07438101],
        [ 0.15099502],
        [-0.58291626],
        [ 0.21326983],
        [ 0.16948469]], dtype=float32),
 array([[-0.5830487 ],
        [ 0.04804127],
        [ 0.04609092],
        [ 0.10849857],
        [-0.08835766],
      

In [28]:
single_param_temp_df = pd.DataFrame([item for sublist in output_list for item in sublist], columns=[predict_parameter])
single_param_temp_df

Unnamed: 0,K
0,0.066061
1,-0.022443
2,0.134532
3,0.033245
4,-0.031670
...,...
1149,0.262111
1150,-0.583105
1151,0.218737
1152,-1.711966


### Predict in loop

In [29]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_15_05_2022_Single_Parameter_Scaled_RandomCrop_'

In [30]:
predict_parameters = ['P', 'K', 'Mg', 'pH'] # change to dict with: param -> weights_filename

In [31]:
param_df_list_unscaled = []
param_df_list_scaled = []

for predict_param in predict_parameters:
    saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_param + '.pth')
    scaler = getattr(test_dataset, f'scaler_{predict_param}')
    
    model = ResNetCNN(pretrained=False)
    model.to(device)
    model.load_state_dict(torch.load(saved_weights))
    model.eval()
    
    
    single_param_scaled_output_list = []
    single_param_unscaled_output_list = []
    bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
    for test_data in bar:  # for each test step
        img_data_device = test_data['image'].to(device, dtype=torch.float)

        output = model(img_data_device)
        output_cpu = output.detach().cpu().numpy()
        output_unscaled_cpu = scaler.inverse_transform(output_cpu)
        single_param_scaled_output_list.append(output_cpu)
        single_param_unscaled_output_list.append(output_unscaled_cpu)
    
    single_param_scaled_temp_df = pd.DataFrame([item for sublist in single_param_scaled_output_list for item in sublist], columns=[predict_param])
    single_param_unscaled_temp_df = pd.DataFrame([item for sublist in single_param_unscaled_output_list for item in sublist], columns=[predict_param])

    param_df_list_scaled.append(single_param_scaled_temp_df)
    param_df_list_unscaled.append(single_param_unscaled_temp_df)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [32]:
param_df_list_scaled

[             P
 0     0.254697
 1     0.260561
 2     0.276851
 3     0.071856
 4     0.290923
 ...        ...
 1149  0.300228
 1150  0.260683
 1151  0.262912
 1152  0.266012
 1153  0.272598
 
 [1154 rows x 1 columns],
              K
 0     0.044334
 1    -0.049303
 2     0.004576
 3    -1.148717
 4    -0.072239
 ...        ...
 1149  0.069958
 1150  0.085443
 1151  0.083396
 1152  0.081881
 1153  0.090823
 
 [1154 rows x 1 columns],
             Mg
 0    -0.003340
 1    -0.018709
 2    -0.248105
 3    -0.091929
 4     0.022274
 ...        ...
 1149 -0.171816
 1150 -0.110334
 1151  0.117606
 1152  0.030987
 1153 -0.189095
 
 [1154 rows x 1 columns],
             pH
 0     0.313465
 1     0.334991
 2     0.330776
 3     0.324682
 4     0.321841
 ...        ...
 1149 -0.090091
 1150 -0.195444
 1151 -0.362510
 1152 -0.165528
 1153 -0.191914
 
 [1154 rows x 1 columns]]

In [33]:
param_df_list_unscaled

[              P
 0     77.815277
 1     77.988235
 2     78.468742
 3     72.422142
 4     78.883797
 ...         ...
 1149  79.158257
 1150  77.991844
 1151  78.057587
 1152  78.149025
 1153  78.343262
 
 [1154 rows x 1 columns],
                K
 0     230.731644
 1     224.937927
 2     228.271652
 3     156.912674
 4     223.518768
 ...          ...
 1149  232.317093
 1150  233.275238
 1151  233.148560
 1152  233.054825
 1153  233.608078
 
 [1154 rows x 1 columns],
               Mg
 0     159.148087
 1     158.535477
 2     149.391708
 3     155.616898
 4     160.169098
 ...          ...
 1149  152.432617
 1150  154.883286
 1151  163.969055
 1152  160.516373
 1153  151.743835
 
 [1154 rows x 1 columns],
             pH
 0     6.864290
 1     6.869892
 2     6.868795
 3     6.867209
 4     6.866470
 ...        ...
 1149  6.759275
 1150  6.731860
 1151  6.688386
 1152  6.739645
 1153  6.732779
 
 [1154 rows x 1 columns]]

#### Merge DFs and convert to CSV

In [34]:
params_df = pd.concat(param_df_list_unscaled, axis=1)

In [35]:
params_df

Unnamed: 0,P,K,Mg,pH
0,77.815277,230.731644,159.148087,6.864290
1,77.988235,224.937927,158.535477,6.869892
2,78.468742,228.271652,149.391708,6.868795
3,72.422142,156.912674,155.616898,6.867209
4,78.883797,223.518768,160.169098,6.866470
...,...,...,...,...
1149,79.158257,232.317093,152.432617,6.759275
1150,77.991844,233.275238,154.883286,6.731860
1151,78.057587,233.148560,163.969055,6.688386
1152,78.149025,233.054825,160.516373,6.739645


In [36]:
params_df.to_csv(os.path.join(submissions_directory,
                  "checkpoint_state_dict_ResNet50_15_05_2022_Single_Parameter_Scaled_RandomCrop.csv"), index_label="sample_index")