In [1]:
import os
from glob import glob
import pandas as pd
import numpy as np
import torch
import timm
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataset import random_split
from torch import nn, optim
from torch.nn import functional as F
from torchvision import transforms
import matplotlib.pyplot as plt
from math import ceil
from tqdm.notebook import tqdm
import wandb
from torch_lr_finder import LRFinder
from utils import EarlyStopping
import utils
import plotly.express as px

In [2]:
from scipy import ndimage

In [3]:
# run = wandb.init(project='HyperView-initial', reinit=True)
wandb.init(mode="disabled") # in case of testing code

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.




### Paths definition

In [4]:
train_data_directory = r'../../train_data'
test_data_directory = r'../../test_data'
saved_models_directory = r'../../saved_models'
submissions_directory = r'../../submissions'

### Load mean and std values of training dataset

In [5]:
means_stds_arrays = np.load(os.path.join(
    train_data_directory, 'means_stds_values_training_data.npz'))
means = np.array(means_stds_arrays['means'])
stds = np.array(means_stds_arrays['stds'])

### Dataset definition

In [6]:
class HyperViewDataset(Dataset):
    def __init__(self, gt_file, img_dir, transform=True, means=None, stds=None, train_transforms=False):
        self.img_dir = img_dir
        if gt_file is not None:
            self.gt = pd.read_csv(gt_file)[['sample_index', 'P', 'K', 'Mg', 'pH']]
        else:
            self.gt = None
        self.transform = transform  # whether to perform transformation of input data
        # whether to perform transformations like on training data
        self.train_transforms = train_transforms

        if self.train_transforms:
            self.training_transforms_composition = transforms.Compose([
                transforms.RandomHorizontalFlip(p=0.4), transforms.RandomVerticalFlip(p=0.4)])

        self.means = means  # mean values for every band used to normalize data
        self.stds = stds  # std values for every band used to normalize data

        self.img_files = np.array(
            sorted(
                glob(os.path.join(img_dir, "*.npz")),
                key=lambda x: int(os.path.basename(x).replace(".npz", "")),
            )
        )

    def pad_to_minimum_size(self, min_size, image):
        # pads numpy array to defined in `min_size` minimum size of array (w, h)
        c, h, w = image.shape
        h_diff = (min_size - h)/2
        w_diff = (min_size - w)/2

        if not h_diff.is_integer():
            h_pad1 = int(h_diff - 0.5)
            h_pad2 = int(h_diff + 0.5)
        else:
            h_pad1 = h_pad2 = int(h_diff)

        if not w_diff.is_integer():
            w_pad1 = int(w_diff - 0.5)
            w_pad2 = int(w_diff + 0.5)
        else:
            w_pad1 = w_pad2 = int(w_diff)

        # check if any padding is bigger than zero
        if h_pad1+h_pad2+w_pad1+h_pad2 == 0:
            return image
        else:
            return transforms.functional.pad(image, (w_pad1, h_pad1, w_pad2, h_pad2), fill=0)

    def __len__(self):
        return len(self.img_files)

    def __getitem__(self, idx):
        # load hyperspectral image to array
        img_arr = np.ma.MaskedArray(**np.load(self.img_files[idx]))
        img_tensor = torch.as_tensor(img_arr.data, dtype=torch.float)
        # Inverting mask is necessary due to masking method in numpy
        img_tensor_mask = ~torch.as_tensor(img_arr.mask)
        img_masked_tensor = torch.mul(img_tensor, img_tensor_mask)

        if self.transform:
            img_masked_tensor_transformed = transforms.functional.normalize(
                img_masked_tensor, mean=self.means.tolist(), std=self.stds.tolist())
            img_masked_tensor_transformed = self.pad_to_minimum_size(
                300, img_masked_tensor_transformed)
            if self.train_transforms:
                img_masked_tensor_transformed = self.training_transforms_composition(
                    img_masked_tensor_transformed)
            img_result_tensor = img_masked_tensor_transformed
        else:
            img_result_tensor = img_masked_tensor
        # load labels
        if self.gt is not None:
            P_gt = self.gt.loc[idx, 'P']
            K_gt = self.gt.loc[idx, 'K']
            Mg_gt = self.gt.loc[idx, 'Mg']
            pH_gt = self.gt.loc[idx, 'pH']
            sample_index = self.gt.loc[idx, 'sample_index']

            sample = {'image': img_result_tensor, 'P': P_gt,
                      'K': K_gt, 'Mg': Mg_gt, 'pH': pH_gt, 'sample_index': sample_index}
        else:
            sample = {'image': img_result_tensor}

        return sample

In [14]:
BATCH_SIZE = 12

In [15]:
test_data = HyperViewDataset(None, os.path.join(
    test_data_directory), True, means, stds, train_transforms=False)

In [16]:
test_dataloader = DataLoader(
    test_data, batch_size=BATCH_SIZE, pin_memory=True, drop_last=False, num_workers=4, shuffle=False)

### Model Definition

In [17]:
import torchvision
from torchvision import datasets, models, transforms

In [18]:
class ResNetCNN(nn.Module):
    def __init__(self, pretrained):
        super().__init__()
        self.resnet_model = models.resnet50(pretrained=False)

        self.resnet_model.conv1 = nn.Conv2d(150, 64, kernel_size=(
            7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        self.resnet_model.fc = nn.Linear(
            in_features=2048, out_features=1, bias=True)

    def forward(self, x):
        x = self.resnet_model(x)
#         return {'P': x[0], 'K': x[1], 'Mg': x[2], 'pH': x[3]}
        return x

In [19]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [20]:
model = ResNetCNN(pretrained=False)
model.to(device)

ResNetCNN(
  (resnet_model): ResNet(
    (conv1): Conv2d(150, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): Bottleneck(
        (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (downsample): Sequential(
 

### TEST: Loading saved weights of single parameters training

In [25]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_08_05_2022_Single_Parameter_'

In [21]:
predict_parameter = 'K'

In [26]:
saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_parameter + '.pth')
saved_weights

'../../saved_models/checkpoint_state_dict_ResNet50_08_05_2022_Single_Parameter_K.pth'

In [28]:
model.load_state_dict(torch.load(saved_weights))

<All keys matched successfully>

In [30]:
output_list = []
bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
for test_data in bar:  # for each test step
    img_data_device = test_data['image'].to(device, dtype=torch.float)

    output = model(img_data_device)
    output_cpu = output.detach().cpu().numpy()
    output_list.append(output_cpu)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

In [31]:
output_list

[array([[219.73125],
        [219.90688],
        [218.74104],
        [218.73386],
        [227.79045],
        [233.96481],
        [220.0466 ],
        [219.46327],
        [224.69005],
        [226.60704],
        [218.40288],
        [218.6404 ]], dtype=float32),
 array([[221.46309],
        [236.82191],
        [221.27827],
        [221.30946],
        [220.82127],
        [220.66008],
        [231.09438],
        [219.86742],
        [220.95522],
        [221.18033],
        [221.31923],
        [221.73761]], dtype=float32),
 array([[222.70563],
        [223.15962],
        [220.68198],
        [221.88658],
        [222.80907],
        [221.92252],
        [223.6396 ],
        [226.2228 ],
        [224.56297],
        [220.63718],
        [220.42104],
        [221.7581 ]], dtype=float32),
 array([[221.99109],
        [221.23796],
        [223.0386 ],
        [224.07095],
        [231.92618],
        [224.57991],
        [222.18024],
        [222.35135],
        [224.97554],
    

In [34]:
single_param_temp_df = pd.DataFrame([item for sublist in output_list for item in sublist], columns=[predict_parameter])
single_param_temp_df

Unnamed: 0,K
0,219.731247
1,219.906876
2,218.741043
3,218.733856
4,227.790451
...,...
1149,231.113312
1150,232.966354
1151,236.959106
1152,213.100266


### Predict in loop

In [36]:
weights_filename_base = 'checkpoint_state_dict_ResNet50_08_05_2022_Single_Parameter_'

In [41]:
predict_parameters = ['P', 'K', 'Mg', 'pH'] # change to dict with: param -> weights_filename

In [37]:
param_df_list = []

for predict_param in predict_parameters:
    saved_weights = os.path.join(saved_models_directory, weights_filename_base + predict_param + '.pth')
    
    model = ResNetCNN(pretrained=False)
    model.to(device)
    model.load_state_dict(torch.load(saved_weights))
    model.eval()
    
    
    single_param_output_list = []
    bar = tqdm(test_dataloader, position=0, leave=False, desc=f'test data')
    for test_data in bar:  # for each test step
        img_data_device = test_data['image'].to(device, dtype=torch.float)

        output = model(img_data_device)
        output_cpu = output.detach().cpu().numpy()
        single_param_output_list.append(output_cpu)
    
    single_param_temp_df = pd.DataFrame([item for sublist in single_param_output_list for item in sublist], columns=[predict_param])
    param_df_list.append(single_param_temp_df)

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

test data:   0%|          | 0/97 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: '../../saved_models/checkpoint_state_dict_ResNet50_08_05_2022_Single_Parameter_pH.pth'

In [43]:
param_df_list

[              P
 0     67.745232
 1     67.843025
 2     67.839531
 3     67.838814
 4     66.534027
 ...         ...
 1149  62.561455
 1150  60.745144
 1151  67.522720
 1152  61.413960
 1153  61.764919
 
 [1154 rows x 1 columns],
                K
 0     216.807999
 1     216.584045
 2     216.491150
 3     216.467422
 4     217.687988
 ...          ...
 1149  205.652237
 1150  204.982651
 1151  296.332428
 1152  246.414383
 1153  340.746460
 
 [1154 rows x 1 columns],
               Mg
 0     160.636673
 1     160.900467
 2     160.872208
 3     161.024094
 4     160.645462
 ...          ...
 1149  164.617508
 1150  165.262253
 1151  177.614548
 1152  165.780685
 1153  150.607285
 
 [1154 rows x 1 columns],
             pH
 0     6.831513
 1     6.829779
 2     6.824730
 3     6.827704
 4     6.835938
 ...        ...
 1149  6.831393
 1150  6.749685
 1151  7.654590
 1152  6.779434
 1153  6.752251
 
 [1154 rows x 1 columns]]

#### Merge DFs and convert to CSV

In [46]:
params_df = pd.concat(param_df_list, axis=1)

In [47]:
params_df

Unnamed: 0,P,K,Mg,pH
0,67.745232,216.807999,160.636673,6.831513
1,67.843025,216.584045,160.900467,6.829779
2,67.839531,216.491150,160.872208,6.824730
3,67.838814,216.467422,161.024094,6.827704
4,66.534027,217.687988,160.645462,6.835938
...,...,...,...,...
1149,62.561455,205.652237,164.617508,6.831393
1150,60.745144,204.982651,165.262253,6.749685
1151,67.522720,296.332428,177.614548,7.654590
1152,61.413960,246.414383,165.780685,6.779434


In [49]:
params_df.to_csv(os.path.join(submissions_directory,
                  "checkpoint_state_dict_ResNet50_10_05_2022_1ParamModels.csv"), index_label="sample_index")