In [1]:
import tensorflow as tf
import numpy as np
import os
from tensorflow.keras import datasets, layers, models, regularizers, losses
from load_data import DataLoader
from interpolated_datagenerator import InterpolatedDataGenerator
import netCDF4 as nc

In [2]:
# Measurements requested from the DataLoader. The recorded output of
# print(X_files[0]) shows one file path per entry of this list.
measurements = ['SO2', 'CO', 'H2O', 'HCl', 'HNO3', 'N2O', 'Temperature']

# Time window handed to the DataLoader, starting from 2004246.
# NOTE(review): values look like YYYYDDD (year + day of year), matching the
# "2004d246" suffix in the file names - confirm against DataLoader.
time = [2004246, 2024010]
# NOTE(review): lon and lat are not used by any cell visible in this part of
# the notebook - confirm whether a later cell needs them.
lon = [-177.5, 177.5]
lat = [-88, 88]
lev = [0, 152]  # forwarded to the data generators as lev_range

dataloader = DataLoader(measurements=measurements, time_period=time)
X_files, y_files = dataloader.get_files()

# Both lists are sliced with the same bounds below, so a length mismatch would
# silently misalign inputs and targets.
assert len(X_files) == len(y_files), "X_files and y_files must be paired one-to-one"

# 60/20/20 split by position (no shuffling).
n_files = len(X_files)
train_size = n_files * 6 // 10
validation_size = n_files * 2 // 10
# The test split takes whatever is left, so derive its size from the remainder;
# `n_files * 2 // 10` under-counts whenever n_files is not a multiple of 5.
test_size = n_files - train_size - validation_size

# Slice boundaries, named once so X and y cannot drift apart.
train_end = train_size
validation_end = train_size + validation_size

X_train = X_files[:train_end]
y_train = y_files[:train_end]

X_validation = X_files[train_end:validation_end]
y_validation = y_files[train_end:validation_end]

X_test = X_files[validation_end:]
y_test = y_files[validation_end:]

In [3]:

# for file in y_files:
#     with nc.Dataset(file[0]) as file_data:
#         data = file_data.groups['O3 PressureGrid']['value'][:]
#         if isinstance(data, np.ma.MaskedArray):
#             print("True for file", np.max(data), file)  # isinstance is True for any MaskedArray, even when nothing is masked; use data.mask.all() to test "all values masked"



# with nc.Dataset(file_path) as file:
#             data = file.groups[f'{measure} PressureGrid']['value'][:]
#             final_data = []
#             data = data[0]

#             lat_start_idx = self.lat_to_index[self.lat_range[0]]
#             lat_end_idx = self.lat_to_index[self.lat_range[1]] + 1
#             lon_start_idx = self.lon_to_index[self.lon_range[0]]
#             lon_end_idx = self.lon_to_index[self.lon_range[1]] + 1
#             add_slice = False

#             sliced_data = data[:, lon_start_idx:lon_end_idx, lat_start_idx:lat_end_idx]

#             sliced_data = np.ma.masked_less_equal(sliced_data, 0)

#             lev_arr = file.groups[f'{measure} PressureGrid']['lev'][:]
#             for lev in range(sliced_data.shape[0]):
#                 if self.lev_range[0] <= lev_arr[lev] <= self.lev_range[1]:
#                     for lat in range(data.shape[2]):
#                         masked_arr = sliced_data[lev, :, lat]

#                         if np.ma.is_masked(masked_arr):
#                             non_masked = np.where(~masked_arr.mask)[0]
#                             masked = np.where(masked_arr.mask)[0]

#                             if non_masked.size > 0:
#                                 interpolated_arr = np.copy(masked_arr)
#                                 interpolated_values = np.interp(masked, non_masked, masked_arr[non_masked])
#                                 add_slice = True
#                                 interpolated_arr[masked] = interpolated_values

#                                 sliced_data[lev, :, lat] = interpolated_arr
#                             else:
#                                 # This error message will pop up heaps so commented to avoid spam 
#                                 # sys.stderr.write("Array with no non masked values found")
#                                 continue
#                     if self.normalise:
#                         sliced_data[lev] -= self.norm_dict[measure][0]
#                         sliced_data[lev] /= self.norm_dict[measure][1]

#                     if self.normalise and add_slice and np.min(sliced_data) >= -100:
#                         final_data.append(sliced_data[lev])
#                         add_slice = False

#             final_data = np.array(final_data)

#         return final_data

In [4]:
# Keyword arguments that are identical for the training, validation and test
# generators.
_common_generator_args = dict(
    batch_size=128,
    shuffle=False,
    data_aug=False,
    normalise=True,
    lev_range=lev,
)

# The training generator is the only one built without a norm_dict; it is given
# normalise_sample=500 instead.
# NOTE(review): presumably it derives the normalisation statistics from 500
# samples (the progress bar below counts to 500) - confirm in
# InterpolatedDataGenerator.
training_data = InterpolatedDataGenerator(
    X_files=X_train,
    y_files=y_train,
    normalise_sample=500,
    **_common_generator_args,
)

# Validation and test reuse the normalisation dictionary obtained from the
# training generator.
norm_dict = training_data.get_norm_dict()

validation_data = InterpolatedDataGenerator(
    X_files=X_validation,
    y_files=y_validation,
    norm_dict=norm_dict,
    **_common_generator_args,
)
test_data = InterpolatedDataGenerator(
    X_test,
    y_test,
    norm_dict=norm_dict,
    **_common_generator_args,
)


  0% (0 of 500) |                        | Elapsed Time: 0:00:00 ETA:  --:--:--
  1% (5 of 500) |                        | Elapsed Time: 0:00:00 ETA:   0:00:09
  2% (12 of 500) |                       | Elapsed Time: 0:00:00 ETA:   0:00:09
  3% (16 of 500) |                       | Elapsed Time: 0:00:00 ETA:   0:00:09
  3% (19 of 500) |                       | Elapsed Time: 0:00:00 ETA:   0:00:09
  5% (25 of 500) |#                      | Elapsed Time: 0:00:00 ETA:   0:00:09
  6% (30 of 500) |#                      | Elapsed Time: 0:00:00 ETA:   0:00:09
  6% (32 of 500) |#                      | Elapsed Time: 0:00:00 ETA:   0:00:10
  7% (38 of 500) |#                      | Elapsed Time: 0:00:00 ETA:   0:00:10
  9% (45 of 500) |##                     | Elapsed Time: 0:00:00 ETA:   0:00:09
 10% (51 of 500) |##                     | Elapsed Time: 0:00:01 ETA:   0:00:09
 11% (55 of 500) |##                     | Elapsed Time: 0:00:01 ETA:   0:00:09
 12% (62 of 500) |##                    

In [5]:
# Pull one batch from the training generator and read the per-sample shapes
# (everything after the leading batch axis).
# NOTE(review): index 6 is arbitrary as far as this cell shows - any valid
# batch index should give the same shapes; confirm.
batch_X, batch_y = training_data[6]
input_shape = batch_X.shape[1:]

# Unpacking requires the target to have exactly three non-batch dimensions;
# output_shape is then stored as a plain (H, W, C) tuple.
H, W, C = batch_y.shape[1:]
output_shape = (H, W, C)

print(f"Input shape: {input_shape}")
print(f"Output shape: {output_shape}")

Input shape: (72, 45, 308)
Output shape: (72, 45, 35)


In [6]:
# Show the first entry of X_files: per the recorded output, a tuple with one
# file path per measurement, all for the same day (2004d246).
print(X_files[0])

('\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\SO2\\MLS-Aura_L3DB-SO2_v04-23-c01_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\CO\\MLS-Aura_L3DB-CO_v04-23-c01_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\H2O\\MLS-Aura_L3DB-H2O_v05-01-c06_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\HCl\\MLS-Aura_L3DB-HCl_v04-23-c01_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\HNO3\\MLS-Aura_L3DB-HNO3_v04-23-c01_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\N2O\\MLS-Aura_L3DB-N2O_v05-01-c06_2004d246.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\Temperature\\MLS-Aura_L3DB-Temperature_v04-23-c01_2004d246.nc')


In [7]:
# Show the tenth entry of X_files for comparison with the first.
# NOTE(review): the recorded output is day 2004d256, i.e. 10 days after entry 0
# (2004d246) rather than 9 - presumably one day is missing from the file list;
# verify against the data directory.
print(X_files[9])

('\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\SO2\\MLS-Aura_L3DB-SO2_v04-23-c01_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\CO\\MLS-Aura_L3DB-CO_v04-23-c01_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\H2O\\MLS-Aura_L3DB-H2O_v05-01-c06_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\HCl\\MLS-Aura_L3DB-HCl_v04-23-c01_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\HNO3\\MLS-Aura_L3DB-HNO3_v04-23-c01_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\N2O\\MLS-Aura_L3DB-N2O_v05-01-c06_2004d256.nc', '\\Users\\simma362\\Desktop\\Code\\Baseline-CNN\\Data\\Temperature\\MLS-Aura_L3DB-Temperature_v04-23-c01_2004d256.nc')


In [8]:
# Sanity check: print the value range of every batch in each split.
# Values are printed as (min, max); the labels now say "Min max" to match that
# order (they previously read "Max min").
# NOTE(review): the recorded output shows a minimum of -999.99 in every batch,
# which looks like a fill/sentinel value surviving normalisation - confirm
# whether the generator is expected to mask or drop it.
for split_name, split_generator in (
    ('training data', training_data),
    ('validation data', validation_data),
    ('test data', test_data),
):
    print(split_name)
    # batch_X / batch_y are deliberately rebound at notebook level, as before.
    for batch_X, batch_y in split_generator:
        print("Min max batch X: ", np.min(batch_X), np.max(batch_X))
        print("Min max batch y: ", np.min(batch_y), np.max(batch_y))
        # print(batch_X.shape, batch_y.shape)

training data
Max min batch X:  -999.99 0.9225868
Max min batch y:  -999.99 0.9677968
Max min batch X:  -999.99 0.9024142
Max min batch y:  -999.99 0.8246265
Max min batch X:  -999.99 0.973067
Max min batch y:  -999.99 0.885713
Max min batch X:  -999.99 0.970185
Max min batch y:  -999.99 0.94215137
Max min batch X:  -999.99 0.89364433
Max min batch y:  -999.99 0.8812817
Max min batch X:  -999.99 1.0471516
Max min batch y:  -999.99 0.7755967
Max min batch X:  -999.99 0.9537482
Max min batch y:  -999.99 0.89827025
Max min batch X:  -999.99 0.93259805
Max min batch y:  -999.99 0.90291524
Max min batch X:  -999.99 0.94720304
Max min batch y:  -999.99 0.80021244
Max min batch X:  -999.99 1.0057644
Max min batch y:  -999.99 0.8400959
Max min batch X:  -999.99 0.90834284
Max min batch y:  -999.99 0.87958896
Max min batch X:  -999.99 0.95862585
Max min batch y:  -999.99 0.7917706
Max min batch X:  -999.99 0.92003965
Max min batch y:  -999.99 0.8818979
Max min batch X:  -999.99 0.89117235
Max m