-
Notifications
You must be signed in to change notification settings - Fork 11
/
utils_pytorch.py
331 lines (294 loc) · 12.6 KB
/
utils_pytorch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
import os
import numpy as np
import pandas as pd
import h5py
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
os.environ["HDF5_USE_FILE_LOCKING"] = 'FALSE'
TYPES = ['vis', 'ir069', 'ir107', 'vil', 'lght']
DEFAULT_CATALOG = '/home/gridsan/groups/EarthIntelligence/datasets/SEVIR/CATALOG.csv'
DEFAULT_DATA_HOME = '/home/gridsan/groups/EarthIntelligence/datasets/SEVIR/data/'
FRAME_TIMES = np.arange(-120.0, 120.0, 5) * 60
# TODO : MANY!
# Example of how to create a PyTorch DataLoader for the SEVIR dataset
def get_sevir_loader(self=None,
                     batch_size=3,
                     x_img_types=None,
                     y_img_types=None,
                     catalog=DEFAULT_CATALOG,
                     start_date=None,
                     end_date=None,
                     datetime_filter=None,
                     catalog_filter=None,
                     unwrap_time=False,
                     sevir_data_home=DEFAULT_DATA_HOME,
                     shuffle=False,
                     pin_memory=True,
                     output_type=np.float32,
                     normalize_x=None,
                     normalize_y=None,
                     num_workers=0,
                     ):
    """
    Build a torch ``DataLoader`` over a ``SEVIR`` dataset.

    :param self: unused; kept (with a default) for backward compatibility —
        the original signature carried a stray ``self`` even though this is a
        module-level function
    :param batch_size: number of samples per batch
    :param x_img_types: list of image types used as model inputs
        (defaults to ``['vil']``)
    :param y_img_types: list of image types used as model targets, or None
    :param catalog: path to the SEVIR catalog CSV, or a pre-loaded DataFrame
    :param start_date: keep only events after this datetime
    :param end_date: keep only events at or before this datetime
    :param datetime_filter: mask function over the catalog's time_utc column
    :param catalog_filter: mask function over the whole catalog DataFrame
    :param unwrap_time: if True, items are single frames instead of sequences
    :param sevir_data_home: directory containing the SEVIR HDF5 files
    :param shuffle: passed through to DataLoader
    :param pin_memory: passed through to DataLoader
    :param output_type: numpy dtype of generated arrays
    :param normalize_x: per-x-type (scale, offset) pairs, or None
    :param normalize_y: per-y-type (scale, offset) pairs, or None
    :param num_workers: passed through to DataLoader
    :return: a ``torch.utils.data.DataLoader`` over the SEVIR dataset
    """
    # None sentinel instead of a mutable list default (shared across calls).
    if x_img_types is None:
        x_img_types = ['vil']
    dataset = SEVIR(x_img_types=x_img_types,
                    y_img_types=y_img_types,
                    catalog=catalog,
                    start_date=start_date,
                    end_date=end_date,
                    datetime_filter=datetime_filter,
                    catalog_filter=catalog_filter,
                    unwrap_time=unwrap_time,
                    sevir_data_home=sevir_data_home,
                    output_type=output_type,
                    normalize_x=normalize_x,
                    normalize_y=normalize_y)
    return DataLoader(dataset,
                      batch_size=batch_size,
                      shuffle=shuffle,
                      pin_memory=pin_memory,
                      num_workers=num_workers)
class SEVIR(Dataset):
    """
    PyTorch ``Dataset`` serving co-located SEVIR image sequences.

    Each item corresponds to one SEVIR event id for which every requested
    image type is available. ``__getitem__`` returns a list of arrays (one
    per entry of ``x_img_types``), or an ``(X, Y)`` pair when
    ``y_img_types`` is given.

    Parameters
    ----------
    x_img_types list
        Image types used as model inputs (defaults to ['vil']; see the
        module-level TYPES list).
    y_img_types list or None
        Image types used as model targets. If None, __getitem__ returns
        only the inputs.
    catalog str or pd.DataFrame
        Path of the SEVIR catalog CSV, or an already-loaded catalog frame.
    start_date datetime or None
        Keep only catalog rows with time_utc strictly after this time.
    end_date datetime or None
        Keep only catalog rows with time_utc at or before this time.
    datetime_filter function or None
        Mask applied to the catalog's time_utc column (return True to keep
        a row), e.g. lambda t: np.logical_and(t.dt.hour>=13, t.dt.hour<=21)
    catalog_filter function or None
        Mask applied to the whole catalog DataFrame, e.g.
        lambda c: [s[0] == 'S' for s in c.id]
    unwrap_time bool
        If True, single frames are returned instead of full sequences.
    sevir_data_home str
        Directory containing the SEVIR HDF5 data files.
    output_type np.dtype
        dtype of the generated arrays.
    normalize_x list of tuple or None
        Per-x-type (scale, offset) pairs; data is mapped via
        X -> (X - offset) * scale. If None, no scaling is done.
    normalize_y list of tuple or None
        Per-y-type (scale, offset) pairs, same convention.

    Raises
    ------
    ValueError
        If normalize_x / normalize_y lengths do not match the
        corresponding image-type lists.

    Examples
    --------
    # Radar image sequences only
    vil_ds = SEVIR(x_img_types=['vil'])
    X = vil_ds[1234]          # list the same length as x_img_types
    # IR satellite + lightning as inputs, radar as target
    ds = SEVIR(x_img_types=['ir107', 'lght'], y_img_types=['vil'])
    X, Y = ds[420]
    """
    def __init__(self,
                 x_img_types=None,
                 y_img_types=None,
                 catalog=DEFAULT_CATALOG,
                 start_date=None,
                 end_date=None,
                 datetime_filter=None,
                 catalog_filter=None,
                 unwrap_time=False,
                 sevir_data_home=DEFAULT_DATA_HOME,
                 output_type=np.float32,
                 normalize_x=None,
                 normalize_y=None
                 ):
        # None sentinel instead of a mutable list default (shared across calls).
        if x_img_types is None:
            x_img_types = ['vil']
        self._samples = None
        self._hdf_files = {}
        self.x_img_types = x_img_types
        self.y_img_types = y_img_types
        if isinstance(catalog, str):
            self.catalog = pd.read_csv(catalog, parse_dates=['time_utc'], low_memory=False)
        else:
            self.catalog = catalog
        self.datetime_filter = datetime_filter
        self.catalog_filter = catalog_filter
        self.start_date = start_date
        self.end_date = end_date
        self.unwrap_time = unwrap_time
        self.sevir_data_home = sevir_data_home
        self.output_type = output_type
        self.normalize_x = normalize_x
        self.normalize_y = normalize_y
        # Validate with real exceptions; asserts are stripped under -O.
        if normalize_x and len(normalize_x) != len(x_img_types):
            raise ValueError('normalize_x must have one (scale, offset) pair per x_img_type')
        if normalize_y and len(normalize_y) != len(y_img_types):
            raise ValueError('normalize_y must have one (scale, offset) pair per y_img_type')
        if self.start_date:
            self.catalog = self.catalog[self.catalog.time_utc > self.start_date]
        if self.end_date:
            self.catalog = self.catalog[self.catalog.time_utc <= self.end_date]
        if self.datetime_filter:
            self.catalog = self.catalog[self.datetime_filter(self.catalog.time_utc)]
        if self.catalog_filter:
            self.catalog = self.catalog[self.catalog_filter(self.catalog)]
        self._compute_samples()
        self._open_files()

    def _compute_samples(self):
        """
        Compute the table of usable samples from the catalog.

        Keeps only events that have all requested image types co-located,
        and sets ``self._samples``.
        """
        # locate all events containing colocated x_img_types and y_img_types
        imgt = self.x_img_types
        if self.y_img_types:
            imgt = list(set(imgt + self.y_img_types))  # remove duplicates
        imgts = set(imgt)
        filtcat = self.catalog[np.logical_or.reduce([self.catalog.img_type == i for i in imgt])]
        # remove rows missing one or more requested img_types
        filtcat = filtcat.groupby('id').filter(lambda x: imgts.issubset(set(x['img_type'])))
        # If there are repeated IDs, remove them (this is a bug in SEVIR)
        filtcat = filtcat.groupby('id').filter(lambda x: x.shape[0] == len(imgt))
        self._samples = filtcat.groupby('id').apply(lambda df: df_to_series(df, imgt, unwrap_time=self.unwrap_time))

    def _open_files(self):
        """
        Open every HDF5 file referenced by ``self._samples`` for reading.

        Handles are kept open in ``self._hdf_files`` for the lifetime of the
        dataset; call :meth:`close` to release them.
        """
        imgt = self.x_img_types
        if self.y_img_types:
            imgt = list(set(imgt + self.y_img_types))  # remove duplicates
        hdf_filenames = []
        for t in imgt:
            hdf_filenames += list(np.unique(self._samples[f'{t}_filename'].values))
        self._hdf_files = {}
        for f in hdf_filenames:
            print('Opening HDF5 file for reading', f)
            self._hdf_files[f] = h5py.File(self.sevir_data_home + '/' + f, 'r')

    def __len__(self):
        """Number of samples; required for map-style Dataset + DataLoader."""
        return len(self._samples)

    def __getitem__(self, idx):
        """
        Return the sample at position ``idx``.

        :param idx: integer position into the computed sample table
        :return: list X of arrays (one per x_img_type), or a tuple (X, Y)
                 when y_img_types was provided
        """
        data = read_data(self._samples.iloc[idx], {}, self._hdf_files, self.unwrap_time)
        X = [data[t].astype(self.output_type) for t in self.x_img_types]
        if self.normalize_x:
            X = [normalize(X[k], s[0], s[1]) for k, s in enumerate(self.normalize_x)]
        if self.y_img_types is None:
            return X
        Y = [data[t].astype(self.output_type) for t in self.y_img_types]
        if self.normalize_y:
            Y = [normalize(Y[k], s[0], s[1]) for k, s in enumerate(self.normalize_y)]
        return X, Y

    def close(self):
        """Close all open HDF5 file handles (the original never released them)."""
        for f in self._hdf_files.values():
            f.close()
        self._hdf_files = {}

    def __del__(self):
        # Best-effort cleanup; never raise from a finalizer.
        try:
            self.close()
        except Exception:
            pass
def read_data(row, data, hdf_files, unwrap_time=False):
    """
    Load the imagery referenced by one sample row into ``data``.

    :param row: mapping with keys of the form ``{type}_filename``,
                ``{type}_index`` and (when unwrap_time) ``{type}_time_index``
    :param data: dict accumulating arrays per image type (mutated and returned)
    :param hdf_files: open HDF5 handles keyed by filename
    :param unwrap_time: if True, read a single frame selected by the row's
                        time index instead of the full sequence
    :return: ``data`` with one array per image type, stacked along axis 0
    """
    # Image types are recovered from the row's key prefixes.
    img_types = np.unique([key.split('_')[0] for key in row.keys()])
    for img_t in img_types:
        file_name = row[f'{img_t}_filename']
        file_idx = row[f'{img_t}_index']
        if unwrap_time:
            frame = row[f'{img_t}_time_index']
            t_slice = slice(frame, frame + 1)
        else:
            t_slice = slice(0, None)
        if img_t == 'lght':
            # Lightning is stored as raw event lists; bin them into a grid.
            events = hdf_files[file_name][file_idx][:]
            sample = lightning_to_hist(events, t_slice)
        else:
            sample = hdf_files[file_name][img_t][file_idx:file_idx + 1, :, :, t_slice]
        if img_t in data:
            data[img_t] = np.concatenate((data[img_t], sample), axis=0)
        else:
            data[img_t] = sample
    return data
def lightning_to_hist(data, t_slice=slice(0, None)):
    """
    Bin an Nx5 lightning event matrix into an (y, x, t) count histogram.

    :param data: Nx5 array; column 0 is event time, columns 3 and 4 are
                 grid x and y coordinates — assumed from the indexing here,
                 TODO confirm against the SEVIR lght file layout
    :param t_slice: temporal slice; ``stop is None`` means all frames,
                    otherwise a single frame is selected
    :return: float32 histogram of shape (1, 48, 48, T)
    """
    single_frame = t_slice.stop is not None
    out_size = (48, 48, 1) if single_frame else (48, 48, len(FRAME_TIMES))
    if data.shape[0] == 0:
        return np.zeros((1,) + out_size, dtype=np.float32)
    # Discard strikes that fall outside the 48x48 grid.
    x_coord, y_coord = data[:, 3], data[:, 4]
    in_grid = np.logical_and.reduce(
        [x_coord >= 0, x_coord < out_size[0], y_coord >= 0, y_coord < out_size[1]])
    data = data[in_grid, :]
    if data.shape[0] == 0:
        return np.zeros((1,) + out_size, dtype=np.float32)
    times = data[:, 0]
    if single_frame:  # keep only the strikes inside the requested frame's window
        # NOTE(review): when unwrap_time yields frame indices up to n_frames-1,
        # t_slice.stop can reach len(FRAME_TIMES); FRAME_TIMES[t_slice.stop]
        # would then raise IndexError — confirm FRAME_TIMES/frame-count sizing.
        if t_slice.stop > 0:
            keep = np.logical_and(times >= FRAME_TIMES[t_slice.stop - 1],
                                  times < FRAME_TIMES[t_slice.stop])
        else:  # special case: frame 0 uses lightning from frame 1
            keep = np.logical_and(times >= FRAME_TIMES[0], times < FRAME_TIMES[1])
        data = data[keep, :]
        z = np.zeros(data.shape[0], dtype=np.int64)
    else:  # bin every strike into its frame
        z = np.digitize(times, FRAME_TIMES) - 1
        z[z == -1] = 0  # special case: frame 0 uses lightning from frame 1
    x = data[:, 3].astype(np.int64)
    y = data[:, 4].astype(np.int64)
    flat = np.ravel_multi_index(np.array([y, x, z]), out_size)
    counts = np.bincount(flat, minlength=np.prod(out_size))
    return counts.reshape(out_size).astype(np.float32)[np.newaxis, :]
def normalize(x, scale, offset, reverse=False):
    """
    Apply (or undo) an affine normalization.

    Forward maps ``x -> (x - offset) * scale``; reverse maps
    ``x -> x / scale + offset``.

    :param x: data array or scalar
    :param scale: multiplicative scale constant
    :param offset: additive offset constant
    :param reverse: if True, invert the normalization
    :return: transformed value
    """
    return x / scale + offset if reverse else (x - offset) * scale
def df_to_series(df, image_types, n_frames=49, unwrap_time=False):
    """
    Flatten one event's catalog rows into a single sample record.

    :param df: catalog rows for a single event id (one row per image type)
    :param image_types: image types to extract from the event
    :param n_frames: number of frames per event sequence
    :param unwrap_time: if True, emit one row per frame with a
                        ``{type}_time_index`` column
    :return: DataFrame with ``{type}_filename`` / ``{type}_index`` columns
             (plus ``{type}_time_index`` when unwrap_time)
    """
    indexed = df.set_index('img_type')
    record = {}
    for img_t in image_types:
        row = indexed.loc[img_t]
        # Lightning files are keyed by event id rather than a numeric index.
        ref = row.id if img_t == 'lght' else row.file_index
        if unwrap_time:
            record[f'{img_t}_filename'] = [row.file_name] * n_frames
            record[f'{img_t}_index'] = [ref] * n_frames
            record[f'{img_t}_time_index'] = range(n_frames)
        else:
            record[f'{img_t}_filename'] = [row.file_name]
            record[f'{img_t}_index'] = [ref]
    return pd.DataFrame(record)