forked from dl4sits/BreizhCrops
-
Notifications
You must be signed in to change notification settings - Fork 4
/
generate_datasets.py
198 lines (173 loc) · 8.98 KB
/
generate_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
import glob, math, sys, os, time, random, argparse
import fiona
import rasterio as rio
import rasterio.mask
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
def main(rasters_folder, stacked_raster, train_feature, output_dir, train_ratio, predict_ratio, pixel_ratio, cache):
os.environ["GDAL_CACHEMAX"] = cache
# stacking rasters if rasters_folder is provided.
if rasters_folder:
# Stacking time series rasters together
rasters = glob.glob(os.path.join(rasters_folder, "*.tif")) + glob.glob(
os.path.join(rasters_folder, "*.tiff"))
stack(rasters, stacked_raster)
# Generatin train/test datasets
train_csv = train_feature.replace(".shp", "_train_datasat.csv")
test_csv = train_feature.replace(".shp", "_test_datasat.csv")
if output_dir:
os.makedirs(output_dir, exist_ok=True)
train_csv = os.path.join(output_dir, os.path.basename(train_csv))
test_csv = os.path.join(output_dir, os.path.basename(test_csv))
train_list, test_list, predict_list = split_train_feature(train_feature, train_ratio, predict_ratio)
generate_training_data(stacked_raster, train_feature, test_list, test_csv, pixel_ratio)
generate_training_data(stacked_raster, train_feature, train_list, train_csv, pixel_ratio)
def stack(rasters, out_raster):
# Read in metadata
first_raster = rio.open(rasters[0], 'r')
out_raster = rio.open(out_raster, 'w', nodata=first_raster.nodata, driver='GTiff', height=first_raster.shape[0], width=first_raster.shape[1], count=first_raster.count*len(rasters), dtype=first_raster.read(1).dtype, crs=first_raster.crs, transform=first_raster.transform)
band_index = 0
for raster in tqdm(rasters):
for band in rio.open(raster, "r").read():
band_index+=1
out_raster.write(band, band_index)
out_raster.close()
def interpolation(input_raster, output_raster, n_channels, threshold, cache="50%"):
os.environ["GDAL_CACHEMAX"] = cache
src = rio.open(input_raster, 'r')
meta = src.meta.copy()
nodata=meta["nodata"]
height=meta["height"]
width=meta["width"]
count=meta["count"]
dtype = meta["dtype"]
dst = rio.open(output_raster, 'w', nodata=nodata, driver='GTiff',
height=height, width=width,
count=threshold*n_channels, dtype=dtype,
crs=meta["crs"], transform=meta["transform"])
# Read raster as array [bands, height, width]
array = src.read()
# Reshape array
n_pixels = height*width
array = array.transpose(1, 2, 0).reshape(n_pixels, int(count / n_channels), n_channels)
# Loop through every pixel
out_array = np.zeros((n_pixels, threshold, n_channels), dtype=dtype)
for i, arr in tqdm(enumerate(array), total=n_pixels):
# Delete nodata sequence
nodata_seq = np.unique(np.where(arr == nodata)[0])
arr = np.delete(arr, nodata_seq, axis=0)
# Define a sequencelength threshold,
# if sequencelength of pixel >= threshold, randomly extract from sequencelength
# if sequencelength of 0.5*threshold < pixel < threshold, we randomly interpolate value from sequencelength to it
# if sequencelength of pixel < 0.5*threshold, turn whole sequence to nodata
sequencelength = arr.shape[0]
if sequencelength >= threshold:
idxs = np.random.choice(sequencelength, threshold, replace=False)
idxs.sort()
arr = arr[idxs]
elif 0.5*threshold < sequencelength < threshold:
idxs = np.random.choice(sequencelength, threshold - sequencelength, replace=False)
idxs = np.append(np.arange(sequencelength), idxs)
idxs.sort()
arr = arr[idxs]
else:
arr = np.full((threshold, n_channels), nodata)
# Write in pixel
out_array[i, :] = arr
# Reshape to [bands, width, height]
out_array = out_array.reshape(height, width, -1).transpose(2, 0, 1)
# Write out_array to out_raster
dst.write(out_array)
def split_train_feature(train_feature, train_ratio=0.9, predict_ratio=0):
# read train_feature, get classvalue
shp = fiona.open(train_feature, "r")
classvalues = [int(feature['properties']['Classvalue']) for feature in shp]
train_list = []
test_list = []
predict_list = []
for classvalue in list(set(classvalues)):
fid_list = []
for i in range(len(classvalues)):
if classvalues[i] == classvalue:
fid_list.append(i)
random.shuffle(fid_list)
train = fid_list[:int(len(fid_list)*train_ratio)]
test = fid_list[int(len(fid_list)*train_ratio):int(len(fid_list)*(1-predict_ratio))]
predict = fid_list[int(len(fid_list)*(1-predict_ratio)):int(len(fid_list)*1)]
[train_list.append(i) for i in train]
[test_list.append(i) for i in list(test)]
[predict_list.append(i) for i in list(predict)]
return train_list, test_list, predict_list
def generate_training_data(input_raster, train_feature, sample_list, out_csv, pixel_ratio=0.2):
# read input_raster and train_feature
src = rio.open(input_raster)
shp = fiona.open(train_feature, "r")
# Reproject train_feature if the crs are different
if src.crs != shp.crs:
shp = gpd.read_file(train_feature)
shp = shp.to_crs(f"{src.crs}")
train_feature = train_feature.replace(".shp", "_reprojected.shp")
shp.to_file(train_feature)
shp = fiona.open(train_feature, "r")
# generate empty array to store data
out_images = np.zeros(shape=(int(src.width*src.height/3), src.count+1), dtype=np.int64)
row0 = 0
# loop through every feature(polygon) to extract pixels' value within it.
with tqdm(sample_list, desc = "Generating test/train datasets") as samples:
for i in samples:
# Masking raster using feature
shape = [shp[i]["geometry"]]
out_image, out_transform = rio.mask.mask(src, shape, crop=True)
# Reshape array to (n_pixels, n_bands)
out_image = out_image.reshape(-1).reshape(-1, src.count, order='F')
# Delete nodata rows
nodata_row = np.unique(np.where(out_image == -9999)[0])
out_image = np.delete(out_image, nodata_row, axis=0)
# Add classvalue to every row
classvalue = int(shp[i]['properties']['Classvalue'])
lc_id = np.linspace(classvalue, classvalue, out_image.shape[0], dtype="int64").reshape(-1, 1)
out_image = np.append(lc_id, out_image, 1)
# Write in out_image
row1 = row0 + out_image.shape[0]
out_images[row0:row1, :] = out_image
row0 += out_image.shape[0]
# Delete zero_rows
zero_rows = np.where((out_images[:, 1:] == 0).all(axis=1))[0]
out_images = np.delete(out_images, zero_rows, axis=0)
# To avoid similarity, only part of the whole data are exported
# Indices's datatype should be int32 cause maximum value of int16 is 32767
indices = np.floor(np.arange(1, out_images.shape[0], 1/pixel_ratio)).astype('int64')
out_images = np.take(out_images, indices, axis=0)
# Add index in second column
index = np.arange(0, out_images.shape[0], 1, dtype=np.int64)
out_images = np.insert(out_images, 1, index, axis=1)
# Export to .csv format
df = pd.DataFrame(out_images, index)
df.to_csv(out_csv, index=False, header=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Generating training datasets')
parser.add_argument('--rasters_folder', dest='rasters_folder',
help='rasters folder',
default=None)
parser.add_argument('--stacked_raster', dest='stacked_raster',
help='path for stacked raster',
default=None)
parser.add_argument('--train_feature', dest='train_feature',
help='path for train feature',
default=None)
parser.add_argument('--output_dir', dest='output_dir',
help='path to store datasets',
default=None)
parser.add_argument('--train_ratio', dest='train_ratio', type=float,
help='ratio of train part', default=0.7)
parser.add_argument('--predict_ratio', dest='predict_ratio', type=float,
help='ratio of predict part, for TF/Keras only', default=0)
parser.add_argument('--pixel_ratio', dest='pixel_ratio', type=float,
help='ratio of extracted pixel values to keep', default=0.5)
parser.add_argument('--cache', dest='cache',
help='Set GDAL raster block cache size, may speed up processing with higher percentage, default is 5% of usable physical RAM',
type=str, default='20%')
args = parser.parse_args()
main(args.rasters_folder, args.stacked_raster, args.train_feature, args.output_dir, args.train_ratio, args.predict_ratio, args.pixel_ratio, args.cache)