In [1]:
import os
import sys
import numpy as np
import json
from scipy import misc
from matplotlib import pyplot as plt
sys.path.append('..')

In [2]:
from modis_utils.misc import get_im
from modis_utils.misc import cache_data, restore_data

# Changeable parameters

In [3]:
# --- Run configuration: edit these values to change what gets preprocessed ---
preprocessed_dir = 'preprocessed_data/'   # root folder for every output stage

modis_product = 'ALL'
data_dir = '../raw_data/{}'.format(modis_product)   # raw inputs of that product

reservoir_id = 0
year_range = (2002, 2017)   # (first year, last year), both ends are processed

# Images whose file name contains this band name are processed
# (alternative used before: 'blue').
used_band = 'NDVI'

In [4]:
# Step, in days, between two consecutive day-of-year values that are scanned:
# 8 for the 'ALL' product, 16 for any other product.
day_period = 8 if modis_product == 'ALL' else 16

# Number of day-of-year slots scanned per year (day 1, 1 + period, ...).
n_data_per_year = 1 + 365 // day_period

# Change the fill value from -3000 to -2001

In [5]:
# Output root of the fill-value stage: <preprocessed_dir>/<modis_product>/change_fill_value
dest_dir = os.path.join(preprocessed_dir, modis_product, 'change_fill_value')

In [6]:
!ls ..

boundary_method  model_parallelism_tf  modis_utils  raw_data


In [7]:
# Replace the -3000 fill value with -2001 in every available image of the
# selected band and cache the result as
#   dest_dir/<reservoir>/<year>/<yearDDD>/<image name>.dat
# Dates that have no folder or no image of the band are skipped; any other
# failure is reported and the loop carries on with the next date.
for reservoir_index in [reservoir_id]:
    data_dir_1 = os.path.join(data_dir, str(reservoir_index))
    for year in range(year_range[0], year_range[1] + 1):
        for d in range(n_data_per_year):
            day = d*day_period + 1
            date_tag = str(year) + str(day).zfill(3)   # e.g. 2002001
            current_dir = os.path.join(data_dir_1, str(year), date_tag)

            # Missing acquisitions are expected: skip them explicitly rather
            # than relying on an exception being swallowed.
            if not os.path.isdir(current_dir):
                continue
            band_files = [name for name in os.listdir(current_dir) if used_band in name]
            if not band_files:
                continue
            img_dir = band_files[0]
            path = os.path.join(current_dir, img_dir)

            try:
                img = get_im(path)
                img[img == -3000] = -2001

                cur_dest_dir = os.path.join(dest_dir, str(reservoir_index), str(year), date_tag)
                os.makedirs(cur_dest_dir, exist_ok=True)
                # Swap the image extension for '.dat'
                out_name = os.path.splitext(img_dir)[0] + '.dat'
                cache_data(img, os.path.join(cur_dest_dir, out_name))
            except Exception as err:
                # Best effort: keep processing the remaining dates, but make the
                # failure visible. `Exception` (unlike a bare `except`) lets
                # KeyboardInterrupt through, so the loop can still be stopped.
                print('Skipped {}: {!r}'.format(path, err))

# Min-max scaling (scale values to the 0-1 range)

In [10]:
# Min-max stage: read the fill-value-corrected images, write the 0-1 scaled ones.
# NOTE: data_dir and dest_dir are rebound here, so the cells of the previous
# stage would read/write the wrong folders if re-run after this cell.
data_dir, dest_dir = (
    os.path.join(preprocessed_dir, modis_product, stage)
    for stage in ('change_fill_value', 'zero_one')
)

In [11]:
# Bounds used by the min-max scaling: min_val maps to 0 and max_val maps to 1.
min_val, max_val = -2001, 10000

In [12]:
# Scale every cached image of the selected band with
#   (img - min_val) / (max_val - min_val)
# and cache the result under dest_dir/<reservoir>/<year>/<yearDDD>/ using the
# same file name. Dates that have no folder or no file of the band are
# skipped; any other failure is reported and the loop carries on.
for reservoir_index in [reservoir_id]:
    data_dir_1 = os.path.join(data_dir, str(reservoir_index))
    for year in range(year_range[0], year_range[1] + 1):
        for d in range(n_data_per_year):
            day = d*day_period + 1
            date_tag = str(year) + str(day).zfill(3)   # e.g. 2002001
            current_dir = os.path.join(data_dir_1, str(year), date_tag)

            # Missing dates are expected: skip them explicitly rather than
            # relying on an exception being swallowed.
            if not os.path.isdir(current_dir):
                continue
            band_files = [name for name in os.listdir(current_dir) if used_band in name]
            if not band_files:
                continue
            img_dir = band_files[0]
            path = os.path.join(current_dir, img_dir)

            try:
                img = restore_data(path)
                normalized_img = (img - min_val)*1.0 / (max_val - min_val)

                cur_dest_dir = os.path.join(dest_dir, str(reservoir_index), str(year), date_tag)
                os.makedirs(cur_dest_dir, exist_ok=True)
                cache_data(normalized_img, os.path.join(cur_dest_dir, img_dir))
            except Exception as err:
                # Best effort: keep processing the remaining dates, but make the
                # failure visible. `Exception` (unlike a bare `except`) lets
                # KeyboardInterrupt through, so the loop can still be stopped.
                print('Skipped {}: {!r}'.format(path, err))

# Z-Score normalization

In [13]:
# (first year, last year) of the images used to estimate the mean and std;
# both ends are included.
train_year_range = (2002, 2015)

In [14]:
# Z-score stage: read the 0-1 scaled images, write the standardized ones.
# NOTE: data_dir and dest_dir are rebound here, so the cells of the earlier
# stages would read/write the wrong folders if re-run after this cell.
data_dir, dest_dir = (
    os.path.join(preprocessed_dir, modis_product, stage)
    for stage in ('zero_one', 'normalized')
)

# Compute the mean and std of the training set

In [15]:
# Load every training-year image of the selected band, stack them along a new
# leading axis and compute one global mean and std over all of their values.
list_imgs = []
for reservoir_index in [reservoir_id]:
    data_dir_1 = os.path.join(data_dir, str(reservoir_index))
    for year in range(train_year_range[0], train_year_range[1] + 1):
        for d in range(n_data_per_year):
            day = d*day_period + 1
            current_dir = os.path.join(data_dir_1, str(year), str(year) + str(day).zfill(3))

            # Missing dates are expected: skip them explicitly rather than
            # relying on an exception being swallowed.
            if not os.path.isdir(current_dir):
                continue
            band_files = [name for name in os.listdir(current_dir) if used_band in name]
            if not band_files:
                continue
            img_dir = band_files[0]
            path = os.path.join(current_dir, img_dir)

            try:
                img = restore_data(path)
                img = np.expand_dims(img, axis=0)
            except Exception as err:
                # An unreadable image is left out of the statistics, but the
                # omission is reported instead of going unnoticed.
                print('Skipped {}: {!r}'.format(path, err))
                continue
            list_imgs.append(img)

# Fail with a clear message instead of the generic error np.vstack raises on
# an empty list.
if not list_imgs:
    raise ValueError('No training images found under {}'.format(data_dir))

imgs = np.vstack(list_imgs)
# Sanity checks on the loaded values before computing the statistics
assert imgs.min() == 0, 'expected the minimum of the loaded images to be exactly 0'
assert imgs.max() < 1, 'expected every loaded value to be strictly below 1'
mean = imgs.mean()
std = imgs.std()

In [16]:
# Display the training-set mean and std.
print(mean, std)

0.5133767592612672 0.2699482407312094


In [17]:
# Persist the (mean, std) pair; the relative path means the file lands in the
# current working directory, not under preprocessed_dir.
cache_data((mean, std), 'mean_std.dat')

In [18]:
# Create the output root of the z-score stage. exist_ok=True makes the cell
# safe to re-run and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(dest_dir, exist_ok=True)

In [19]:
# Standardize every 0-1 scaled image with the training-set statistics,
#   (img - mean) / std
# and cache it as dest_dir/<reservoir>/<yearDDD>.dat. Dates that have no folder
# or no file of the band are skipped; any other failure is reported and the
# loop carries on.
for reservoir_index in [reservoir_id]:
    data_dir_1 = os.path.join(data_dir, str(reservoir_index))
    for year in range(year_range[0], year_range[1] + 1):
        for d in range(n_data_per_year):
            day = d*day_period + 1
            current_dir = os.path.join(data_dir_1, str(year), str(year) + str(day).zfill(3))

            # Missing dates are expected: skip them explicitly rather than
            # relying on an exception being swallowed.
            if not os.path.isdir(current_dir):
                continue
            band_files = [name for name in os.listdir(current_dir) if used_band in name]
            if not band_files:
                continue
            img_dir = band_files[0]
            path = os.path.join(current_dir, img_dir)

            try:
                img = restore_data(path)
                normalized_img = (img - mean) / std

                out_path = os.path.join(dest_dir, str(reservoir_index), '{}{:03}.dat'.format(year, day))
                # Only dest_dir itself is created elsewhere in the notebook.
                # Whether cache_data creates missing parent folders is not
                # visible here (TODO confirm), so make sure the per-reservoir
                # folder exists before writing.
                os.makedirs(os.path.dirname(out_path), exist_ok=True)
                cache_data(normalized_img, out_path)
            except Exception as err:
                # Best effort: keep processing the remaining dates, but make the
                # failure visible. `Exception` (unlike a bare `except`) lets
                # KeyboardInterrupt through, so the loop can still be stopped.
                print('Skipped {}: {!r}'.format(path, err))

In [20]:
# Sanity check: reload the standardized training-year images and print their
# value range followed by their overall mean and std.
list_imgs = []
for reservoir_index in [reservoir_id]:
    data_dir_1 = os.path.join(dest_dir, str(reservoir_index))
    for year in range(train_year_range[0], train_year_range[1] + 1):
        for d in range(n_data_per_year):
            day = 1 + d*day_period
            path = os.path.join(data_dir_1, '%d%03d.dat' % (year, day))
            # Dates without a cached file are simply not part of the check
            if not os.path.isfile(path):
                continue
            img = np.expand_dims(restore_data(path), axis=0)
            list_imgs.append(img)

imgs = np.vstack(list_imgs)
mean_1, std_1 = imgs.mean(), imgs.std()
print(imgs.min(), imgs.max())
print(mean_1, std_1)

-1.9017599739516085 1.8014191678510223
-5.6648648225275966e-15 1.0000000000000042
