In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
import rasterio as rio
from glob import glob
import os, sys
import itertools
import numpy as np


In [2]:
# get y variable names
y_var_dir = '../../../data/y_tiles/'
y_files = glob(y_var_dir + "/*.tif")
y_vars = set([os.path.basename(f).split('_')[0] for f in y_files])
y_vars

{'Arson',
 'Campfire',
 'Children',
 'DebrisBurning',
 'EquipmentUse',
 'Fireworks',
 'Human',
 'Lightning',
 'Miscellaneous',
 'Powerline',
 'Railroad',
 'Smoking',
 'Structure'}

In [6]:
class yDatasetSingleVar(Dataset):
    """Dataset class for ignition types (Y var)"""
    
    def __init__(self, data_dir, transform=None, ig_type='Arson'):
        """ 
        Args:
            data_dir(string): the folder containing the image files
            transform (callable, optional): Optional transform to  be applied to image data
            ig_types (iterable, optional): types of ignition rasters to include 
        """
        assert os.path.exists(data_dir)
        
        self.data_dir = data_dir
        self.transform = transform
        self.ig_types = ig_type # should have a default...
        self.val_months = set([os.path.basename(f).split('_')[-2] for f in y_files if int(os.path.basename(f).split('_')[-2]) <=12])
        self.tile_nums = set([os.path.basename(f).split('_')[-1].split('.tif')[0] for f in y_files])
        self.files = glob(data_dir + "/*{}*.tif".format(ig_type))
        self.month_tile = itertools.product(self.val_months, self.tile_nums)
        self.fi = None
        
        ## could also set up a set of variable names if specifying it in the constructor proves difficult
        #y_vars = set([os.path.basename(f).split('_')[0] for f in y_files])
        
    
    def __getitem__(self, idx):
        
        """
            Files are organized as <ig_type>_<year>_<month>_t<tileNumber>.tif, e.g., Arson_1992_1_t1
            A single dataset needs to be constructed for a given ignition type, year, month, and tile number
        """
        
        fi = self.files[idx] 
        print(fi)
        valid_thresh = -100000
        with rio.open(fi) as src:
            img_arr = src.read(1)
            img_arr[img_arr<0] = 0
            
            ## any thing matching this condition for X vars replace with mean of valid vals
            img_arr[img_arr <= -2.4e+38] = img_arr[img_arr > valid_thresh].mean()
        
        print(img_arr.shape)
        
        if self.transform is not None:
            return self.transform(torch.from_numpy(img_arr))   
        else:
            return torch.from_numpy(img_arr)
        
        
    def __len__(self):
        return len(self.files)

In [7]:
test_single_var = yDatasetSingleVar(y_var_dir, ig_type='Campfire') # variable is Arson by default

In [5]:
for var in y_vars:
    test_dl = yDatasetSingleVar(y_var_dir, ig_type=var)
    print('{}: {}'.format(var, len(test_dl)))

Campfire: 43201
Children: 43203
Railroad: 43200
EquipmentUse: 43201
Powerline: 43200
Smoking: 43200
Fireworks: 43200
Human: 43200
Miscellaneous: 43200
Structure: 43200
Lightning: 43200
DebrisBurning: 43200
Arson: 43200


In [8]:
## looks like most y-vars have 43200 files, but Children, Campfire, and Equipment have more.
## try to find out the filenames which are not like the other variables!
equipment = glob(y_var_dir + "/{}*.tif".format('Equipment'))
children = glob(y_var_dir + "/{}*.tif".format('Children'))
campfire = glob(y_var_dir + "/{}*.tif".format('Campfire'))
base = glob(y_var_dir + "/{}*.tif".format('Arson'))

print(equipment[0])
print(children[0])
print(campfire[0])
print(base[0])

../../../data/y_tiles\EquipmentUse_1992_10_t1.tif
../../../data/y_tiles\Children_1992_10_t1.tif
../../../data/y_tiles\Campfire_1992_10_t1.tif
../../../data/y_tiles\Arson_1992_10_t1.tif


In [9]:
# get just the year_month_tile.tif
equip = ['_'.join(f.split('_')[-3:]) for f in equipment]
child = ['_'.join(f.split('_')[-3:]) for f in children]
camp = ['_'.join(f.split('_')[-3:]) for f in campfire]
b = ['_'.join(f.split('_')[-3:]) for f in base]

In [10]:
print('equipment use difference')
print(np.setdiff1d(equip, b))

print('\nchildren difference')
print(np.setdiff1d(child, b))

print('\ncampfire difference')
print(np.setdiff1d(camp, b))

equipment use difference
['tiles\\EquipmentUse_2013_t82.tif']

children difference
['tiles\\Children_2008_t4.tif' 'tiles\\Children_2008_t8.tif'
 'tiles\\Children_2013_t1.tif']

campfire difference
['tiles\\Campfire_2013_t87.tif']


In [None]:
temp_list = []
for ig_type in ['Arson', 'Campfire']:
    files = glob(y_var_dir + "/{}*.tif".format(ig_type))
    print('{}: {}'.format(ig_type, len(files)))
    temp_list.append(files)
    
print(len(temp_list))

In [22]:
class yDatasetMultiVar(Dataset):
    """Dataset class for ignition types (Y var)"""
    
    def __init__(self, data_dir, transform=None, ig_types=['Arson', 'Campfire']):
        """ 
        Args:
            data_dir(string): the folder containing the image files
            transform (callable, optional): Optional transform to  be applied to image data
            ig_types (iterable, optional): types of ignition rasters to include 
        """
        
        # some sanity checks
        assert os.path.exists(data_dir)
        assert len(ig_types) > 1
        val_ig_types = set([os.path.basename(f).split('_')[0] for f in glob(data_dir + '/*.tif')])
        for v in val_ig_types:
            assert v in list(val_ig_types)
        
        # initialize some attributes
        self.data_dir = data_dir
        self.transform = transform
        self.ig_types = ig_types # should have a default...
        self.val_months = set([os.path.basename(f).split('_')[-2] for f in y_files if int(os.path.basename(f).split('_')[-2]) <=12])
        
        # the filenaming is not consistent to get the years from the filename :(
        #self.val_years = set([os.path.basename(f).split('_')[-3] for f in y_files if (len(os.path.basename(f).split('_')) > 3)])
        self.val_years = list(range(1992,2016)) # 2015, since it is open-ended on the right
        
        self.tile_nums = set([os.path.basename(f).split('_')[-1].split('.tif')[0] for f in y_files])
        
        # get the files for ig_type[0]... need to assert that num_files for each ig_type is equal
        self.files = glob(data_dir + "/*{}*.tif".format(ig_types[0]))
        temp_list = []
        for ig_type in ig_types:
            files = glob(data_dir + "/{}*.tif".format(ig_type))
            temp_list.append(files)
        
        # this should ensure that the file numbers are equal
        for flist in temp_list[1:]:
            assert len(temp_list[0]) == len(flist)        
        
        # create an iterable for the __getitem__ method
        self.var_year_month_tile = list(itertools.product(self.ig_types, self.val_years, self.val_months, self.tile_nums))
        self.year_month_tile = list(itertools.product(self.val_years, self.val_months, self.tile_nums))
        
    
    def __getitem__(self, idx):
        
        """
            Files are organized as <ig_type>_<year>_<month>_t<tileNumber>.tif, e.g., Arson_1992_1_t1
            A single dataset needs to be constructed for a given ignition type, year, month, and tile number
        """
        
        #var, year, month, tile = self.var_year_month_tile[idx]
        year, month, tile = self.year_month_tile[idx]
        files = [os.path.join(self.data_dir, '{}_{}_{}_{}.tif'.format(var, year, month, tile)) for var in self.ig_types]
        arrs = []
        
        print(files)
        
        valid_thresh = -100000
        for fi in files:
            with rio.open(fi) as src:
                arr = src.read(1)
                
                ## any thing matching this condition for X vars replace with mean of valid vals
                arr[arr <= -2.4e+38] = arr[arr > valid_thresh].mean()
            
            arrs.append(arr)
        img_arr = np.array(arrs)
        
        if self.transform is not None:
            return self.transform(torch.from_numpy(img_arr))   
        else:
            return torch.from_numpy(img_arr)
        
        
    def __len__(self):
        return len(self.files)

In [23]:
# Campfire: 43201
# Children: 43203
# Railroad: 43200
# EquipmentUse: 43201
# Powerline: 43200
# Smoking: 43200
# Fireworks: 43200
# Human: 43200
# Miscellaneous: 43200
# Structure: 43200
# Lightning: 43200
# DebrisBurning: 43200
# Arson: 43200

test_multi_var = yDatasetMultiVar(y_var_dir, ig_types=['Arson', 'Railroad', 'Powerline']) # variable is Arson by default


43200
43200


In [24]:
test_multi_var[0].shape

['../../../data/y_tiles/Arson_1992_1_t88.tif', '../../../data/y_tiles/Railroad_1992_1_t88.tif', '../../../data/y_tiles/Powerline_1992_1_t88.tif']


  ret = ret.dtype.type(ret / rcount)


torch.Size([3, 448, 448])