In [1]:
import sqlite3
import pandas as pd
import numpy as np
import pickle
import geopandas
from shapely.geometry import Polygon
import uuid

In [2]:
cnx = sqlite3.connect('../us_wildfire_dataset/FPA_FOD_20170508.sqlite')
df = pd.read_sql_query("SELECT DISCOVERY_DATE, CONT_DATE, LATITUDE, LONGITUDE, STATE, FIRE_NAME, FIRE_SIZE_CLASS, FIRE_SIZE,STAT_CAUSE_DESCR FROM fires", cnx)

In [3]:
def filter_raw_data(df, state='CA', min_class=None):

    # drop states
    df_filt = df[df.STATE == 'CA']
    df_filt = df_filt.drop(['STATE'], axis=1)

    # drop fire classes
    if min_class is not None:
        df_filt.FIRE_SIZE_CLASS = df_filt.FIRE_SIZE_CLASS.apply(ord)
        df_filt = df_filt[df_filt.FIRE_SIZE_CLASS >= ord(min_class)]

    # reformat dates
    df_filt.DISCOVERY_DATE = pd.to_datetime(df['DISCOVERY_DATE'], unit='D', origin='julian')
    df_filt.CONT_DATE = pd.to_datetime(df['CONT_DATE'], unit='D', origin='julian')

    # convert coordinates
    df_filt = geopandas.GeoDataFrame(df_filt, geometry=geopandas.points_from_xy(
        df_filt.LONGITUDE, df_filt.LATITUDE))
    df_filt = df_filt.drop(['LONGITUDE'], axis=1)
    df_filt = df_filt.drop(['LATITUDE'], axis=1)
    df_filt.insert(2, 'COORD', df_filt.pop('geometry'))

    # remove missing values
    df_filt = df_filt.dropna()

    # reformat head
    df_filt.columns = [
        'start_date', 'end_date', 'geometry',
        'name', 'size_class', 'size', 'cause'
    ]

    # sort by start dates
    df_filt = df_filt.sort_values(by='start_date')
    df_filt = df_filt.reset_index()

    return df_filt

In [4]:
def extract_geo_fires(df, area):

    # return fires within polygon
    res = df[df.within(area)]
    return res


def build_geo_grid(df, grid_area, square_size, verbose=False):
    bounds = grid_area.bounds

    # calculate number of grids in lat/long directions
    long_steps = int((bounds[2] - bounds[0]) / square_size)
    lat_steps = int((bounds[3] - bounds[1]) / square_size)

    grid_df = []
    prog, total = 0, long_steps * lat_steps
    for i in range(long_steps):
        for j in range(lat_steps):

            # update progress
            prog += 1
            if verbose and prog % 10 == 0:
                print('Progress: {}/{}'.format(prog, total), flush=True)
            
            # get south-east grid square corner
            c_lon = bounds[0] + i * square_size
            c_lat = bounds[1] + j * square_size

            # create grid square
            grid = Polygon([
                (c_lon, c_lat), 
                (c_lon + square_size, c_lat), 
                (c_lon, c_lat + square_size), 
                (c_lon + square_size, c_lat + square_size)
            ])

            fires = extract_geo_fires(df, grid)
            grid_df.append([uuid.uuid4(), grid, fires.index])

    # build grid df
    grid_df = geopandas.GeoDataFrame(grid_df)
    grid_df.columns = ['grid_id', 'grid_square', 'fire_indices']
    return grid_df

In [5]:
# coordinate-square north of San Bernardino/Riverside 
p = Polygon([(-118, 34), (-118, 36), (-116, 34), (-116, 36)])

# filter raw data from dataset and build sexy format
df_filt = filter_raw_data(df)

# df_filt is ordered by start_date with the following columns:
#   'start_date', 'end_date', 'geometry', 'name', 'size_class', 'size', 'cause'

# build df with each grid square area (from grid area `p` with grid square size 0.1) and corresponding fires indices (which indexes complete list of fires in df_filt)
grid_df = build_geo_grid(df_filt, p, 0.1, verbose=True)

# grid_df has the following columns:
#    'grid_id', 'grid_square', 'fire_indices'
#    grid_id: a random uuid to identify the grid square for later
#    grid_square: polygon object of the grid square
#    fire_indices: list of indices of the fires from df_filt within the grid_square

Progress: 10/400
Progress: 20/400
Progress: 30/400
Progress: 40/400
Progress: 50/400
Progress: 60/400
Progress: 70/400
Progress: 80/400
Progress: 90/400
Progress: 100/400
Progress: 110/400
Progress: 120/400
Progress: 130/400
Progress: 140/400
Progress: 150/400
Progress: 160/400
Progress: 170/400
Progress: 180/400
Progress: 190/400
Progress: 200/400
Progress: 210/400
Progress: 220/400
Progress: 230/400
Progress: 240/400
Progress: 250/400
Progress: 260/400
Progress: 270/400
Progress: 280/400
Progress: 290/400
Progress: 300/400
Progress: 310/400
Progress: 320/400
Progress: 330/400
Progress: 340/400
Progress: 350/400
Progress: 360/400
Progress: 370/400
Progress: 380/400
Progress: 390/400
Progress: 400/400


In [6]:
def label_grid_square(df, grid_df, start_date='1992-01', end_date='2015-12', verbose=False):
    
    # build date range (in months)
    date_range = [str(d) for d in np.arange(
        start_date, 
        end_date, 
        dtype='datetime64[M]'
    )]    

    # iterate over all grid squares 
    label_df = []
    prog, total = 0, grid_df.shape[0]
    for i in range(grid_df.shape[0]):
        
        # update progress
        prog += 1
        if verbose and prog % 10 == 0:
            print('Progress: {}/{}'.format(prog, total), flush=True)
                
        # get grid square fires
        id = grid_df.loc[i, 'grid_id']
        grid_square = grid_df.loc[i, 'grid_square']
        fire_indices = list(grid_df.loc[i, 'fire_indices'])
        fires = df.loc[fire_indices, :]
        
        # collect all months in fire date range
        months = []
        for _, row in fires.iterrows(): 
            start = row.start_date.date()
            end = (row.end_date + pd.DateOffset(months=1)).date()
            months.extend([str(d) for d in np.arange(start, end, dtype='datetime64[M]')])
        fire_months = list(set(months))
        
        # label fire months
        month_labels = []
        for month in date_range:
            if month in fire_months: month_labels.append(1)
            else: month_labels.append(0)
        
        # add labels
        labels = [id, grid_square] + month_labels
        label_df.append(labels)
        
    # build label df
    label_df = pd.DataFrame(label_df)
    label_df.columns = ['grid_id', 'grid_square'] + date_range
    return label_df

In [7]:
    
# labels whether there was a fire in each grid square 
label_df = label_grid_square(df_filt, grid_df, verbose=True)

# label_df has the following columns:
#    'grid_id', 'grid_square', 'months ...'
#    grid_id: a random uuid to identify the grid square for later
#    grid_square: polygon object of the grid square
#    indices: the rest of the columns are a label for each month (0 if there was no fire in the grid square during them onth and 1 otherwise)

Progress: 10/400
Progress: 20/400
Progress: 30/400
Progress: 40/400
Progress: 50/400
Progress: 60/400
Progress: 70/400
Progress: 80/400
Progress: 90/400
Progress: 100/400
Progress: 110/400
Progress: 120/400
Progress: 130/400
Progress: 140/400
Progress: 150/400
Progress: 160/400
Progress: 170/400
Progress: 180/400
Progress: 190/400
Progress: 200/400
Progress: 210/400
Progress: 220/400
Progress: 230/400
Progress: 240/400
Progress: 250/400
Progress: 260/400
Progress: 270/400
Progress: 280/400
Progress: 290/400
Progress: 300/400
Progress: 310/400
Progress: 320/400
Progress: 330/400
Progress: 340/400
Progress: 350/400
Progress: 360/400
Progress: 370/400
Progress: 380/400
Progress: 390/400
Progress: 400/400


In [8]:
print(label_df)

grid_id  \
0    5bb1229d-69c6-4eec-acb9-09e20c15bdd3   
1    b34b04ff-8d36-4550-9c28-74a58a267826   
2    5f55f152-a875-4ede-aac0-e0ff8528f5e5   
3    66a34d7e-95cb-4803-8be1-0927f9e905db   
4    96d4679b-1297-482c-bc07-a20d641eb1b9   
..                                    ...   
395  7d11cfb8-2c8b-4d46-813b-0ec92f08b8e3   
396  23db8c5e-8e5c-4e5a-9a99-9f5db9e34835   
397  4e69582d-7d1a-46b9-a9d9-5dbbc4bca259   
398  729f8d33-c7b0-4690-879a-46a4a68a69c2   
399  0fdd78f5-797a-47fa-970f-6affef59113b   

                                           grid_square  1992-01  1992-02  \
0    POLYGON ((-118 34, -117.9 34, -118 34.1, -117....        0        0   
1    POLYGON ((-118 34.1, -117.9 34.1, -118 34.2, -...        0        0   
2    POLYGON ((-118 34.2, -117.9 34.2, -118 34.3, -...        0        0   
3    POLYGON ((-118 34.3, -117.9 34.3, -118 34.4, -...        0        0   
4    POLYGON ((-118 34.4, -117.9 34.4, -118 34.5, -...        0        0   
..                                   

In [13]:
output = []
num_zeros = 0
num_subsequent_1s = 0
total_iters = 0
for index, row in label_df.iterrows():
    for rowIndex, col in enumerate(row[2:-1]):
        # if not on fire
        total_iters += 1
        if col == 0:
            num_zeros += 1
            # include implicit conversion to tuple of bounds instead of polygon shape; will make working with earth engine easier
            output.append((row[0], row[1].bounds, label_df.columns[rowIndex + 2], row[rowIndex + 1 + 2]))
            if row[rowIndex + 1 + 2] == 1:
                num_subsequent_1s += 1

In [14]:
print(num_zeros)
print(num_subsequent_1s)
print(total_iters)

112366
1563
114400


In [15]:
# save the corresponding data structure to disk
import pickle
with open("../us_wildfire_dataset/labelled_temporal_polygons.pkl", "wb") as f:
    pickle.dump(output, f, protocol=pickle.HIGHEST_PROTOCOL)

In [16]:
import ee
ee.Authenticate()
ee.Initialize()


Successfully saved authorization token.


In [21]:
for item in output[:1]:
    uuid, coords, date, label = item
    # we want to grab the polygon, and download the associated image files
    landsat = ee.Image('LANDSAT/LC08/C01/T1_TOA/LC08_123032_20140515').select(['B1', 'B2', 'B3'])
    geometry = ee.Geometry.Rectangle(list(coords))
    print(geometry)
    task = ee.batch.Export.image.toDrive(image=landsat, folder="world_images", region=geometry, scale=30, description="Hello world, I am a photo of you", fileFormat="TFRecord")
    task.start()

ee.Geometry({
  "type": "Polygon",
  "coordinates": [
    [
      [
        -118.0,
        34.1
      ],
      [
        -118.0,
        34.0
      ],
      [
        -117.9,
        34.0
      ],
      [
        -117.9,
        34.1
      ]
    ]
  ],
  "evenOdd": true
})


In [20]:
task.status()

{'state': 'FAILED',
 'description': 'Hello world, I am a photo of you',
 'creation_timestamp_ms': 1589934931057,
 'update_timestamp_ms': 1589934942015,
 'start_timestamp_ms': 1589934939556,
 'task_type': 'EXPORT_IMAGE',
 'error_message': 'Patch dimensions must be fully specified.',
 'id': 'MJ53ZGTQHMGGD4W2AK2HLQN4',
 'name': 'projects/earthengine-legacy/operations/MJ53ZGTQHMGGD4W2AK2HLQN4'}