In [3]:
import sqlite3
import pandas as pd
import numpy as np
import pickle
import geopandas
from shapely.geometry import Polygon
import uuid

In [4]:
cnx = sqlite3.connect('../us_wildfire_dataset/FPA_FOD_20170508.sqlite')
df = pd.read_sql_query("SELECT DISCOVERY_DATE, CONT_DATE, LATITUDE, LONGITUDE, STATE, FIRE_NAME, FIRE_SIZE_CLASS, FIRE_SIZE,STAT_CAUSE_DESCR FROM fires", cnx)

In [16]:
def filter_raw_data(df, state='CA', min_class=None):

    # drop states
    df_filt = df[df.STATE == 'CA']
    df_filt = df_filt.drop(['STATE'], axis=1)

    # drop fire classes
    if min_class is not None:
        df_filt.FIRE_SIZE_CLASS = df_filt.FIRE_SIZE_CLASS.apply(ord)
        df_filt = df_filt[df_filt.FIRE_SIZE_CLASS >= ord(min_class)]

    # reformat dates
    df_filt.DISCOVERY_DATE = pd.to_datetime(df['DISCOVERY_DATE'], unit='D', origin='julian')
    df_filt.CONT_DATE = pd.to_datetime(df['CONT_DATE'], unit='D', origin='julian')

    # convert coordinates
    df_filt = geopandas.GeoDataFrame(df_filt, geometry=geopandas.points_from_xy(
        df_filt.LONGITUDE, df_filt.LATITUDE))
    df_filt = df_filt.drop(['LONGITUDE'], axis=1)
    df_filt = df_filt.drop(['LATITUDE'], axis=1)
    df_filt.insert(2, 'COORD', df_filt.pop('geometry'))

    # remove missing values
    df_filt = df_filt.dropna()

    # reformat head
    df_filt.columns = [
        'start_date', 'end_date', 'geometry',
        'name', 'size_class', 'size', 'cause'
    ]

    # sort by start dates
    df_filt = df_filt.sort_values(by='start_date')
    df_filt = df_filt.reset_index()

    return df_filt

In [17]:
def extract_geo_fires(df, area):

    # return fires within polygon
    res = df[df.within(area)]
    return res

def build_geo_grid(df, grid_area, square_size, verbose=False):
    bounds = grid_area.bounds

    # calculate number of grids in lat/long directions
    long_steps = int((bounds[2] - bounds[0]) / square_size)
    lat_steps = int((bounds[3] - bounds[1]) / square_size)

    grid_df = []
    prog, total = 0, long_steps * lat_steps
    for i in range(long_steps):
        for j in range(lat_steps):

            # update progress
            prog += 1
            if verbose and prog % 10 == 0:
                print('Progress: {}/{}'.format(prog, total), flush=True)
            
            # get south-west grid square corner
            c_lon = bounds[0] + i * square_size
            c_lat = bounds[1] + j * square_size

            # create grid square
            grid = Polygon([
                (c_lon, c_lat), 
                (c_lon + square_size, c_lat), 
                (c_lon, c_lat + square_size), 
                (c_lon + square_size, c_lat + square_size)
            ])

            fires = extract_geo_fires(df, grid)
            grid_df.append([uuid.uuid4(), grid, fires.index])

    # build grid df
    grid_df = geopandas.GeoDataFrame(grid_df)
    grid_df.columns = ['grid_id', 'grid_square', 'fire_indices']
    return grid_df

In [18]:
# coordinate-square north of San Bernardino/Riverside 
p = Polygon([(-118, 34), (-118, 36), (-116, 34), (-116, 36)])

# filter raw data from dataset and build sexy format
df_filt = filter_raw_data(df)

# df_filt is ordered by start_date with the following columns:
#   'start_date', 'end_date', 'geometry', 'name', 'size_class', 'size', 'cause'

# build df with each grid square area (from grid area `p` with grid square size 0.1) and corresponding fires indices (which indexes complete list of fires in df_filt)
grid_df = build_geo_grid(df_filt, p, 0.5, verbose=True)

# grid_df has the following columns:
#    'grid_id', 'grid_square', 'fire_indices'
#    grid_id: a random uuid to identify the grid square for later
#    grid_square: polygon object of the grid square
#    fire_indices: list of indices of the fires from df_filt within the grid_square

Progress: 10/16
                                 grid_id  \
0   fd724b1e-8081-4e22-a6d4-c5daee378d3b   
1   2d3f0af7-e05c-48a8-a674-5277b7e92717   
2   270f433e-826d-4778-aeba-a3d96a4e3d9e   
3   bd6aa27d-a282-42cd-8f41-d70cb5a7d5a4   
4   a15efee3-a74b-4fc2-bdb5-2e7271f47280   
5   38b24d44-826b-4b7f-8612-4114503378c4   
6   e250087f-5c96-4a9d-8db7-0c3abff05274   
7   2ad8d395-c2a0-426b-a40f-58c9d7894403   
8   0e85ba05-f4de-4f4e-9bf1-df1205a241d7   
9   6856f9e2-e612-46d7-8ad5-f3b59a518245   
10  033e3abc-01f3-4a47-8dda-d0e13b131e34   
11  06f06a98-4f40-478d-b73c-8d1584a3a9c2   
12  b3e3d55d-9990-4c6f-ba71-49d830589836   
13  91262301-05de-4a9f-bb1b-706507e4ed96   
14  82a47734-c708-4e2b-8f99-57a36d0d533f   
15  b2f61e54-1176-42d6-af2e-ad694c2543e4   

                                          grid_square  \
0   POLYGON ((-118 34, -117.5 34, -118 34.5, -117....   
1   POLYGON ((-118 34.5, -117.5 34.5, -118 35, -11...   
2   POLYGON ((-118 35, -117.5 35, -118 35.5, -117....   
3   POL

In [20]:
def label_grid_square(df, grid_df, start_date='1992-01', end_date='2015-12', verbose=False):
    
    # build date range (in months)
    date_range = [str(d) for d in np.arange(
        start_date, 
        end_date, 
        dtype='datetime64[M]'
    )]

    # iterate over all grid squares 
    label_df = []
    prog, total = 0, grid_df.shape[0]
    for i in range(grid_df.shape[0]):
        
        # update progress
        prog += 1
        if verbose and prog % 10 == 0:
            print('Progress: {}/{}'.format(prog, total), flush=True)
                
        # get grid square fires
        id = grid_df.loc[i, 'grid_id']
        grid_square = grid_df.loc[i, 'grid_square']
        fire_indices = list(grid_df.loc[i, 'fire_indices'])
        fires = df.loc[fire_indices, :]
        
        # collect all months in fire date range
        months = []
        for _, row in fires.iterrows(): 
            start = row.start_date.date()
            end = (row.end_date + pd.DateOffset(months=1)).date()
            months.extend([str(d) for d in np.arange(start, end, dtype='datetime64[M]')])
        fire_months = list(set(months))
        
        # label fire months
        month_labels = []
        for month in date_range:
            if month in fire_months: month_labels.append(1)
            else: month_labels.append(0)
        
        # add labels
        labels = [id, grid_square] + month_labels
        label_df.append(labels)
        
    # build label df
    label_df = pd.DataFrame(label_df)
    label_df.columns = ['grid_id', 'grid_square'] + date_range
    return label_df

In [21]:
    
# labels whether there was a fire in each grid square 
label_df = label_grid_square(df_filt, grid_df, verbose=True)

# label_df has the following columns:
#    'grid_id', 'grid_square', 'months ...'
#    grid_id: a random uuid to identify the grid square for later
#    grid_square: polygon object of the grid square
#    indices: the rest of the columns are a label for each month (0 if there was no fire in the grid square during them onth and 1 otherwise)

Progress: 10/16
                                 grid_id  \
0   fd724b1e-8081-4e22-a6d4-c5daee378d3b   
1   2d3f0af7-e05c-48a8-a674-5277b7e92717   
2   270f433e-826d-4778-aeba-a3d96a4e3d9e   
3   bd6aa27d-a282-42cd-8f41-d70cb5a7d5a4   
4   a15efee3-a74b-4fc2-bdb5-2e7271f47280   
5   38b24d44-826b-4b7f-8612-4114503378c4   
6   e250087f-5c96-4a9d-8db7-0c3abff05274   
7   2ad8d395-c2a0-426b-a40f-58c9d7894403   
8   0e85ba05-f4de-4f4e-9bf1-df1205a241d7   
9   6856f9e2-e612-46d7-8ad5-f3b59a518245   
10  033e3abc-01f3-4a47-8dda-d0e13b131e34   
11  06f06a98-4f40-478d-b73c-8d1584a3a9c2   
12  b3e3d55d-9990-4c6f-ba71-49d830589836   
13  91262301-05de-4a9f-bb1b-706507e4ed96   
14  82a47734-c708-4e2b-8f99-57a36d0d533f   
15  b2f61e54-1176-42d6-af2e-ad694c2543e4   

                                          grid_square  1992-01  1992-02  \
0   POLYGON ((-118 34, -117.5 34, -118 34.5, -117....        0        0   
1   POLYGON ((-118 34.5, -117.5 34.5, -118 35, -11...        0        0   
2   POLYGO