# Preprocessing the Wildfires File

In [11]:
import sqlite3
import pandas as pd
import numpy as np
import pickle

In [12]:
# Open the wildfire SQLite database and pull only the columns this notebook uses.
# NOTE(review): the connection is left open after the read, as in the original.
fires_query = "SELECT DISCOVERY_DATE, LATITUDE, LONGITUDE, FIRE_SIZE, STATE FROM fires"
cnx = sqlite3.connect('../us_wildfire_dataset/FPA_FOD_20170508.sqlite')
df = pd.read_sql_query(fires_query, cnx)

In [13]:
# sanity check: (rows, columns) of the frame returned by the SQL query
df.shape

(1880465, 5)

In [43]:
# Filter California fires. The .copy() gives us an independent frame, so the
# column assignment below does not write into a slice of the full dataset
# (which would trigger pandas' SettingWithCopyWarning).
df = df[df["STATE"] == "CA"].copy()

# Convert the date to something more intelligible: DISCOVERY_DATE is interpreted
# as a day count from the Julian origin and turned into a proper timestamp.
# The explicit column check keeps the cell safe to re-run (the raw column is
# dropped on the first run) without swallowing unrelated KeyErrors.
if "DISCOVERY_DATE" in df.columns:
    df["Date"] = pd.to_datetime(df["DISCOVERY_DATE"], unit="D", origin="julian")
    # `columns=` keyword: the positional `axis` argument is rejected by pandas >= 2.0
    df = df.drop(columns="DISCOVERY_DATE")
else:
    print("You silly chook; you've re-run this cell!")

You silly chook; you've re-run this cell!


In [44]:
# display the current fires frame (pandas truncates the rendering of long frames)
df

Unnamed: 0,LATITUDE,LONGITUDE,FIRE_SIZE,STATE,Date
0,40.036944,-121.005833,0.10,CA,2005-02-02
1,38.933056,-120.404444,0.25,CA,2004-05-12
2,38.984167,-120.735556,0.10,CA,2004-05-31
3,38.559167,-119.913333,0.10,CA,2004-06-28
4,38.559167,-119.933056,0.10,CA,2004-06-28
5,38.635278,-120.103611,0.10,CA,2004-06-30
6,38.688333,-120.153333,0.10,CA,2004-07-01
7,40.968056,-122.433889,0.80,CA,2005-03-08
8,41.233611,-122.283333,1.00,CA,2005-03-15
9,38.548333,-120.149167,0.10,CA,2004-07-01


In [45]:
# Note: California lies between roughly 32 and 42 degrees north
# and between roughly 114 and 126 degrees west.

# Given a range of latitudes and longitudes, a fire-database dataframe, and a desired resolution, this function
# linearly partitions the geodata into a grid of smaller cells.
def extract_localized_wildfires(minLat, maxLat, minLong, maxLong, df, lat_resolution=None, long_resolution=None, resolution=None):
    """Partition the fires in a lat/long bounding box into a regular grid of cells.

    The latitude span [minLat, maxLat] is cut into ``lat_resolution`` equal bands
    and the longitude span [minLong, maxLong] into ``long_resolution`` equal bands.

    Parameters
    ----------
    minLat, maxLat : float
        Latitude bounds of the region. Rows are numbered from ``minLat`` upward.
    minLong, maxLong : float
        Longitude bounds of the region. Columns are numbered from ``maxLong``
        downward, so column 0 is the band that touches ``maxLong``.
    df : pandas.DataFrame
        Frame with at least ``"LATITUDE"`` and ``"LONGITUDE"`` columns.
    lat_resolution, long_resolution : int, optional
        Number of grid rows / columns. Both must be given unless ``resolution`` is.
    resolution : int, optional
        Shorthand for a square grid; when given it overrides both
        ``lat_resolution`` and ``long_resolution``.

    Returns
    -------
    dict
        Maps ``(row, col)`` to the sub-frame of ``df`` whose points fall in that
        cell. Each cell is half-open: latitude in ``[low, high)`` and longitude
        in ``(low, high]``. Points lying exactly on ``maxLat`` or exactly on
        ``minLong`` therefore belong to no cell.

    Raises
    ------
    AssertionError
        If neither ``resolution`` nor both per-axis resolutions are supplied.
    """
    assert (lat_resolution is not None and long_resolution is not None) or resolution is not None, \
        "pass either `resolution` or both `lat_resolution` and `long_resolution`"
    if resolution is not None:
        long_resolution = resolution
        lat_resolution = resolution

    # n bands need n + 1 edges; longitude edges run from maxLong down to minLong
    longitude_range = np.linspace(maxLong, minLong, long_resolution + 1)
    latitude_range = np.linspace(minLat, maxLat, lat_resolution + 1)

    datapoints = {}
    # iterate over the per-axis resolutions (not `resolution`, which may be None)
    for row in range(lat_resolution):
        # the latitude band is the same for every column in this row, so filter once
        in_lat_band = (df["LATITUDE"] >= latitude_range[row]) & (df["LATITUDE"] < latitude_range[row + 1])
        lat_band = df[in_lat_band]
        for col in range(long_resolution):
            in_long_band = (lat_band["LONGITUDE"] <= longitude_range[col]) & (lat_band["LONGITUDE"] > longitude_range[col + 1])
            datapoints[(row, col)] = lat_band[in_long_band]

    return datapoints

# Try the function on a toy example: a 15x15 grid over the 34-36N, 116-118W box
# (California just northeast of San Bernardino), printing a single cell of it.
toy_grid = extract_localized_wildfires(34, 36, -118, -116, df, resolution=15)
print(toy_grid[(0, 2)])


          LATITUDE   LONGITUDE  FIRE_SIZE STATE       Date
217048   34.083300 -116.350800      807.0    CA 1998-06-23
217672   34.083300 -116.363300        0.1    CA 1999-07-10
217716   34.083469 -116.309469        0.1    CA 1999-07-28
243038   34.000010 -116.267540        0.3    CA 1992-05-05
243039   34.066710 -116.317540        0.3    CA 1992-05-05
243040   34.005600 -116.343740        0.5    CA 1992-05-05
243064   34.114510 -116.398350        8.0    CA 1994-06-09
243066   34.066440 -116.396900        0.1    CA 1994-06-11
243076   34.116710 -116.300840        1.5    CA 1995-06-07
243083   34.058410 -116.304740     5521.0    CA 1995-07-31
243094   34.025010 -116.325850        0.4    CA 1996-08-23
243104   34.071710 -116.392740        0.1    CA 1998-07-24
243116   34.063610 -116.350540        0.2    CA 1999-05-27
243117   34.002510 -116.295840    14000.0    CA 1999-05-27
243118   34.002510 -116.295840        5.0    CA 1999-05-27
314559   34.097981 -116.326811        2.0    CA 2001-07-

In [41]:
# Persist the current fires frame so later notebooks can load it directly.
ca_fires_path = "../us_wildfire_dataset/ca_fires_raw.pkl"
with open(ca_fires_path, mode="wb") as pkl_file:
    pickle.dump(df, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)