# Preprocessing the Wildfires File

In [11]:
import sqlite3
import pandas as pd
import numpy as np
import pickle

In [12]:
# Read five columns of the `fires` table from the SQLite database into a DataFrame.
cnx = sqlite3.connect('../us_wildfire_dataset/FPA_FOD_20170508.sqlite')
try:
    df = pd.read_sql_query("SELECT DISCOVERY_DATE, LATITUDE, LONGITUDE, FIRE_SIZE, STATE FROM fires", cnx)
finally:
    # sqlite3 connections are not closed automatically; release the database
    # file once the query result is in memory (or if the query fails).
    cnx.close()

In [13]:
# Sanity check: (row count, column count) of the frame loaded from the database.
df.shape

(1880465, 5)

In [14]:
# Keep only the rows whose STATE column is "CA" (California).
is_california = df["STATE"].eq("CA")
df = df.loc[is_california]

In [15]:
# Preview the first rows only; rendering the whole filtered frame is unnecessary.
df.head(10)

Unnamed: 0,DISCOVERY_DATE,LATITUDE,LONGITUDE,FIRE_SIZE,STATE
0,2453403.5,40.036944,-121.005833,0.10,CA
1,2453137.5,38.933056,-120.404444,0.25,CA
2,2453156.5,38.984167,-120.735556,0.10,CA
3,2453184.5,38.559167,-119.913333,0.10,CA
4,2453184.5,38.559167,-119.933056,0.10,CA
5,2453186.5,38.635278,-120.103611,0.10,CA
6,2453187.5,38.688333,-120.153333,0.10,CA
7,2453437.5,40.968056,-122.433889,0.80,CA
8,2453444.5,41.233611,-122.283333,1.00,CA
9,2453187.5,38.548333,-120.149167,0.10,CA


In [30]:
# Note: California lies between 32 and 42 degrees north latitude
# and between 126 and 114 degrees west longitude (i.e. longitudes -126 to -114).


# given a range of latitudes and longitudes, a fire database dataframe, and a desired resolution, this function
# will linearly partition the geodata into smaller grids
def extract_localized_wildfires(minLat, maxLat, minLong, maxLong, df, lat_resolution=None, long_resolution=None, resolution=None):
    """Split the fires inside a latitude/longitude box into a regular grid of cells.

    The box is divided linearly into ``lat_resolution`` rows and
    ``long_resolution`` columns. Row 0 starts at ``minLat`` and rows advance
    towards ``maxLat``; column 0 starts at ``maxLong`` and columns advance
    towards ``minLong``.

    Parameters
    ----------
    minLat, maxLat : float
        Latitude bounds of the box.
    minLong, maxLong : float
        Longitude bounds of the box.
    df : pandas.DataFrame
        Frame with ``"LATITUDE"`` and ``"LONGITUDE"`` columns.
    lat_resolution, long_resolution : int, optional
        Number of rows / columns in the grid. Both must be given unless
        ``resolution`` is supplied.
    resolution : int, optional
        Shorthand for a square grid; when given it overrides both
        ``lat_resolution`` and ``long_resolution``.

    Returns
    -------
    dict
        Maps ``(row, col)`` to the sub-frame of ``df`` whose rows fall in that
        cell. Cells are half-open: a row belongs to a cell when
        ``lat_lo <= LATITUDE < lat_hi`` and ``long_lo < LONGITUDE <= long_hi``,
        so points exactly on ``maxLat`` or ``minLong`` are in no cell.

    Raises
    ------
    AssertionError
        If neither ``resolution`` nor both per-axis resolutions are provided.
    """
    if resolution is not None:
        lat_resolution = resolution
        long_resolution = resolution
    assert lat_resolution is not None and long_resolution is not None, \
        "provide either `resolution` or both `lat_resolution` and `long_resolution`"

    # Cell edges: latitude ascending, longitude descending (from maxLong to minLong).
    latitude_range = np.linspace(minLat, maxLat, lat_resolution + 1)
    longitude_range = np.linspace(maxLong, minLong, long_resolution + 1)

    latitudes = df["LATITUDE"]
    datapoints = {}
    # Iterate over the per-axis resolutions (not `resolution`, which may be None
    # when the caller supplied lat_resolution/long_resolution instead).
    for row in range(lat_resolution):
        # The latitude band is the same for every column in this row, so filter it once.
        in_row = df[(latitudes >= latitude_range[row]) & (latitudes < latitude_range[row + 1])]
        longitudes = in_row["LONGITUDE"]
        for col in range(long_resolution):
            in_cell = (longitudes <= longitude_range[col]) & (longitudes > longitude_range[col + 1])
            datapoints[(row, col)] = in_row[in_cell]

    return datapoints

# Example: split the box 34..36 latitude, -118..-116 longitude into a 15x15 grid
# and print the fires that fall in cell (row 0, col 2).
print(
    extract_localized_wildfires(
        minLat=34,
        maxLat=36,
        minLong=-118,
        maxLong=-116,
        df=df,
        resolution=15,
    )[(0, 2)]
)


         DISCOVERY_DATE   LATITUDE   LONGITUDE  FIRE_SIZE STATE
217048        2450987.5  34.083300 -116.350800      807.0    CA
217672        2451369.5  34.083300 -116.363300        0.1    CA
217716        2451387.5  34.083469 -116.309469        0.1    CA
243038        2448747.5  34.000010 -116.267540        0.3    CA
243039        2448747.5  34.066710 -116.317540        0.3    CA
243040        2448747.5  34.005600 -116.343740        0.5    CA
243064        2449512.5  34.114510 -116.398350        8.0    CA
243066        2449514.5  34.066440 -116.396900        0.1    CA
243076        2449875.5  34.116710 -116.300840        1.5    CA
243083        2449929.5  34.058410 -116.304740     5521.0    CA
243094        2450318.5  34.025010 -116.325850        0.4    CA
243104        2451018.5  34.071710 -116.392740        0.1    CA
243116        2451325.5  34.063610 -116.350540        0.2    CA
243117        2451325.5  34.002510 -116.295840    14000.0    CA
243118        2451325.5  34.002510 -116.

In [None]:
# Persist the current `df` to disk as a pickle, using the newest pickle protocol available.
ca_fires_path = "../us_wildfire_dataset/ca_fires_raw.pkl"
with open(ca_fires_path, "wb") as pickle_file:
    pickle.dump(df, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)