In [444]:
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
from shapely.geometry import Point, Polygon, MultiPolygon  # for manipulating text data into geospatial shapes
from shapely import wkt  # stands for "well known text," allows for interchange across GIS programs
import rtree  # supports geospatial join
import os
import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import descartes
import sys
import sklearn 
from shapely.ops import nearest_points
from datetime import datetime as dt, date
sys.path.append('/Users/saraprice/Documents/NYU/Fall_2020/DS_GA_1001/final_project/wildfires-1001/code/functions/')
from gis_processing import *
from target_fire_functions import *
import itertools
import copy
from functools import reduce

In [115]:
data_dir = '/Users/saraprice/Documents/NYU/Fall_2020/DS_GA_1001/final_project/wildfires-1001/data'
target_data = gpd.GeoDataFrame()
for i in np.arange(1,5):
    t_df = pd.read_pickle(os.path.join(data_dir, f'clean_data/target_full_{i}.pkl'))
    target_data = target_data.append(t_df)

In [117]:
full_fire_data = gpd.read_file(os.path.join(data_dir, 'clean_data/fire_gis/fire_gis.dbf'))
county_grid = gpd.read_file(os.path.join(data_dir, 'clean_data/county_grid/county_grid.dbf'))

## Historical Fire Features
### Processing historical fire data (i.e. from 1980-1990)

In [118]:
full_fire_data['start_date'] = pd.to_datetime(full_fire_data['ALARM_DATE'], format = '%Y-%m-%d', errors = 'coerce')
full_fire_data['end_date'] = pd.to_datetime(full_fire_data['CONT_DATE'], format = '%Y-%m-%d', errors = 'coerce')
full_fire_data.drop(['ALARM_DATE', 'CONT_DATE'], axis = 1, inplace = True)
full_fire_data.dropna(subset = ['start_date', 'end_date'], inplace = True)
full_fire_data['FIRE_AREA'] = full_fire_data.geometry.area

In [119]:
hist_data = full_fire_data[(full_fire_data['YEAR']>= 1980) & (full_fire_data['YEAR']<= 1990)]
hist_data.shape

(786, 15)

In [121]:
##Limit fire and grid data to just columns we need for target generate
grid_target_subset = county_grid[['NAME', 'COUNTYFP', 'COUNTY_ARE','GRID_AREA', 'GRID_ID', 'geometry']]
fire_target_subset = full_fire_data[['FIRE_AREA', 'FIRE_KEY', 'YEAR', 'start_date', 'end_date', 'geometry']]

In [140]:
def disaggregate_fire_data(grid, fire_data, min_year, max_year):
    ''' 
    Purpose: Create instances of wildfires occuring on a daily level. This function starts with daily because it
    is easier to aggregate up once we have all days represented in one dataframe
    -------
    Inputs:
    -------
        - grid : GeoDataFrame of grid sections
        - fire_data : GeoDataFrame of fire data
    '''
    fire_data = fire_data[(fire_data['YEAR']>=min_year) & (fire_data['YEAR']<=max_year)]
    #Overlay grid and fire to get intersection geometry (this will create a dataframe with a few more fire
    #instances than we had in the plain fire_data 
    ##** Note that this will automatically exclude grid sections where this is no fire
    fire_grid = gpd.overlay(grid, fire_data, how = 'intersection')
    fire_grid['FIRE_GRID_INT_AREA'] = fire_grid.geometry.area
    fire_grid.index = range((len(fire_grid)))
    
    #create list of fires that had to be dropped due to data isseus (i.e. start date after end date)
    fires_dropped = []
    disagg_fire = pd.DataFrame()
    for row in fire_grid.itertuples():
        date_range = pd.date_range(row[list(fire_grid.columns).index('start_date')+1], \
                                   row[list(fire_grid.columns).index('end_date')+1], freq = 'D')
        if len(date_range)> 365:
            fires_dropped.append(row[list(fire_grid.columns).index('FIRE_KEY')])
            continue
        if len(date_range) == 0:
            fires_dropped.append(row[list(fire_grid.columns).index('FIRE_KEY')])
            continue
        dup_fire_data = pd.concat([pd.Series(row[1:], index = list(fire_grid.columns)).to_frame().transpose()]*len(date_range))
        dup_fire_data['date'] = date_range
        disagg_fire = disagg_fire.append(dup_fire_data, ignore_index = True)
        if row[0] in(np.arange(0,9000, 250)):
            print(row[0], row[list(fire_grid.columns).index('GRID_ID')+1], len(disagg_fire))
            print(disagg_fire['date'].head())
    return disagg_fire, fires_dropped


In [141]:
### Get data back to 1980 since we need to do 10 years back starting with 1980
fire_data_1980_90, fires_dropped = disaggregate_fire_data(grid_target_subset, fire_target_subset, min_year = 1980, max_year = 1990)

0 49.0 5
0   1986-08-14
1   1986-08-15
2   1986-08-16
3   1986-08-17
4   1986-08-18
Name: date, dtype: datetime64[ns]
250 84.0 1457
0   1986-08-14
1   1986-08-15
2   1986-08-16
3   1986-08-17
4   1986-08-18
Name: date, dtype: datetime64[ns]
500 16.0 6788
0   1986-08-14
1   1986-08-15
2   1986-08-16
3   1986-08-17
4   1986-08-18
Name: date, dtype: datetime64[ns]
750 97.0 9057
0   1986-08-14
1   1986-08-15
2   1986-08-16
3   1986-08-17
4   1986-08-18
Name: date, dtype: datetime64[ns]


In [142]:
hist_target_frame = generate_target_frame(grid_target_subset, 1980, 1990)

(4018, 1)
           date  year  month  week month_id month_start  month_end week_id
363  1980-12-29  1980     12     1  1980_12  1980-12-01 1980-12-31  1981_1
364  1980-12-30  1980     12     1  1980_12  1980-12-01 1980-12-31  1981_1
365  1980-12-31  1980     12     1  1980_12  1980-12-01 1980-12-31  1981_1
1826 1984-12-31  1984     12     1  1984_12  1984-12-01 1984-12-31  1985_1
2190 1985-12-30  1985     12     1  1985_12  1985-12-01 1985-12-31  1986_1
2191 1985-12-31  1985     12     1  1985_12  1985-12-01 1985-12-31  1986_1
2554 1986-12-29  1986     12     1  1986_12  1986-12-01 1986-12-31  1987_1
2555 1986-12-30  1986     12     1  1986_12  1986-12-01 1986-12-31  1987_1
2556 1986-12-31  1986     12     1  1986_12  1986-12-01 1986-12-31  1987_1
4017 1990-12-31  1990     12     1  1990_12  1990-12-01 1990-12-31  1991_1


In [145]:
hist_target_full = pd.merge(hist_target_frame, fire_data_1980_90,on = ['date', 'GRID_ID'], how = 'outer') 
print(hist_target_full[hist_target_full['month_id'].isna()])

Empty DataFrame
Columns: [date, month_id, month_start, month_end, week_id, week_start, week_end, GRID_ID, NAME, COUNTYFP, COUNTY_ARE, GRID_AREA, FIRE_AREA, FIRE_KEY, YEAR, start_date, end_date, geometry, FIRE_GRID_INT_AREA]
Index: []


In [146]:
##Add in target variable options for historical data
#Binary
hist_target_full ['Y_bin'] = np.where(hist_target_full ['FIRE_KEY'].isna()== True, 0, 1)   

#Counts
hist_target_fire_count = hist_target_full[['date', 'GRID_ID', 'FIRE_KEY']].groupby(['date', 'GRID_ID']).count().reset_index()
hist_target_fire_count.rename(columns = {'FIRE_KEY': 'Y_fire_count'}, inplace = True)
print(hist_target_fire_count.shape)
#Merge onto target_options
hist_target_full = hist_target_full.merge(hist_target_fire_count, on = ['date', 'GRID_ID'])

(538412, 3)


In [149]:
### Option 3: area of fires
hist_target_full['Y_fire_area_prop'] = np.where(hist_target_full['FIRE_GRID_INT_AREA'].isna(), 0, 
                                               hist_target_full['FIRE_GRID_INT_AREA']/hist_target_full['GRID_AREA'])
hist_target_full['FIRE_ACRES'] = hist_target_full['FIRE_AREA'].fillna(0)* 0.0002471054
## Uses NWCG fire size class code https://www.nwcg.gov/sites/default/files/data-standards/pdf/values.pdf
hist_target_full['Y_fire_class_size'] = np.where(hist_target_full['FIRE_ACRES']== 0, 0, 
                                                  np.where((hist_target_full['FIRE_ACRES']>0) &\
                                                           (hist_target_full['FIRE_ACRES']<100), 1,
                                                  np.where((hist_target_full['FIRE_ACRES']>=100) & \
                                                           (hist_target_full['FIRE_ACRES']<1000), 2,
                                                           np.where((hist_target_full['FIRE_ACRES']>=1000) & \
                                                                    (hist_target_full['FIRE_ACRES']< 5000) , 3,
                                                                    np.where(hist_target_full['FIRE_ACRES']>=5000, 4, 0)))))

In [150]:
## append historical and current target data
hist_train_target_data = target_data.append(hist_target_full)
hist_train_target_data['YEAR'] = hist_train_target_data['date'].apply(lambda x:x.year)
hist_train_target_data['MONTH'] = hist_train_target_data['date'].apply(lambda x:x.month)

In [153]:
hist_train_target_data.rename(columns = {'YEAR': 'year', 'MONTH': 'month'}, inplace = True )

## Fire in previous X timeframe?
### Creating features: 
* **binary indicators for fire in a previous X timeframe in a given grid section**:
    *  hist_bin_1m: was there a fire in the previous month?
    *  hist_bin_1y: was there a fire in the previous year
    *  hist_bin_5y: was there a fire in the previous 5 years
    *  hist_bin_10y: was there a fire in the previous 10 years
* **categorical variable for the max size of fire in a previous X timeframe in a given grid section**:
    *  hist_cl_size_1m: max size of fire (denoted by class size) in previous month?
    *  hist_cl_size_1y: max size of fire (denoted by class size) in previous year?
    *  hist_cl_size_5y: max size of fire (denoted by class size) in previous 5 years?
    *  hist_cl_size_10y: max size of fire (denoted by class size) in previous 10 years?
* **max proportion of grid section covered by fire in previous X timeframe**:
    *  hist_p_area_1m: max proportion of grid section covered by fire in the previous month?
    *  hist_p_area_1y: max proportion of grid section covered by fire in previous year?
    *  hist_p_area_5y: max proportion of grid section covered by fire in previous 5 years?
    *  hist_p_area_10y: max proportion of grid section covered by fire in previous 10 years?
* **total area of the grid covered by fire in the previous year**
    *  hit_tot_p_area_1y
* **percent of time over last X timeframe when fire was present**
    *  hist_p_time_1m : percent of time fire was present in past month in given grid section
    *  hist_p_time_1y : percent of time fire was present in past year in given grid section
    *  hist_p_time_5y : percent of time fire was present in past 5 years in given grid section
    *  hist_p_time_10y : percent of time fire was present in past 10 years in given grid section

In [302]:
def fire_prev_time(df, time_int, int_num, train_min_year, train_max_year, fire_size = False, fire_area = False):
    '''
    Purpose: generate binary flag for if fire occurred in a given grid section in the past X timeframe
    -------
    Inputs:
    ------
        df :             GeoDataFrame 
                         training data and historical data back at least 10 years for last training year of interest
        time_int :       str (options are 'month', 'year')
                         unit of time we are looking back for
        int_num :        numeric 
                         # number of units of the time int to look back for historical fire data
        fire_size :      list , default = None; options are fire_class_size values (1,2,3,4)
                         will create indicators for each value in the input list
        train_min_year : numeric 
                         minimum year that will ultimately be in the training data (& not just including for 
                         historical calculations)
        train_max_year : numeric
                         same as train_min_year but for max
    '''
    #Set Y variable of interest depending on if we're interested in fire size or not
    #If not we will look at binary Y and if yes then we look at Y_fire_class_size
    if fire_size == False:
        Y_var = 'Y_bin'
        label_mod = 'bin'
    if fire_size:
        Y_var = 'Y_fire_class_size'
        label_mod = 'cl_size'
    if fire_area:
        Y_var = 'Y_fire_area_prop'
        label_mod = 'p_area'
        
    if time_int == 'month':
        #Drop duplicates
        fire_bin_map = df[['GRID_ID', 'year', 'month', 'month_id', Y_var]].drop_duplicates()
        #aggregate on a grid_id and month_ID and take the max value of the Y_var. In the binary case this will return
        #1 if there was a fire, and in the fire_class_size case will return largest fire class  
        fire_bin_map = fire_bin_map.groupby(['GRID_ID', 'year', 'month', 'month_id']).max().reset_index()
        #create copy that will be historical data we merge onto current data
        fire_bin_map_hist = fire_bin_map.copy()
        #create merge id for historical month
        fire_bin_map_hist['merge_month_id'] = np.where(fire_bin_map_hist['month'] != 12, 
                                                  fire_bin_map_hist['year'].astype(str)+"_"+\
                                                  fire_bin_map_hist['month'].apply(lambda x:x+1).astype(str),
                                                  fire_bin_map_hist['year'].apply(lambda x:x+1).astype(str)+"_"+\
                                                  '1')
        fire_bin_map_hist.rename(columns= {Y_var: f'hist_{label_mod}_1m'}, inplace = True)
        fire_bin_map_hist.drop(['month_id', 'month', 'year'], inplace = True, axis = 1)
        fire_bin_map_hist.rename(columns = {'merge_month_id':'month_id'}, inplace = True)
        hist_fire_df = pd.merge(fire_bin_map, fire_bin_map_hist, on =['month_id', 'GRID_ID'], how = 'left')
        hist_fire_df = hist_fire_df[(hist_fire_df['year']>= train_min_year) & (hist_fire_df['year']<= train_max_year)]
    else:
        hist_fire_df = pd.DataFrame()
        fire_bin_map = df[['GRID_ID', 'year', Y_var]]
        for g in fire_bin_map.GRID_ID.unique():
            g_df = fire_bin_map[fire_bin_map['GRID_ID']==g]
            for y in np.arange(train_min_year, train_max_year +1):
                prev_add = g_df[(g_df['year']>=y-int_num) & (g_df['year']<y)][Y_var].max()
                hist_fire_df = hist_fire_df.append(pd.DataFrame({'GRID_ID':[g],'year':[y], 
                                                                 f'{Y_var}_prev_{int_num}yr':[prev_add]},
                                                    columns = ['GRID_ID', 'year', f'hist{label_mod}_{int_num}y']))
    return hist_fire_df 

In [277]:
hist_fire_bin_1m = fire_prev_time(hist_train_target_data, 'month', 1, 1990, 2019, fire_size = False)

64320
64320
64320
64320
   GRID_ID  year  month month_id  Y_bin
0      0.0  1980      1   1980_1      0
1      0.0  1980      2   1980_2      0
2      0.0  1980      3   1980_3      0
3      0.0  1980      4   1980_4      0
4      0.0  1980      5   1980_5      0
   GRID_ID  Y_bin_prev_month month_id
0      0.0                 0   1980_2
1      0.0                 0   1980_3
2      0.0                 0   1980_4
3      0.0                 0   1980_5
4      0.0                 0   1980_6
64320


In [456]:
hist_fire_bin_1m.rename(columns= {'Y_bin_prev_month': 'hist_bin_1m'}, inplace = True)
try:
    hist_fire_bin_1m.drop('Y_bin', inplace = True, axis = 1)
except:
    pass
hist_fire_bin_1m.head()

Unnamed: 0,GRID_ID,year,month,month_id,hist_bin_1m
0,0.0,1980,1,1980_1,
1,0.0,1980,2,1980_2,0.0
2,0.0,1980,3,1980_3,0.0
3,0.0,1980,4,1980_4,0.0
4,0.0,1980,5,1980_5,0.0


In [251]:
hist_fire_bin_1yr = fire_prev_time(hist_train_target_data, 'year', 1, 1990, 2019)

In [457]:
hist_fire_bin_1yr.rename(columns= {'Y_bin_prev_1yr': 'hist_bin_1y'}, inplace = True)

In [257]:
hist_fire_bin_5yr = fire_prev_time(hist_train_target_data, 'year', 5, 1990, 2019)

In [459]:
hist_fire_bin_5yr.rename(columns= {'Y_bin_prev_5yr': 'hist_bin_5y'}, inplace = True)

In [448]:
hist_fire_bin_10yr = fire_prev_time(hist_train_target_data, 'year', 10, 1990, 2019)

In [460]:
hist_fire_bin_10yr.rename(columns= {'Y_bin_prev_10yr': 'hist_bin_10y'}, inplace = True)

### Get max fire class size over the last X period

In [None]:
hist_max_fire_size_1m = fire_prev_time(hist_train_target_data, 'month', 1, 1990, 2019, fire_size = True)

In [None]:
hist_max_fire_size_1m.rename(columns = {'Y_fire_class_size_prev_month':'hist_cl_size_1m'}, inplace = True)

In [303]:
hist_max_fire_size_1yr = fire_prev_time(hist_train_target_data, 'year', 1, 1990, 2019, fire_size = True)

In [467]:
hist_max_fire_size_1yr.rename(columns = {'Y_fire_class_size_prev_1yr':'hist_cl_size_1y'}, inplace = True)

In [307]:
hist_max_fire_size_5yr = fire_prev_time(hist_train_target_data, 'year', 5, 1990, 2019, fire_size = True)

In [468]:
hist_max_fire_size_5yr.rename(columns = {'Y_fire_class_size_prev_5yr':'hist_cl_size_5y'}, inplace = True)

In [308]:
hist_max_fire_size_10yr = fire_prev_time(hist_train_target_data, 'year', 10, 1990, 2019, fire_size = True)

In [469]:
hist_max_fire_size_10yr.rename(columns = {'Y_fire_class_size_prev_10yr':'hist_cl_size_10y'}, inplace = True)

### Max % of grid section covered by fire in the past X timeframe

In [None]:
hist_max_fire_area_prop_1m = fire_prev_time(hist_train_target_data, 'month', 1, 1990, 2019, fire_area = True)

In [None]:
hist_max_fire_area_prop_1m.rename({'Y_fire_area_prop_prev_month': 'hist_p_area_1m'}, inplace = True)

In [309]:
hist_max_fire_area_prop_1yr = fire_prev_time(hist_train_target_data, 'year', 1, 1990, 2019, fire_area = True)

In [470]:
hist_max_fire_area_prop_1yr.rename({'Y_fire_area_prop_prev_1yr': 'hist_p_area_1y'}, inplace = True)

In [310]:
hist_max_fire_area_prop_5yr = fire_prev_time(hist_train_target_data, 'year', 5, 1990, 2019, fire_area = True)

In [471]:
hist_max_fire_area_prop_5yr.rename({'Y_fire_area_prop_prev_5yr': 'hist_p_area_5y'}, inplace = True)

In [311]:
hist_max_fire_area_prop_10yr = fire_prev_time(hist_train_target_data, 'year', 10, 1990, 2019, fire_area = True)

In [472]:
hist_max_fire_area_prop_10yr.rename({'Y_fire_area_prop_prev_10yr': 'hist_p_area_10y'}, inplace = True)

### Total % of grid section covered by fire in the previous X timeframe

In [399]:
def perc_grid_covered(df, int_num, train_min_year, train_max_year):
    '''
    Purpose: generate binary flag for if fire occurred in a given grid section in the past X timeframe
    -------
    Inputs:
    ------
        df :             GeoDataFrame 
                         training data and historical data back at least 10 years for last training year of interest
        time_int :       str (options are 'month', 'year')
                         unit of time we are looking back for
        int_num :        numeric 
                         # number of units of the time int to look back for historical fire data
        fire_size :      list , default = None; options are fire_class_size values (1,2,3,4)
                         will create indicators for each value in the input list
        train_min_year : numeric 
                         minimum year that will ultimately be in the training data (& not just including for 
                         historical calculations)
        train_max_year : numeric
                         same as train_min_year but for max
    '''    
    #Drop duplicates
    df = df[['GRID_ID', 'year','date','geometry']].drop_duplicates()
    gdf = gpd.GeoDataFrame(df, geometry = df.geometry, crs={'init' :'epsg:3857'})
    hist_fire_df = pd.DataFrame()
    for g in gdf.GRID_ID.unique():
        gdf_sub = gdf[(gdf['GRID_ID']==g) & (gdf['geometry'].isna() == False)]
        for y in np.arange(train_min_year, train_max_year +1):
            time_sub = gdf_sub[(gdf_sub['year']>=y-int_num) & (gdf_sub['year']<y)][['GRID_ID', 'geometry']]
            if len(time_sub)==0:
                hist_fire_df = hist_fire_df.append(pd.DataFrame({'GRID_ID':[g],'year':[y], 'total_area':[0]},
                                                columns = ['GRID_ID', 'year', 'total_area']))
            else:
                x = time_sub[['GRID_ID', 'geometry']].dissolve(by = 'GRID_ID')
                geo = x.geometry.area.iloc[0]
                hist_fire_df = hist_fire_df.append(pd.DataFrame({'GRID_ID':[g],'year':[y], 
                                                                 'total_area':[geo]},
                                                columns = ['GRID_ID', 'year', 'total_area']))

    return hist_fire_df 

### Total % of Grid area touched by fire in the past X timeframe

In [406]:
hist_df.head(10)

Unnamed: 0,GRID_ID,year,total_area
0,0.0,1990,0.0
0,0.0,1991,0.0
0,0.0,1992,0.0
0,0.0,1993,0.0
0,0.0,1994,0.0
0,0.0,1995,0.0
0,0.0,1996,0.0
0,0.0,1997,0.0
0,0.0,1998,0.0
0,0.0,1999,6094418.0


In [402]:
hist_df = perc_grid_covered(hist_train_target_data, 1, 1990, 2019)

  return _prepare_from_string(" ".join(pjargs))


In [407]:
hist_df.rename(columns = {'total_area': 'tot_area_fire_prev_1yr'}, inplace = True)

In [476]:
perc_grid_fire_1y = hist_df.copy()
perc_grid_fire_1y.shape

(4020, 3)

### % of Time over last X when fire was present

In [410]:
perc_time_df = hist_train_target_data[['GRID_ID', 'date', 'month_id', 'month', 'year', 'Y_bin']].drop_duplicates()
perc_time_df = perc_time_df.groupby(['GRID_ID', 'date', 'month_id', 'month', 'year']).sum('Y_bin').reset_index()
perc_time_df.head()

Unnamed: 0,GRID_ID,date,month_id,month,year,Y_bin
0,0.0,1980-01-01,1980_1,1,1980,0
1,0.0,1980-01-02,1980_1,1,1980,0
2,0.0,1980-01-03,1980_1,1,1980,0
3,0.0,1980-01-04,1980_1,1,1980,0
4,0.0,1980-01-05,1980_1,1,1980,0


In [411]:
print(perc_time_df.shape)
print(perc_time_df.drop_duplicates(['GRID_ID', 'date']).shape)

(1957740, 6)
(1957740, 6)


#### Last month

In [477]:
perc_1month = perc_time_df.drop(['date'], axis =1)
perc_1month['counter'] = 1
perc_1month = perc_1month.groupby(['GRID_ID', 'month_id', 'month', 'year']).sum().reset_index()\
                          .rename(columns= {'Y_bin': 'count_fire_days'})
perc_1month['hist_p_time_1m'] = perc_1month['count_fire_days']/perc_1month['counter']
print(perc_1month[perc_1month['hist_p_time_1m']!=0]['hist_p_time_1m'].describe())
perc_1month.drop(['counter', 'count_fire_days'], inplace = True, axis = 1)
hist_perc_time_fire_1m = perc_1month.copy()

count    5749.000000
mean        0.314248
std         0.353804
min         0.032258
25%         0.064516
50%         0.129032
75%         0.483871
max         1.000000
Name: hist_p_time_1m, dtype: float64


In [431]:
def perc_year_fire(df, years, train_min_year, train_max_year):
    '''
    Purpose: generate binary flag for if fire occurred in a given grid section in the past X timeframe
    -------
    Inputs:
    ------
        df :             GeoDataFrame 
                         training data and historical data back at least 10 years for last training year of interest
        time_int :       str (options are 'month', 'year')
                         unit of time we are looking back for
        int_num :        numeric 
                         # number of units of the time int to look back for historical fire data
        train_min_year : numeric 
                         minimum year that will ultimately be in the training data (& not just including for 
                         historical calculations)
        train_max_year : numeric
                         same as train_min_year but for max
    '''    
    #Drop duplicates
    df = df[['GRID_ID', 'year', 'date', 'Y_bin']].drop_duplicates()
    df.drop('date', axis = 1, inplace = True)
    df = df.groupby(['GRID_ID', 'year']).sum().reset_index()
    hist_fire_df = pd.DataFrame()
    for g in df.GRID_ID.unique():
        df_sub = df[df['GRID_ID']==g]
        for y in np.arange(train_min_year, train_max_year +1):
            time_sub = df_sub[(df_sub['year']>=y-years) & (df_sub['year']<y)]
            x = time_sub['Y_bin'].sum()
            hist_fire_df = hist_fire_df.append(pd.DataFrame({'GRID_ID':[g],'year':[y], 'total_fire_days':[x]},
                                                columns = ['GRID_ID', 'year', 'total_fire_days']))
            hist_fire_df['hist_p_time_1y'] = hist_fire_df['total_fire_days']/(years *365)
    return hist_fire_df 

In [432]:
hist_perc_time_fire_1y = perc_year_fire(hist_train_target_data, 1, 1990, 2019)

Unnamed: 0,GRID_ID,year,total_fire_days,hist_p_time_1y
0,0.0,1990,0,0.0
0,0.0,1991,0,0.0
0,0.0,1992,0,0.0
0,0.0,1993,0,0.0
0,0.0,1994,0,0.0


In [438]:
hist_perc_time_fire_5y = perc_year_fire(hist_train_target_data, 5, 1990, 2019)

In [439]:
hist_perc_time_fire_10y = perc_year_fire(hist_train_target_data, 10, 1990, 2019)

## Combine all historical features together

In [462]:
##Binary fire present variables
hist_bin_df_list = [hist_fire_bin_1m, hist_fire_bin_1yr, hist_fire_bin_5yr,hist_fire_bin_10yr]
hist_bin_merged = reduce(lambda  left,right: pd.merge(left,right,on=['GRID_ID', 'year'],
                                                how='outer'), hist_bin_df_list)
hist_bin_merged = hist_bin_merged[(hist_bin_merged['year']>=1990) & (hist_bin_merged['year']<=2019)]
hist_bin_merged.head()

Unnamed: 0,GRID_ID,year,month,month_id,hist_bin_1m,hist_bin_1y,hist_bin_5y,hist_bin_10y
120,0.0,1990,1,1990_1,0.0,0.0,0.0,0.0
121,0.0,1990,2,1990_2,0.0,0.0,0.0,0.0
122,0.0,1990,3,1990_3,0.0,0.0,0.0,0.0
123,0.0,1990,4,1990_4,0.0,0.0,0.0,0.0
124,0.0,1990,5,1990_5,0.0,0.0,0.0,0.0


In [466]:
#Max size fire variables
hist_max_size_df_list = [hist_max_fire_size_1m, hist_max_fire_size_1yr, hist_max_fire_size_5yr,hist_max_fire_size_10yr]
for df in hist_max_size_df_list:
    print(df.columns, df.shape)
hist_max_size_merged = reduce(lambda  left,right: pd.merge(left,right,on=['GRID_ID', 'year'],
                                               how='outer'), hist_max_size_df_list)
hist_max_size_merged = hist_max_size_merged[(hist_max_size_merged['year']>=1990) & \
                                            (hist_max_size_merged['year']<=2019)]
print(hist_max_size_merged.shape)
hist_max_size_merged.head()

Index(['GRID_ID', 'year', 'month', 'month_id', 'Y_fire_class_size',
       'Y_fire_class_size_prev_month'],
      dtype='object') (64320, 6)
Index(['GRID_ID', 'year', 'Y_fire_class_size_prev_1yr'], dtype='object') (4020, 3)
Index(['GRID_ID', 'year', 'Y_fire_class_size_prev_5yr'], dtype='object') (4020, 3)
Index(['GRID_ID', 'year', 'Y_fire_class_size_prev_10yr'], dtype='object') (4020, 3)
(48240, 9)


Unnamed: 0,GRID_ID,year,month,month_id,Y_fire_class_size,Y_fire_class_size_prev_month,Y_fire_class_size_prev_1yr,Y_fire_class_size_prev_5yr,Y_fire_class_size_prev_10yr
120,0.0,1990,1,1990_1,0,0.0,0.0,0.0,0.0
121,0.0,1990,2,1990_2,0,0.0,0.0,0.0,0.0
122,0.0,1990,3,1990_3,0,0.0,0.0,0.0,0.0
123,0.0,1990,4,1990_4,0,0.0,0.0,0.0,0.0
124,0.0,1990,5,1990_5,0,0.0,0.0,0.0,0.0


In [474]:
#Max prop fire variables
hist_max_p_area_df_list = [hist_max_fire_area_prop_1m, hist_max_fire_area_prop_1yr, hist_max_fire_area_prop_5yr,
                           hist_max_fire_area_prop_10yr]
for df in hist_max_p_area_df_list:
    print(df.columns, df.shape)
hist_max_p_area_merged = reduce(lambda  left,right: pd.merge(left,right,on=['GRID_ID', 'year'],
                                               how='outer'), hist_max_p_area_df_list)
hist_max_p_area_merged = hist_max_p_area_merged[(hist_max_p_area_merged['year']>=1990) & \
                                            (hist_max_p_area_merged['year']<=2019)]
print(hist_max_p_area_merged.shape)
hist_max_p_area_merged.head()

Index(['GRID_ID', 'year', 'month', 'month_id', 'Y_fire_area_prop',
       'Y_fire_area_prop_prev_month'],
      dtype='object') (64320, 6)
Index(['GRID_ID', 'year', 'Y_fire_area_prop_prev_1yr'], dtype='object') (4020, 3)
Index(['GRID_ID', 'year', 'Y_fire_area_prop_prev_5yr'], dtype='object') (4020, 3)
Index(['GRID_ID', 'year', 'Y_fire_area_prop_prev_10yr'], dtype='object') (4020, 3)
(48240, 9)


Unnamed: 0,GRID_ID,year,month,month_id,Y_fire_area_prop,Y_fire_area_prop_prev_month,Y_fire_area_prop_prev_1yr,Y_fire_area_prop_prev_5yr,Y_fire_area_prop_prev_10yr
120,0.0,1990,1,1990_1,0.0,0.0,0.0,0.0,0.0
121,0.0,1990,2,1990_2,0.0,0.0,0.0,0.0,0.0
122,0.0,1990,3,1990_3,0.0,0.0,0.0,0.0,0.0
123,0.0,1990,4,1990_4,0.0,0.0,0.0,0.0,0.0
124,0.0,1990,5,1990_5,0.0,0.0,0.0,0.0,0.0


In [479]:
hist_perc_time_list = [hist_perc_time_fire_1m, hist_perc_time_fire_1y, hist_perc_time_fire_5y, hist_perc_time_fire_10y]
hist_perc_time_merged = reduce(lambda  left,right: pd.merge(left,right,on=['GRID_ID', 'year'],
                                               how='outer'), hist_perc_time_list)
hist_perc_time_merged = hist_perc_time_merged[(hist_perc_time_merged['year']>=1990) & \
                                            (hist_perc_time_merged['year']<=2019)]

In [482]:
hist_eng_features = reduce(lambda  left,right: pd.merge(left,right,on=['GRID_ID', 'year', 'month_id', 'month'],
                                               how='outer'), 
                           [hist_bin_merged, hist_max_size_merged,hist_max_p_area_merged, hist_perc_time_merged])
hist_eng_features = hist_eng_features.merge(perc_grid_fire_1y, on = ['GRID_ID', 'year'], how = 'outer')

In [486]:
hist_eng_features.to_pickle(os.path.join(data_dir, 'clean_data/engineered_features/fire_hist_features.pkl'))