# Pull in the main fire dataframe

In [1]:
#import packages
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
import rtree  # supports geospatial join
import os
import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import sys
import pickle
from shapely.ops import nearest_points
from datetime import datetime as dt, date
#locate the project checkout; set the WILDFIRES_REPO environment variable to run this
#notebook on another machine (the default is the original author's local clone)
repo_root = os.environ.get('WILDFIRES_REPO', '/Users/jackepstein/Documents/GitHub/wildfires-1001')
#make the project's helper functions importable without growing sys.path on every re-run
functions_dir = os.path.join(repo_root, 'code/functions/')
if functions_dir not in sys.path:
    sys.path.append(functions_dir)
#every input and output file in this notebook is addressed relative to data_dir
data_dir = os.path.join(repo_root, 'data')

In [2]:
#read in the target variables for fire (three pickled parts, numbered 1 to 3)
#NOTE: unpickling can execute arbitrary code -- only load files produced by this project
target_df = {}
for i in range(1, 4):
    target_df[i] = pd.read_pickle(os.path.join(data_dir, f'clean_data/target_full_{i}.pkl'))

#stack the parts in a single call: DataFrame.append was removed in pandas 2.0 and
#re-copied the whole accumulated frame on every loop iteration
#the GeoDataFrame wrapper keeps the result geospatial, as the original accumulator was
full_target_data = gpd.GeoDataFrame(pd.concat(list(target_df.values())))
    
#change data types
#the two identifier columns are cast to integers
for id_col in ('COUNTYFP', 'GRID_ID'):
    full_target_data[id_col] = full_target_data[id_col].astype(int)
#calendar year and month are read off each value of the date column
full_target_data['YEAR'] = full_target_data['date'].map(lambda d: d.year)
full_target_data['MONTH'] = full_target_data['date'].map(lambda d: d.month)

#drop unneeded columns
#(the raw date plus the listed month/week/start/end bookkeeping columns)
full_target_data2 = full_target_data.drop(
    ['date', 'month_start', 'month_end', 'week_id',
     'week_start', 'week_end', 'start_date', 'end_date'],
    axis='columns',
)

In [3]:
#checking what one instance will look like
#full_target_data2.loc[full_target_data2['GRID_ID']==36].loc[full_target_data2['month_id']=='2018_11']

# Initial Group By

In [4]:
#collapse the fire records to one row per grid cell and month
#Y_bin, Y_fire_class_size -- take the max within the group
#Y_fire_count -- number of distinct FIRE_KEY values within the group
#Y_fire_area_prop -- done below with a separate dissolve and join rather than groupby
target_data_month = (
    full_target_data2
    .groupby(['GRID_ID', 'month_id', 'YEAR', 'MONTH', 'COUNTYFP', 'NAME', 'GRID_AREA', 'COUNTY_ARE'])
    .agg(
        Y_bin=('Y_bin', 'max'),
        Y_fire_class_size=('Y_fire_class_size', 'max'),
        Y_fire_count=('FIRE_KEY', 'nunique'),
    )
    .reset_index()
)

In [5]:
#DO NOT RE-RUN UNLESS NEEDED VERY SLOW
#keep only the join keys and the geometry needed for the dissolve
sub_geo_df = full_target_data2[['month_id', 'GRID_ID', 'geometry']]
#sub_geo_df.loc[sub_geo_df['GRID_ID']==36].loc[sub_geo_df['month_id']=='2018_11']

#only positive instances, i.e. rows that actually carry a geometry
sub_geo_df_2 = sub_geo_df[sub_geo_df['geometry'].notna()]

#dissolve: one combined geometry per grid cell and month
sub_geo_dissolve = sub_geo_df_2.dissolve(by=['GRID_ID', 'month_id'])

In [6]:
#reset the index and calculate area
#(the dissolve keys GRID_ID and month_id go back to being ordinary columns)
sub_geo_dissolve = sub_geo_dissolve.reset_index(drop=False)
#area of each dissolved geometry, in the units of the layer's coordinate system
sub_geo_dissolve['Fire_area'] = sub_geo_dissolve.geometry.area

In [7]:
#merge grouped by df with dissolved df (left join keeps every grid/month row)
target_data_month = pd.merge(
    target_data_month, sub_geo_dissolve,
    how='left', on=['GRID_ID', 'month_id'],
)
#grid/months with no dissolved geometry found no match, so their Fire_area is NaN -> set to 0
target_data_month['Fire_area'] = target_data_month['Fire_area'].fillna(value=0)

In [8]:
#calculate target variable for regression: fire area as a fraction of the grid cell area
target_data_month['Y_fire_area_prop'] = target_data_month['Fire_area'].div(target_data_month['GRID_AREA'])

In [9]:
#drop grid ID 59 -- no weather data
target_data_month_df = target_data_month[target_data_month['GRID_ID'].ne(59)]
#check for positive instances
#target_data_month_df.loc[target_data_month_df['Y_bin']==1]

In [10]:
#take in an object formatted as YYYY_MM
def add_one_month(month_id_obj):
    """Return the month id that follows ``month_id_obj``.

    The argument is converted to a string and split on '_'; the first piece is
    read as the year and the second as the month. The result is formatted as
    '<year>_<month>' with no zero padding, and December rolls over to month 1
    of the next year.
    """
    pieces = str(month_id_obj).split('_')
    month_int = int(pieces[1])
    year_int = int(pieces[0])

    #december wraps around to january of the following year
    if month_int == 12:
        return f'{year_int + 1}_1'

    #any other month simply moves forward by one within the same year
    return f'{year_int}_{month_int + 1}'
    
#take in an object formatted as YYYY_MM
def sub_one_month(month_id_obj):
    """Return the month id that precedes ``month_id_obj``.

    The argument is converted to a string and split on '_'; the first piece is
    read as the year and the second as the month. The result is formatted as
    '<year>_<month>' with no zero padding, and January rolls back to month 12
    of the previous year.
    """
    pieces = str(month_id_obj).split('_')
    month_int = int(pieces[1])
    year_int = int(pieces[0])

    #january wraps around to december of the preceding year
    if month_int == 1:
        return f'{year_int - 1}_12'

    #any other month simply moves back by one within the same year
    return f'{year_int}_{month_int - 1}'

# Pull in the other, simpler data sets (demographics and arson, topography, infrastructure, engineered fire features)

In [11]:
#topography
#no need to shift -- no month ids
topo_df = pd.read_csv(
    os.path.join(data_dir, 'clean_data/topography/grid_elevation.csv')
).astype({'GRID_ID': int})
#discard the first column of the file (presumably a saved row index -- confirm)
topo_df = topo_df.drop(topo_df.columns[0], axis='columns')

In [12]:
#infrastructure
#shift month up 1, so a target month joins to the infrastructure values of the month before
infr_df = pd.read_csv(
    os.path.join(data_dir, 'clean_data/grid_infrastructure/grid_infrastructure_monthly.csv')
).astype({'GRID_ID': int})
#keep the unshifted id alongside the shifted one
infr_df['month_id_old'] = infr_df['month_id']
infr_df['month_id'] = infr_df['month_id'].map(add_one_month)
#discard the first column of the file (presumably a saved row index -- confirm)
infr_df = infr_df.drop(infr_df.columns[0], axis='columns')

In [13]:
#demographics
#shift up a year, so a target year joins to the demographics of the year before
demographics_df = pd.read_csv(
    os.path.join(data_dir, 'clean_data/ca_demogs/demogs_arson_master.csv')
)
demographics_df['YEAR'] = demographics_df['YEAR'].add(1)

In [14]:
#pull in built fire features
#no need to shift
fire_feat = pd.read_csv(
    os.path.join(data_dir, 'clean_data/engineered_features/adj_fire_final.csv')
).astype({'GRID_ID': int})

# Merge the target data with these data sets

In [15]:
#attach every feature table with left joins, so no grid/month row of the target is lost
target_data_month_df = (
    target_data_month_df
    #topography: one set of values per grid cell
    .merge(topo_df, how='left', on='GRID_ID')
    #infrastructure: per grid cell and (shifted) month
    .merge(infr_df, how='left', on=['GRID_ID', 'month_id'])
    #demographics: per grid cell, county and (shifted) year
    .merge(demographics_df, how='left', on=['GRID_ID', 'NAME', 'COUNTYFP', 'YEAR'])
    #other engineered fire features: per grid cell and month
    .merge(fire_feat, how='left', on=['GRID_ID', 'month_id'])
)

# Pull in and merge with weather

In [16]:
#weather
era_weather = pd.read_pickle(
    os.path.join(data_dir, 'clean_data/ERA_weather-data/ERA5_CAgrid_gdf.pkl')
)
era_weather['GRID_ID'] = era_weather['GRID_ID'].astype(int)


#add in a month_id column built as '<year>_<month>' from the date column
era_weather['month'] = pd.DatetimeIndex(era_weather['date']).month
era_weather['YEAR'] = pd.DatetimeIndex(era_weather['date']).year
era_weather['month_id'] = era_weather['YEAR'].astype(str).str.cat(era_weather['month'].astype(str), sep='_')
#need to shift up a month, so a target month joins to the weather of the month before
#the unshifted id is kept alongside the shifted one
era_weather['month_id_old'] = era_weather['month_id']
era_weather['month_id'] = era_weather['month_id'].map(add_one_month)
#the date parts were only needed to build month_id
era_weather = era_weather.drop(['date', 'month', 'YEAR'], axis='columns')

In [17]:
#merge weather (left join on grid cell and shifted month id)
target_data_month_df = pd.merge(
    target_data_month_df, era_weather,
    how='left', on=['GRID_ID', 'month_id'],
)

# Final Clean Up and Send to Pickle

In [18]:
#checking the merge with shifts
#rows per grid/month whose weather columns found no match
#(this result is not shown: a cell only renders its last expression)
target_data_month_df[target_data_month_df['Total precipitation hrs:12'].isnull()].groupby(['GRID_ID','month_id'])['month_id'].count()
#target_data_month.loc[target_data_month['Total precipitation hrs:12'].isna()].loc[target_data_month['YEAR']==2017]
#total number of rows without weather data -- this is the value the cell displays
len(target_data_month_df[target_data_month_df['Total precipitation hrs:12'].isnull()])

100

In [19]:
#dropping jan 1990 with no weather data from the previous month
target_df_final = target_data_month_df[target_data_month_df['month_id'].ne('1990_1')]
#summary statistics of the numeric columns as a sanity check
target_df_final.describe()

Unnamed: 0,GRID_ID,YEAR,MONTH,COUNTYFP,GRID_AREA,COUNTY_ARE,Y_bin,Y_fire_class_size,Y_fire_count,Fire_area,...,Surface pressure hrs:12,Total precipitation hrs:12,10 metre U wind component hrs:18,10 metre V wind component hrs:18,2 metre dewpoint temperature hrs:18,2 metre temperature hrs:18,"Leaf area index, high vegetation hrs:18","Leaf area index, low vegetation hrs:18",Surface pressure hrs:18,Total precipitation hrs:18
count,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,...,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0,35733.0
mean,75.902107,2004.503008,6.515238,56.321971,4537280000.0,22191390000.0,0.108471,0.22822,0.187866,4005290.0,...,91654.929576,0.0008217999,0.487142,0.217151,275.632303,289.433539,2.348558,1.193621,91770.39208,0.001235061
std,46.597356,8.632085,3.444702,31.0092,1117092000.0,19652930000.0,0.310979,0.754216,0.765706,44174850.0,...,6262.900418,0.001351111,0.826254,0.801147,5.599808,8.440839,1.353853,0.693053,6274.745857,0.002025946
min,0.0,1990.0,1.0,1.0,932505300.0,2562528000.0,0.0,0.0,0.0,0.0,...,73537.5625,1.714898e-07,-2.016808,-3.82979,256.166332,263.808187,0.0,0.024249,73622.7125,3.320812e-07
25%,37.0,1997.0,4.0,29.0,5000733000.0,7492856000.0,0.0,0.0,0.0,0.0,...,87705.729167,3.286537e-05,-0.085085,-0.268096,271.568644,283.223772,1.708643,0.540334,87816.263889,5.682434e-05
50%,66.0,2004.0,7.0,53.0,5041000000.0,17388080000.0,0.0,0.0,0.0,0.0,...,91763.991071,0.0002473255,0.385402,0.212174,275.768464,289.378662,2.517645,1.088755,91894.858333,0.0003864345
75%,120.0,2012.0,10.0,79.0,5041000000.0,29419020000.0,0.0,0.0,0.0,0.0,...,97275.321429,0.0009963614,0.980531,0.727225,279.693701,296.153288,3.355879,1.722641,97384.7625,0.001486302
max,147.0,2019.0,12.0,111.0,5041000000.0,77493000000.0,1.0,4.0,31.0,1651920000.0,...,101988.095833,0.01331524,5.048863,3.249624,291.338135,309.847803,5.172008,2.757053,102153.995833,0.02056689


In [20]:
#re-read in county grid to join with geometry
county_grid = gpd.read_file(
    os.path.join(data_dir, 'clean_data/county_grid/county_grid.dbf')
)
#integer GRID_ID, matching the key type used by the other tables
county_grid['GRID_ID'] = county_grid.GRID_ID.astype(int)

In [21]:
#merge this with the initial df to get geometry
#only the key and geometry columns of the county grid are brought across
target_df_final_geo = pd.merge(
    target_df_final, county_grid[['GRID_ID', 'geometry']],
    how='left', on='GRID_ID',
)

In [22]:
#final column clean up
#keep geometry_x as the single 'geometry' column, then remove the suffixed duplicates
#created by the merges (both geometry copies and both unshifted month ids)
#NOTE(review): geometry_x is the left-hand geometry of a suffixed merge, so the county_grid
#geometry merged in just before this cell is not the one that survives -- confirm this is intended
target_df_final_geo = (
    target_df_final_geo
    .assign(geometry=target_df_final_geo['geometry_x'])
    .drop(columns=['geometry_x', 'geometry_y', 'month_id_old_x', 'month_id_old_y'])
)

In [23]:
#write the final table to a pickle file (the csv export below is left disabled)
target_df_final_geo.to_pickle(
    path=os.path.join(data_dir, 'clean_data/target_df_final_geo.pkl')
)
#target_df_final_geo.to_csv(os.path.join(data_dir, 'clean_data/target_df_final_geo.csv'))