In [76]:
import pandas as pd  # provides interface for interacting with tabular data
import geopandas as gpd  # combines the capabilities of pandas and shapely for geospatial operations
from shapely.geometry import Point, Polygon, MultiPolygon  # for manipulating text data into geospatial shapes
from shapely import wkt  # stands for "well known text," allows for interchange across GIS programs
import rtree  # supports geospatial join
import os
import fnmatch
import numpy as np
import matplotlib.pyplot as plt
import descartes
import sys
import sklearn
import pickle
from shapely.ops import nearest_points
from datetime import datetime as dt, date
sys.path.append('/Users/saraprice/Documents/NYU/Fall_2020/DS_GA_1001/final_project/wildfires-1001/code/functions/')
from gis_processing import *
from functools import reduce

In [2]:
# Root folder of the project data. Set the WILDFIRES_DATA_DIR environment
# variable to point at another checkout; when it is unset the original
# hard-coded location is used, so existing behavior is unchanged.
data_dir = os.environ.get(
    'WILDFIRES_DATA_DIR',
    '/Users/saraprice/Documents/NYU/Fall_2020/DS_GA_1001/final_project/wildfires-1001/data',
)

In [3]:
# Load the pickled weather frame for the California grid (per the file name).
weather_pkl = os.path.join(data_dir, 'clean_data/ERA_weather-data/ERA5_CAgrid_gdf.pkl')
weather = pd.read_pickle(weather_pkl)

In [4]:
# Load the second pickled weather frame (the "1980" file) from the same folder.
weather_1980_pkl = os.path.join(data_dir, 'clean_data/ERA_weather-data/ERA5_CAgrid1980_gdf.pkl')
weather_1980_90 = pd.read_pickle(weather_1980_pkl)

In [5]:
# Derive calendar 'month' and 'year' columns from each row's 'date' value.
for part in ('month', 'year'):
    weather[part] = weather['date'].apply(lambda d, part=part: getattr(d, part))

In [6]:
# Derive calendar 'month' and 'year' columns from each row's 'date' value.
for part in ('month', 'year'):
    weather_1980_90[part] = weather_1980_90['date'].apply(lambda d, part=part: getattr(d, part))

# Text key of the form "<year>_<month>" built from the two columns above.
year_txt = weather_1980_90['year'].astype(str)
month_txt = weather_1980_90['month'].astype(str)
weather_1980_90['month_id'] = year_txt + "_" + month_txt

In [7]:
# Long variable names as they appear in the raw columns -> short feature prefixes.
base_names = {
    '10 metre U wind component': 'U_wind_10m',
    '10 metre V wind component': 'V_wind_10m',
    '2 metre dewpoint temperature': '2m_dewpoint_tmp',
    '2 metre temperature': '2m_tmp',
    'Leaf area index, high vegetation': 'leaf_high_veg',
    'Leaf area index, low vegetation': 'leaf_low_veg',
    'Surface pressure': 'surface_pressure',
    'Total precipitation': 'tot_prcp',
}
orig_keys = list(base_names)

# One entry per (variable, hour) pair: "<long name> hrs:<h>" -> "<short>_<h>hrs".
# Iteration is variable-major, hour-minor, which fixes the column order used later.
rename_weather = {
    f"{long_name} hrs:{hour}": f"{short_name}_{hour}hrs"
    for long_name, short_name in base_names.items()
    for hour in [0, 6, 12, 18]
}

In [8]:
# Shorten the raw weather column names in both frames.
# DataFrame.rename silently skips labels that are not present (errors='ignore'
# is the default), so re-running this cell is harmless and no try/except is
# needed; the previous bare `except: pass` only hid genuine failures.
weather.rename(columns=rename_weather, inplace=True)
weather_1980_90.rename(columns=rename_weather, inplace=True)

In [9]:
# Print every column of `weather` that `weather_1980_90` does not have.
missing_cols = [col for col in weather.columns if col not in weather_1980_90.columns]
for col in missing_cols:
    print(col)

In [10]:
# Stack the two weather frames row-wise. pd.concat replaces DataFrame.append,
# which was deprecated in pandas 1.4 and removed in 2.0; the original index
# labels are kept, exactly as append did.
weather_full = pd.concat([weather, weather_1980_90])

In [13]:
# Short weather column names, in the order rename_weather defines them.
w_cols = [*rename_weather.values()]
print(w_cols)

['U_wind_10m_0hrs', 'U_wind_10m_6hrs', 'U_wind_10m_12hrs', 'U_wind_10m_18hrs', 'V_wind_10m_0hrs', 'V_wind_10m_6hrs', 'V_wind_10m_12hrs', 'V_wind_10m_18hrs', '2m_dewpoint_tmp_0hrs', '2m_dewpoint_tmp_6hrs', '2m_dewpoint_tmp_12hrs', '2m_dewpoint_tmp_18hrs', '2m_tmp_0hrs', '2m_tmp_6hrs', '2m_tmp_12hrs', '2m_tmp_18hrs', 'leaf_high_veg_0hrs', 'leaf_high_veg_6hrs', 'leaf_high_veg_12hrs', 'leaf_high_veg_18hrs', 'leaf_low_veg_0hrs', 'leaf_low_veg_6hrs', 'leaf_low_veg_12hrs', 'leaf_low_veg_18hrs', 'surface_pressure_0hrs', 'surface_pressure_6hrs', 'surface_pressure_12hrs', 'surface_pressure_18hrs', 'tot_prcp_0hrs', 'tot_prcp_6hrs', 'tot_prcp_12hrs', 'tot_prcp_18hrs']


In [65]:
def hist_weather_data(df, w_cols, n_years, min_year, max_year):
    """Compute trailing multi-year means of weather columns per grid cell and month.

    For every GRID_ID in ``df``, every calendar month 1-12 and every target
    year ``y`` in ``min_year..max_year`` (inclusive), the rows of that grid and
    month whose 'year' lies in ``[y - n_years, y)`` are averaged column-wise.

    Parameters
    ----------
    df : DataFrame with 'GRID_ID', 'year' and 'month' columns plus ``w_cols``.
    w_cols : list of column names to average.
    n_years : length of the look-back window, in years (target year excluded).
    min_year, max_year : first and last target year, both inclusive.

    Returns
    -------
    DataFrame with columns ``['GRID_ID', 'year', 'month'] + w_cols``, one row
    per (grid, month, target year), ordered grid -> month -> year. When a
    window contains no rows the means are NaN (previously this raised
    IndexError because the grid id was read back from the empty slice).
    """
    w_cols = list(w_cols)
    records = []
    # groupby(sort=False) visits grids in order of first appearance, matching
    # iteration over df.GRID_ID.unique(), without re-filtering the full frame.
    for g, df_sub in df.groupby('GRID_ID', sort=False):
        for m in range(1, 13):
            df_sub_month = df_sub[df_sub['month'] == m]
            for y in np.arange(min_year, max_year + 1):
                in_window = (df_sub_month['year'] >= y - n_years) & (df_sub_month['year'] < y)
                hist_means = df_sub_month.loc[in_window, w_cols].mean(axis=0).to_dict()
                # Use the loop's own grid id so an empty window cannot fail.
                hist_means['GRID_ID'] = g
                hist_means['year'] = y
                hist_means['month'] = m
                records.append(hist_means)
    # Build the frame once; appending row by row is quadratic and
    # DataFrame.append no longer exists in pandas >= 2.0.
    return pd.DataFrame(records, columns=['GRID_ID', 'year', 'month'] + w_cols)

In [67]:
# Means over the single preceding year, for target years 1990-2019.
hist_weather_1yr = hist_weather_data(
    df=weather_full, w_cols=w_cols, n_years=1, min_year=1990, max_year=2019
)

In [94]:
# Cast the identifier columns to int.
id_cols = ['GRID_ID', 'year', 'month']
hist_weather_1yr[id_cols] = hist_weather_1yr[id_cols].astype(int)

# Tag every weather column with a "_1y" suffix.
rename_1yr = {col: col + "_1y" for col in rename_weather.values()}
hist_weather_1yr.rename(columns = rename_1yr, inplace = True)

In [66]:
# Means over the five preceding years, for target years 1990-2019.
hist_weather_5yr = hist_weather_data(
    df=weather_full, w_cols=w_cols, n_years=5, min_year=1990, max_year=2019
)

In [89]:
# Cast the identifier columns to int.
id_cols = ['GRID_ID', 'year', 'month']
hist_weather_5yr[id_cols] = hist_weather_5yr[id_cols].astype(int)

# Tag every weather column with a "_5y" suffix.
rename_5yr = {col: col + "_5y" for col in rename_weather.values()}
hist_weather_5yr.rename(columns = rename_5yr, inplace = True)
hist_weather_5yr.head()

Unnamed: 0,GRID_ID,year,month,U_wind_10m_0hrs_5y,U_wind_10m_6hrs_5y,U_wind_10m_12hrs_5y,U_wind_10m_18hrs_5y,V_wind_10m_0hrs_5y,V_wind_10m_6hrs_5y,V_wind_10m_12hrs_5y,...,leaf_low_veg_12hrs_5y,leaf_low_veg_18hrs_5y,surface_pressure_0hrs_5y,surface_pressure_6hrs_5y,surface_pressure_12hrs_5y,surface_pressure_18hrs_5y,tot_prcp_0hrs_5y,tot_prcp_6hrs_5y,tot_prcp_12hrs_5y,tot_prcp_18hrs_5y
0,0,1990,1,0.104822,-1.477139,-1.660333,-1.335759,0.318739,0.56455,0.566412,...,0.283641,0.283641,96189.832955,96257.796591,96222.989773,96295.743182,0.00712,0.001538,0.00336,0.005323
1,0,1991,1,0.106437,-1.300705,-1.542645,-1.33128,0.616342,0.727494,0.666517,...,0.283641,0.283641,96129.270455,96216.4125,96188.942045,96249.452273,0.009151,0.002196,0.004489,0.00679
2,0,1992,1,0.201303,-1.25934,-1.524403,-1.185684,0.239588,0.478254,0.416391,...,0.283641,0.283641,96198.268182,96279.415909,96256.334091,96316.545455,0.007667,0.002045,0.004106,0.005853
3,0,1993,1,0.104816,-1.265259,-1.502194,-1.245701,0.273002,0.529313,0.442421,...,0.283641,0.283641,96176.518182,96253.717045,96235.205682,96293.931818,0.006857,0.001944,0.003764,0.00515
4,0,1994,1,0.164171,-1.207301,-1.424059,-1.162759,0.176995,0.504215,0.36569,...,0.283641,0.283641,96066.184091,96120.955682,96121.813636,96184.814773,0.0069,0.001917,0.003872,0.005211


In [68]:
# Means over the ten preceding years, for target years 1990-2019.
hist_weather_10yr = hist_weather_data(
    df=weather_full, w_cols=w_cols, n_years=10, min_year=1990, max_year=2019
)

In [108]:
# Cast the identifier columns to int.
hist_weather_10yr[['GRID_ID', 'year', 'month']] = hist_weather_10yr[['GRID_ID', 'year', 'month']].astype(int)

# Tag every weather column with a "_10y" suffix. This mapping was previously
# built but never applied, so on a fresh kernel the 10-year columns stayed
# unsuffixed.
rename_10yr = {col: col + "_10y" for col in rename_weather.values()}

# Repair mapping for a frame whose columns already carry a "_1y" suffix
# (the state the original cell was written against).
fix_rename = {col: col.replace('1y', '10y') for col in rename_1yr.values()}
print(fix_rename)

# Apply both mappings: rename skips labels that are absent, so whichever
# naming the frame currently has ends up as "_10y", and re-running is a no-op.
hist_weather_10yr.rename(columns = {**rename_10yr, **fix_rename}, inplace = True)
hist_weather_10yr.head()

{'U_wind_10m_0hrs_1y': 'U_wind_10m_0hrs_10y', 'U_wind_10m_6hrs_1y': 'U_wind_10m_6hrs_10y', 'U_wind_10m_12hrs_1y': 'U_wind_10m_12hrs_10y', 'U_wind_10m_18hrs_1y': 'U_wind_10m_18hrs_10y', 'V_wind_10m_0hrs_1y': 'V_wind_10m_0hrs_10y', 'V_wind_10m_6hrs_1y': 'V_wind_10m_6hrs_10y', 'V_wind_10m_12hrs_1y': 'V_wind_10m_12hrs_10y', 'V_wind_10m_18hrs_1y': 'V_wind_10m_18hrs_10y', '2m_dewpoint_tmp_0hrs_1y': '2m_dewpoint_tmp_0hrs_10y', '2m_dewpoint_tmp_6hrs_1y': '2m_dewpoint_tmp_6hrs_10y', '2m_dewpoint_tmp_12hrs_1y': '2m_dewpoint_tmp_12hrs_10y', '2m_dewpoint_tmp_18hrs_1y': '2m_dewpoint_tmp_18hrs_10y', '2m_tmp_0hrs_1y': '2m_tmp_0hrs_10y', '2m_tmp_6hrs_1y': '2m_tmp_6hrs_10y', '2m_tmp_12hrs_1y': '2m_tmp_12hrs_10y', '2m_tmp_18hrs_1y': '2m_tmp_18hrs_10y', 'leaf_high_veg_0hrs_1y': 'leaf_high_veg_0hrs_10y', 'leaf_high_veg_6hrs_1y': 'leaf_high_veg_6hrs_10y', 'leaf_high_veg_12hrs_1y': 'leaf_high_veg_12hrs_10y', 'leaf_high_veg_18hrs_1y': 'leaf_high_veg_18hrs_10y', 'leaf_low_veg_0hrs_1y': 'leaf_low_veg_0hrs_10y'

Unnamed: 0,GRID_ID,year,month,U_wind_10m_0hrs_10y,U_wind_10m_6hrs_10y,U_wind_10m_12hrs_10y,U_wind_10m_18hrs_10y,V_wind_10m_0hrs_10y,V_wind_10m_6hrs_10y,V_wind_10m_12hrs_10y,...,leaf_low_veg_12hrs_10y,leaf_low_veg_18hrs_10y,surface_pressure_0hrs_10y,surface_pressure_6hrs_10y,surface_pressure_12hrs_10y,surface_pressure_18hrs_10y,tot_prcp_0hrs_10y,tot_prcp_6hrs_10y,tot_prcp_12hrs_10y,tot_prcp_18hrs_10y
0,0,1990,1,0.048762,-1.466852,-1.59242,-1.296175,0.239931,0.546736,0.491304,...,0.283641,0.283641,96117.22096,96170.564394,96130.732955,96214.952652,0.00697,0.001534,0.003374,0.005288
1,0,1991,1,0.089227,-1.377454,-1.547198,-1.270596,0.259143,0.53578,0.47056,...,0.283641,0.283641,96108.310795,96168.652841,96132.647159,96211.742614,0.00737,0.001721,0.003659,0.00557
2,0,1992,1,0.127599,-1.332059,-1.534987,-1.256521,0.200327,0.472784,0.445752,...,0.283641,0.283641,96153.14375,96216.159091,96187.740909,96257.952273,0.007094,0.001752,0.00357,0.005302
3,0,1993,1,0.102164,-1.383374,-1.587149,-1.333555,0.332519,0.547327,0.547173,...,0.283641,0.283641,96143.060227,96214.692614,96184.095455,96252.6625,0.006765,0.001682,0.003385,0.005018
4,0,1994,1,0.14712,-1.350769,-1.550086,-1.273165,0.20592,0.499784,0.465961,...,0.283641,0.283641,96128.999432,96192.264205,96172.520455,96241.706818,0.006688,0.001662,0.003425,0.005012


In [109]:
# Join the 1-, 5- and 10-year frames left to right on the shared key columns
# (pandas' default inner join).
merge_keys = ['GRID_ID', 'year', 'month']
hist_weather_features = hist_weather_1yr
for next_frame in (hist_weather_5yr, hist_weather_10yr):
    hist_weather_features = pd.merge(hist_weather_features, next_frame, on=merge_keys)

In [111]:
# Persist the merged feature table as a pickle under the data directory.
hist_features_pkl = os.path.join(data_dir, 'clean_data/engineered_features/historical_weather.pkl')
hist_weather_features.to_pickle(hist_features_pkl)