# About this notebook

This notebook will generate windowed aggregate features from the base integrated dataframe. The windowed features are created by:

1. Selecting a window period, e.g. 1 week or 1 year
2. Selecting an aggregation function, e.g. max or cumulative
3. Applying the function for the given window preceding each row.

If you need to download the base dataframe from S3, uncomment and run the cell with the comment `#DOWNLOAD DATA`.

In [4]:
import pandas as pd
from tqdm import tqdm_notebook as tqdm

In [2]:
#DOWNLOAD DATA
# import boto3
# s3 = boto3.client("s3")
# all_objects = s3.list_objects(Bucket='dse-cohort5-group5')
# display(all_objects)
#s3.download_file('dse-cohort5-group5', 'wildfire_capstone/gridMet.parquet.gz', 'gridMet.parquet.gz')

In [5]:
df = pd.read_parquet('gridMet.parquet.gz')

In [6]:
print(df.shape)
df = df[~df['precipitation_amount_mm'].isna()]
df = df.reset_index()
df = df.set_index('date')
print(df.shape)

(10676640, 14)
(6826300, 16)


In [7]:
cols = [c for c in df][2:]
df.head()

Unnamed: 0_level_0,latitude,longitude,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1999-01-01,33.566667,-117.975,0.0,40.3,0.00589,138.0,123.0,1.6,293.1,281.1,24.0,16.0,15.5,34.0,1.7,0.74
1999-01-01,33.566667,-117.933333,0.0,39.8,0.0059,137.2,123.0,1.6,293.1,281.2,24.0,16.0,15.5,35.0,1.7,0.74
1999-01-01,33.566667,-117.891667,0.0,38.2,0.0058,137.2,123.0,1.7,293.2,281.2,26.0,15.2,14.9,37.0,1.7,0.77
1999-01-01,33.566667,-117.85,0.0,36.4,0.00567,137.3,49.0,1.8,293.3,280.3,27.0,15.0,14.8,38.0,1.8,0.76
1999-01-01,33.566667,-117.808333,0.0,33.8,0.00538,137.3,49.0,1.9,293.2,279.9,29.0,13.3,13.6,43.0,1.9,0.8


In [8]:
cols

['precipitation_amount_mm',
 'relative_humidity_%',
 'specific_humidity_kg/kg',
 'surface_downwelling_shortwave_flux_in_air_W m-2',
 'wind_from_direction_Degrees Clockwise from north',
 'wind_speed_m/s',
 'max_air_temperature_K',
 'min_air_temperature_K',
 'burning_index_g_Unitless',
 'dead_fuel_moisture_100hr_Percent',
 'dead_fuel_moisture_1000hr_Percent',
 'energy_release_component-g_Unitless',
 'potential_evapotranspiration_mm',
 'mean_vapor_pressure_deficit_kPa']

In [7]:
windows = ['7d', '14d', '21d', '30d', '60d', '90d', '180d', '365d']
windows

['7d', '14d', '21d', '30d', '60d', '90d', '180d', '365d']

In [8]:
result = df.reset_index().set_index(['date', 'latitude', 'longitude'])
print(result.shape)
for window in tqdm(windows):
    name_map = {n: "{}_mean_{}".format(window, n) for n in cols}
    tmp = df
    tmp = tmp.groupby(['latitude', 'longitude'])
    tmp = tmp.rolling(window).mean(skipna=True)
    tmp = tmp.rename(columns=name_map)
    result = result.join(
        tmp.drop(['latitude', 'longitude'], axis=1).reset_index()\
        .set_index(['date', 'latitude', 'longitude'])
    )
    print(result.shape)
result.to_parquet('mean.parquet.gz', compression='gzip')

(6826300, 14)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

(6826300, 28)
(6826300, 42)
(6826300, 56)
(6826300, 70)
(6826300, 84)
(6826300, 98)
(6826300, 112)
(6826300, 126)



In [9]:
result = df.reset_index().set_index(['date', 'latitude', 'longitude'])
print(result.shape)
for window in tqdm(windows):
    name_map = {n: "{}_max_{}".format(window, n) for n in cols}
    tmp = df
    tmp = tmp.groupby(['latitude', 'longitude'])
    tmp = tmp.rolling(window).max(skipna=True)
    tmp = tmp.rename(columns=name_map)
    result = result.join(
        tmp.drop(['latitude', 'longitude'], axis=1).reset_index()\
        .set_index(['date', 'latitude', 'longitude'])
    )
    print(result.shape)
result.to_parquet('max.parquet.gz', compression='gzip')

(6826300, 14)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

(6826300, 28)
(6826300, 42)
(6826300, 56)
(6826300, 70)
(6826300, 84)
(6826300, 98)
(6826300, 112)
(6826300, 126)



In [10]:
result = df.reset_index().set_index(['date', 'latitude', 'longitude'])
print(result.shape)
for window in tqdm(windows):
    name_map = {n: "{}_cumulative_{}".format(window, n) for n in cols}
    tmp = df
    tmp = tmp.groupby(['latitude', 'longitude'])
    tmp = tmp.rolling(window).sum(skipna=True)
    tmp = tmp.rename(columns=name_map)
    result = result.join(
        tmp.drop(['latitude', 'longitude'], axis=1).reset_index()\
        .set_index(['date', 'latitude', 'longitude'])
    )
    print(result.shape)
result.to_parquet('cumulative.parquet.gz', compression='gzip')

(6826300, 14)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

(6826300, 28)
(6826300, 42)
(6826300, 56)
(6826300, 70)
(6826300, 84)
(6826300, 98)
(6826300, 112)
(6826300, 126)



In [11]:
result = df.reset_index().set_index(['date', 'latitude', 'longitude'])
print(result.shape)
for window in tqdm(windows):
    name_map = {n: "{}_min_{}".format(window, n) for n in cols}
    tmp = df
    tmp = tmp.groupby(['latitude', 'longitude'])
    tmp = tmp.rolling(window).min(skipna=True)
    tmp = tmp.rename(columns=name_map)
    result = result.join(
        tmp.drop(['latitude', 'longitude'], axis=1).reset_index()\
        .set_index(['date', 'latitude', 'longitude'])
    )
    print(result.shape)
result.to_parquet('min.parquet.gz', compression='gzip')

(6826300, 14)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

(6826300, 28)
(6826300, 42)
(6826300, 56)
(6826300, 70)
(6826300, 84)
(6826300, 98)
(6826300, 112)
(6826300, 126)



In [12]:
result = df.reset_index().set_index(['date', 'latitude', 'longitude'])
print(result.shape)
for window in tqdm(windows):
    name_map = {n: "{}_std_{}".format(window, n) for n in cols}
    tmp = df
    tmp = tmp.groupby(['latitude', 'longitude'])
    tmp = tmp.rolling(window).std(skipna=True)
    tmp = tmp.rename(columns=name_map)
    result = result.join(
        tmp.drop(['latitude', 'longitude'], axis=1).reset_index()\
        .set_index(['date', 'latitude', 'longitude'])
    )
    print(result.shape)
result.to_parquet('std.parquet.gz', compression='gzip')

(6826300, 14)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

(6826300, 28)
(6826300, 42)
(6826300, 56)
(6826300, 70)
(6826300, 84)
(6826300, 98)
(6826300, 112)
(6826300, 126)



In [31]:
# Uncomment to free resources if needed
# del df
# del tmp
# del result

In [37]:
# Use to generate smaller, partitioned parquet files that can be written efficiently to S3
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
from tqdm.notebook import tqdm
files = ['min.parquet.gz', 'max.parquet.gz', 'mean.parquet.gz', 'cumulative.parquet.gz', 'std.parquet.gz']
for file in tqdm(files):
    result = pd.read_parquet(file).drop(cols, axis=1) # Drop cols containing the base features
    display(result.head().index)
    result['year'] = result.index.get_level_values(0).year
    table = pa.Table.from_pandas(result)
    root_path = file.split('.')[0] # Split by aggregation type
    pq.write_to_dataset(table, root_path=root_path, partition_cols=['year'], 
                    flavor='spark')

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

MultiIndex([('1999-01-01', 33.56666666666667, -117.97499996666667),
            ('1999-01-01', 33.56666666666667, -117.93333330000002),
            ('1999-01-01', 33.56666666666667, -117.89166663333334),
            ('1999-01-01', 33.56666666666667, -117.84999996666667),
            ('1999-01-01', 33.56666666666667, -117.80833330000002)],
           names=['date', 'latitude', 'longitude'])

MultiIndex([('1999-01-01', 33.56666666666667, -117.97499996666667),
            ('1999-01-01', 33.56666666666667, -117.93333330000002),
            ('1999-01-01', 33.56666666666667, -117.89166663333334),
            ('1999-01-01', 33.56666666666667, -117.84999996666667),
            ('1999-01-01', 33.56666666666667, -117.80833330000002)],
           names=['date', 'latitude', 'longitude'])

MultiIndex([('1999-01-01', 33.56666666666667, -117.97499996666667),
            ('1999-01-01', 33.56666666666667, -117.93333330000002),
            ('1999-01-01', 33.56666666666667, -117.89166663333334),
            ('1999-01-01', 33.56666666666667, -117.84999996666667),
            ('1999-01-01', 33.56666666666667, -117.80833330000002)],
           names=['date', 'latitude', 'longitude'])

MultiIndex([('1999-01-01', 33.56666666666667, -117.97499996666667),
            ('1999-01-01', 33.56666666666667, -117.93333330000002),
            ('1999-01-01', 33.56666666666667, -117.89166663333334),
            ('1999-01-01', 33.56666666666667, -117.84999996666667),
            ('1999-01-01', 33.56666666666667, -117.80833330000002)],
           names=['date', 'latitude', 'longitude'])

MultiIndex([('1999-01-01', 33.56666666666667, -117.97499996666667),
            ('1999-01-01', 33.56666666666667, -117.93333330000002),
            ('1999-01-01', 33.56666666666667, -117.89166663333334),
            ('1999-01-01', 33.56666666666667, -117.84999996666667),
            ('1999-01-01', 33.56666666666667, -117.80833330000002)],
           names=['date', 'latitude', 'longitude'])




In [31]:
# Uncomment to free resources if needed
# del table
# del result
# del df
# del tmp

In [51]:
%%time
import boto3
import pandas as pd
import os
s3_url = 'dse-cohort5-group5'
object_base = "wildfire_capstone/timeLaggedGridMetFeatures/partitioned/{agg}/{part}/{fname}"
s3 = boto3.client('s3')
dirs = ['min', 'max', 'mean', 'cumulative', 'std']
for agg in tqdm(dirs[4:]):
    print("reading:", agg)
    parts = os.listdir(agg)
    tree = {part: os.listdir(os.path.join(agg, part)) for part in parts}
    for part, files in tqdm(tree.items()):
        for fname in files:
            object_name = object_base.format(agg=agg, part=part, fname=fname)
            with open(os.path.join(agg, part, fname), "rb") as f:
                s3.upload_fileobj(f, s3_url, object_name)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))

reading: std


HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))



Wall time: 7h 18min 52s


In [52]:
s3 = boto3.client("s3")
all_objects = s3.list_objects(Bucket='dse-cohort5-group5')
display(all_objects)

{'ResponseMetadata': {'RequestId': '5447C331784490FC',
  'HostId': 'lHN53XwtravNFDvzt5UU3MrIX8es974B0jMUgntCs/iMdGzCQSU0gAVLJaDPNNHASoWNF334O/U=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'lHN53XwtravNFDvzt5UU3MrIX8es974B0jMUgntCs/iMdGzCQSU0gAVLJaDPNNHASoWNF334O/U=',
   'x-amz-request-id': '5447C331784490FC',
   'date': 'Tue, 07 Apr 2020 02:21:42 GMT',
   'x-amz-bucket-region': 'us-west-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'IsTruncated': False,
 'Marker': '',
 'Contents': [{'Key': 'test/',
   'LastModified': datetime.datetime(2020, 2, 14, 2, 26, 50, tzinfo=tzutc()),
   'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'Size': 0,
   'StorageClass': 'STANDARD',
   'Owner': {'DisplayName': 'kcoakley+cohort5group5',
    'ID': 'fe3a1755292b139ad5397ba537aa0aff32ef6ebba07fe45dcbe486d81e1ccc11'}},
  {'Key': 'wildfire_capstone/gridMet.parquet.gz',
   'LastModified': datetime.datetime(2020, 