# About this notebook

The purpose of this notebook is to upload previously acquired gridMet data to Amazon s3. Before running this notebook, you should have previously run notebook 1 in this directory in order to download gridMet data locally, potentially changing the range of years, in order to acquire data for the desired date range.

In [1]:
import boto3
import pandas as pd
import os
import re
import numpy as np
from pprint import pprint

In [2]:
gridMet_yearly_data = [f for f in os.listdir('.') if re.match(r'gridMet_\d{4}.parquet.gz', f)]
pprint(gridMet_yearly_data)

['gridMet_1999.parquet.gz',
 'gridMet_2000.parquet.gz',
 'gridMet_2001.parquet.gz',
 'gridMet_2002.parquet.gz',
 'gridMet_2003.parquet.gz',
 'gridMet_2004.parquet.gz',
 'gridMet_2005.parquet.gz',
 'gridMet_2006.parquet.gz',
 'gridMet_2007.parquet.gz',
 'gridMet_2008.parquet.gz',
 'gridMet_2009.parquet.gz',
 'gridMet_2010.parquet.gz',
 'gridMet_2011.parquet.gz',
 'gridMet_2012.parquet.gz',
 'gridMet_2013.parquet.gz',
 'gridMet_2014.parquet.gz',
 'gridMet_2015.parquet.gz',
 'gridMet_2016.parquet.gz',
 'gridMet_2017.parquet.gz',
 'gridMet_2018.parquet.gz',
 'gridMet_2019.parquet.gz']


In [3]:
%%time
df = pd.DataFrame()
for file in gridMet_yearly_data:
    df = pd.concat([df, pd.read_parquet(file)])
print(df.shape)

(10676640, 14)
Wall time: 17.4 s


In [4]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa
date,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1999-01-01,33.566667,-117.975,0.0,40.3,0.00589,138.0,123.0,1.6,293.1,281.1,24.0,16.0,15.5,34.0,1.7,0.74
1999-01-01,33.566667,-117.933333,0.0,39.8,0.0059,137.2,123.0,1.6,293.1,281.2,24.0,16.0,15.5,35.0,1.7,0.74
1999-01-01,33.566667,-117.891667,0.0,38.2,0.0058,137.2,123.0,1.7,293.2,281.2,26.0,15.2,14.9,37.0,1.7,0.77
1999-01-01,33.566667,-117.85,0.0,36.4,0.00567,137.3,49.0,1.8,293.3,280.3,27.0,15.0,14.8,38.0,1.8,0.76
1999-01-01,33.566667,-117.808333,0.0,33.8,0.00538,137.3,49.0,1.9,293.2,279.9,29.0,13.3,13.6,43.0,1.9,0.8


In [5]:
df.describe()

Unnamed: 0,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa
count,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0,6826300.0
mean,0.8304122,30.00426,0.00650333,238.5361,226.6884,2.980093,297.6084,283.4156,37.14322,12.01792,12.75536,50.26444,4.325072,1.354994
std,4.286879,17.69667,0.002908315,79.89067,77.85217,1.255808,7.771557,6.103147,19.10992,4.536604,4.035236,20.9661,1.928999,0.9730105
min,0.0,0.1,0.00024,4.5,0.0,0.4,268.6,257.5,0.0,1.9,3.4,0.0,0.2,0.0
25%,0.0,15.4,0.00431,164.9,197.0,2.2,292.1,279.1,27.0,8.1,9.6,34.0,2.7,0.63
50%,0.0,26.7,0.0062,245.6,241.0,2.7,297.0,283.3,37.0,12.1,12.9,47.0,4.2,1.09
75%,0.0,42.6,0.00843,312.9,277.0,3.5,303.1,287.9,50.0,15.4,15.6,66.0,5.7,1.83
max,287.1,100.0,0.02392,377.7,360.0,16.2,322.7,307.4,151.0,30.6,30.8,110.0,13.2,7.17


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10676640 entries, (Timestamp('1999-01-01 00:00:00'), 33.56666666666667, -117.97499996666667) to (Timestamp('2019-12-31 00:00:00'), 32.400000000000006, -116.01666663333334)
Data columns (total 14 columns):
 #   Column                                            Dtype  
---  ------                                            -----  
 0   precipitation_amount_mm                           float64
 1   relative_humidity_%                               float64
 2   specific_humidity_kg/kg                           float64
 3   surface_downwelling_shortwave_flux_in_air_W m-2   float64
 4   wind_from_direction_Degrees Clockwise from north  float64
 5   wind_speed_m/s                                    float64
 6   max_air_temperature_K                             float64
 7   min_air_temperature_K                             float64
 8   burning_index_g_Unitless                          float64
 9   dead_fuel_moisture_100hr_Percent               

In [7]:
np.sum(df.isna())

precipitation_amount_mm                             3850340
relative_humidity_%                                 3850340
specific_humidity_kg/kg                             3850340
surface_downwelling_shortwave_flux_in_air_W m-2     3850340
wind_from_direction_Degrees Clockwise from north    3850340
wind_speed_m/s                                      3850340
max_air_temperature_K                               3850340
min_air_temperature_K                               3850340
burning_index_g_Unitless                            3850340
dead_fuel_moisture_100hr_Percent                    3850340
dead_fuel_moisture_1000hr_Percent                   3850340
energy_release_component-g_Unitless                 3850340
potential_evapotranspiration_mm                     3850340
mean_vapor_pressure_deficit_kPa                     3850340
dtype: int64

In [9]:
s3_url = 'dse-cohort5-group5'
s3 = boto3.client("s3")
all_objects = s3.list_objects(Bucket=s3_url)
all_objects

{'ResponseMetadata': {'RequestId': '92E5E72DFE2F2E58',
  'HostId': 'pJR7T7b5yeHrmuCSyvyYDuDOpDXf4i1o2nuEM9VXAEpEMTqaEASTFPZwWlTgnvldcb4vfR0KWoQ=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'pJR7T7b5yeHrmuCSyvyYDuDOpDXf4i1o2nuEM9VXAEpEMTqaEASTFPZwWlTgnvldcb4vfR0KWoQ=',
   'x-amz-request-id': '92E5E72DFE2F2E58',
   'date': 'Wed, 11 Mar 2020 04:39:13 GMT',
   'x-amz-bucket-region': 'us-west-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'IsTruncated': False,
 'Marker': '',
 'Contents': [{'Key': 'test/',
   'LastModified': datetime.datetime(2020, 2, 14, 2, 26, 50, tzinfo=tzutc()),
   'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'Size': 0,
   'StorageClass': 'STANDARD',
   'Owner': {'DisplayName': 'kcoakley+cohort5group5',
    'ID': 'fe3a1755292b139ad5397ba537aa0aff32ef6ebba07fe45dcbe486d81e1ccc11'}},
  {'Key': 'wildfire_capstone/gridMet.parquet.gz',
   'LastModified': datetime.datetime(2020, 

In [10]:
s3_url = "s3://dse-cohort5-group5/wildfire_capstone/gridMet.parquet.gz"
df.to_parquet(s3_url, compression="gzip")