# About this notebook

The purpose of this notebook is to upload previously acquired gridMet data to Amazon s3. Before running this notebook, you should have previously run notebook 1 in this directory in order to download gridMet data locally, potentially changing the range of years, in order to acquire data for the desired date range.

In [1]:
import boto3
import pandas as pd
import os
import re
import numpy as np
from pprint import pprint

In [2]:
gridMet_yearly_data = [f for f in os.listdir('.') if re.match(r'gridMet_\d{4}.parquet.gz', f)]
pprint(gridMet_yearly_data)

['gridMet_2000.parquet.gz',
 'gridMet_2001.parquet.gz',
 'gridMet_2002.parquet.gz',
 'gridMet_2003.parquet.gz',
 'gridMet_2004.parquet.gz',
 'gridMet_2005.parquet.gz',
 'gridMet_2006.parquet.gz',
 'gridMet_2007.parquet.gz',
 'gridMet_2008.parquet.gz',
 'gridMet_2009.parquet.gz',
 'gridMet_2010.parquet.gz',
 'gridMet_2011.parquet.gz',
 'gridMet_2012.parquet.gz',
 'gridMet_2013.parquet.gz',
 'gridMet_2014.parquet.gz',
 'gridMet_2015.parquet.gz',
 'gridMet_2016.parquet.gz',
 'gridMet_2017.parquet.gz',
 'gridMet_2018.parquet.gz',
 'gridMet_2019.parquet.gz']


In [3]:
%%time
df = pd.DataFrame()
for file in gridMet_yearly_data:
    df = pd.concat([df, pd.read_parquet(file)])
print(df.shape)

(10168560, 14)
Wall time: 15.2 s


In [4]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa
date,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000-01-01,33.566667,-117.975,0.0,40.5,0.006,139.7,222.0,2.1,292.0,282.2,31.0,12.3,12.1,48.0,1.8,0.69
2000-01-01,33.566667,-117.933333,0.0,40.9,0.00593,136.6,222.0,2.1,291.9,282.2,32.0,12.2,12.0,48.0,1.8,0.7
2000-01-01,33.566667,-117.891667,0.0,41.2,0.0058,133.2,222.0,2.3,292.1,282.1,33.0,11.8,11.5,50.0,1.9,0.72
2000-01-01,33.566667,-117.85,0.0,40.8,0.00577,129.8,218.0,2.4,292.0,281.4,34.0,11.8,11.4,50.0,1.9,0.69
2000-01-01,33.566667,-117.808333,0.0,38.2,0.00547,126.5,218.0,2.5,292.6,281.0,37.0,10.7,10.5,55.0,2.1,0.78


In [5]:
df.describe()

Unnamed: 0,precipitation_amount_mm,relative_humidity_%,specific_humidity_kg/kg,surface_downwelling_shortwave_flux_in_air_W m-2,wind_from_direction_Degrees Clockwise from north,wind_speed_m/s,max_air_temperature_K,min_air_temperature_K,burning_index_g_Unitless,dead_fuel_moisture_100hr_Percent,dead_fuel_moisture_1000hr_Percent,energy_release_component-g_Unitless,potential_evapotranspiration_mm,mean_vapor_pressure_deficit_kPa
count,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0,6501450.0
mean,0.8502061,30.11313,0.006529446,238.6129,226.4572,2.97989,297.6179,283.4824,37.09669,12.03044,12.76973,50.19266,4.32674,1.35572
std,4.372622,17.7752,0.002913681,80.01784,77.45626,1.259263,7.789443,6.112466,19.1151,4.546625,4.040355,20.97613,1.93578,0.9774859
min,0.0,0.1,0.00024,4.5,0.0,0.4,268.6,257.5,0.0,1.9,3.4,0.0,0.2,0.0
25%,0.0,15.4,0.00433,165.1,196.0,2.2,292.1,279.1,27.0,8.2,9.7,34.0,2.7,0.63
50%,0.0,26.8,0.00622,246.1,241.0,2.7,297.0,283.4,37.0,12.1,12.9,47.0,4.2,1.09
75%,0.0,42.7,0.00846,312.9,277.0,3.5,303.1,288.0,50.0,15.4,15.5,66.0,5.7,1.83
max,287.1,100.0,0.02392,377.7,360.0,16.2,322.7,307.4,151.0,30.6,30.8,110.0,13.2,7.17


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 10168560 entries, (Timestamp('2000-01-01 00:00:00'), 33.56666666666667, -117.97499996666667) to (Timestamp('2019-12-31 00:00:00'), 32.400000000000006, -116.01666663333334)
Data columns (total 14 columns):
 #   Column                                            Dtype  
---  ------                                            -----  
 0   precipitation_amount_mm                           float64
 1   relative_humidity_%                               float64
 2   specific_humidity_kg/kg                           float64
 3   surface_downwelling_shortwave_flux_in_air_W m-2   float64
 4   wind_from_direction_Degrees Clockwise from north  float64
 5   wind_speed_m/s                                    float64
 6   max_air_temperature_K                             float64
 7   min_air_temperature_K                             float64
 8   burning_index_g_Unitless                          float64
 9   dead_fuel_moisture_100hr_Percent               

In [7]:
np.sum(df.isna())

precipitation_amount_mm                             3667110
relative_humidity_%                                 3667110
specific_humidity_kg/kg                             3667110
surface_downwelling_shortwave_flux_in_air_W m-2     3667110
wind_from_direction_Degrees Clockwise from north    3667110
wind_speed_m/s                                      3667110
max_air_temperature_K                               3667110
min_air_temperature_K                               3667110
burning_index_g_Unitless                            3667110
dead_fuel_moisture_100hr_Percent                    3667110
dead_fuel_moisture_1000hr_Percent                   3667110
energy_release_component-g_Unitless                 3667110
potential_evapotranspiration_mm                     3667110
mean_vapor_pressure_deficit_kPa                     3667110
dtype: int64

In [8]:
s3_url = 'dse-cohort5-group5'
s3 = boto3.client("s3")
all_objects = s3.list_objects(Bucket=s3_url)
all_objects

{'ResponseMetadata': {'RequestId': '09FE33CDDF4788E8',
  'HostId': 'iUDfMPonCvu9NY5xFBNJ89DxKQ/2gDFtxtAo9g1W68KLLPCaRSN0/O0VSSIveBzYOBRbpgb0UsU=',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amz-id-2': 'iUDfMPonCvu9NY5xFBNJ89DxKQ/2gDFtxtAo9g1W68KLLPCaRSN0/O0VSSIveBzYOBRbpgb0UsU=',
   'x-amz-request-id': '09FE33CDDF4788E8',
   'date': 'Sun, 23 Feb 2020 03:00:48 GMT',
   'x-amz-bucket-region': 'us-west-1',
   'content-type': 'application/xml',
   'transfer-encoding': 'chunked',
   'server': 'AmazonS3'},
  'RetryAttempts': 1},
 'IsTruncated': False,
 'Marker': '',
 'Contents': [{'Key': 'test/',
   'LastModified': datetime.datetime(2020, 2, 14, 2, 26, 50, tzinfo=tzutc()),
   'ETag': '"d41d8cd98f00b204e9800998ecf8427e"',
   'Size': 0,
   'StorageClass': 'STANDARD',
   'Owner': {'DisplayName': 'kcoakley+cohort5group5',
    'ID': 'fe3a1755292b139ad5397ba537aa0aff32ef6ebba07fe45dcbe486d81e1ccc11'}}],
 'Name': 'dse-cohort5-group5',
 'Prefix': '',
 'MaxKeys': 1000,
 'EncodingType': 'url'}

In [9]:
s3_url = "s3://dse-cohort5-group5/wildfire_capstone/gridMet.parquet.gz"
df.to_parquet(s3_url, compression="gzip")