-----------
## In this notebook:
* Import the NYSM mesonet data and drop unneeded columns
* Take the row-to-row difference of 'precip'
* Create precip and no-precip DataFrames and save them as pickles
-----------

In [1]:
import pandas as pd
import numpy as np

import warnings
# Ignore every warning category for the rest of the session.
# NOTE(review): this also silences pandas/numpy warnings that can point at
# real problems; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

## Load Data

In [2]:
# parquet file to load into the working DataFrame
mesonet_parquet = '/tf/NYSM/mesonet_parquet/201508-202012.parquet'
df = pd.read_parquet(mesonet_parquet)

In [3]:
# list the columns of the loaded frame before deciding which ones to drop
df.columns

Index(['index', 'station', 'time_5M', 'lat', 'lon', 'elev', 'tair', 'ta9m',
       'tslo', 'relh', 'srad', 'pres', 'wspd_sonic', 'wmax_sonic',
       'wssd_sonic', 'wdir_sonic', 'wdsd_sonic', 'wspd_prop', 'wmax_prop',
       'wssd_prop', 'wdir_prop', 'wdsd_prop', 'wspd_merge', 'wmax_merge',
       'wssd_merge', 'wdir_merge', 'wdsd_merge', 'precip', 'precip_total',
       'precip_max_intensity', 'SRAD1HR', 'ts05', 'ts25', 'ts50', 'sm05',
       'sm25', 'sm50', 'frozen05', 'frozen25', 'frozen50', 'snow_depth',
       'filedate', 'tair_qa', 'relh_qa', 'tslo_qa', 'ta9m_qa', 'srad_qa',
       'wspd_sonic_qa', 'wmax_sonic_qa', 'wssd_sonic_qa', 'wdir_sonic_qa',
       'wdsd_sonic_qa', 'wvec_sonic_qa', 'count_sonic_qa', 'wspd_prop_qa',
       'wmax_prop_qa', 'wssd_prop_qa', 'wdir_prop_qa', 'wdsd_prop_qa',
       'wvec_prop_qa', 'pres_qa', 'precip_qa', 'precip_maxint_qa',
       'precip_sumint_qa', 'precip_rt_nrt_qa', 'precip_nrt_qa',
       'precip_total_qa', 'sr50a_dist_qa', 'ts05_qa', 'er05_

In [4]:
# drop unnecessary variables
# The columns not listed here (including station, time_5M, tair, ta9m,
# precip, precip_total, precip_max_intensity, snow_depth and 'index') stay.

drop_vars = [
    'lat', 'lon', 'elev',
    'tslo', 'relh', 'srad', 'pres',
    # *_sonic, *_prop and *_merge columns
    'wspd_sonic', 'wmax_sonic', 'wssd_sonic', 'wdir_sonic', 'wdsd_sonic',
    'wspd_prop', 'wmax_prop', 'wssd_prop', 'wdir_prop', 'wdsd_prop',
    'wspd_merge', 'wmax_merge', 'wssd_merge', 'wdir_merge', 'wdsd_merge',
    'SRAD1HR',
    # ts*, sm* and frozen* columns
    'ts05', 'ts25', 'ts50',
    'sm05', 'sm25', 'sm50',
    'frozen05', 'frozen25', 'frozen50',
    'filedate',
    # *_qa columns
    'tair_qa', 'relh_qa', 'tslo_qa', 'ta9m_qa', 'srad_qa',
    'wspd_sonic_qa', 'wmax_sonic_qa', 'wssd_sonic_qa', 'wdir_sonic_qa',
    'wdsd_sonic_qa', 'wvec_sonic_qa', 'count_sonic_qa',
    'wspd_prop_qa', 'wmax_prop_qa', 'wssd_prop_qa', 'wdir_prop_qa',
    'wdsd_prop_qa', 'wvec_prop_qa',
    'pres_qa',
    'precip_qa', 'precip_maxint_qa', 'precip_sumint_qa',
    'precip_rt_nrt_qa', 'precip_nrt_qa', 'precip_total_qa',
    'sr50a_dist_qa',
    'ts05_qa', 'er05_qa', 'ert05_qa', 'lsm05_qa',
    'ts25_qa', 'er25_qa', 'ert25_qa', 'lsm25_qa',
    'ts50_qa', 'er50_qa', 'ert50_qa', 'lsm50_qa',
    'batv_qa', 'flsv_qa',
    'stnm', 'stid',
]

# rebinding df means this cell raises KeyError if it is run a second time
df = df.drop(columns=drop_vars)

## Precip vs No Precip

In [5]:
# use precip values and corresponding dates
values = np.array(df['precip'])
dates = np.array(df['time_5M'])

# differentiate to determine where there is/isn't precip
# Each row is compared with the row before it. The very first row has no
# predecessor, so it gets a -1 sentinel that matches neither test below.
values_diffs = np.diff(values)
values_diff = np.append([-1], values_diffs)

# df carries a 'station' column: wherever the station changes between two
# consecutive rows, the difference would compare readings from two different
# stations. Mark those rows with the same -1 sentinel so they are classified
# as neither precip nor no-precip.
# NOTE(review): assumes rows are grouped by station and time-ordered within
# each station -- confirm against how the parquet file was written.
station_start = df['station'].ne(df['station'].shift()).to_numpy()
values_diff[station_start] = -1

# define where there is precipitation
precip = np.where(values_diff > 0)
precip_dates = dates[precip]

# define where there is no precipitation
# (negative and NaN differences fall into neither group)
no_precip = np.where(values_diff == 0)
no_precip_dates = dates[no_precip]

In [6]:
# np.where returned a 1-tuple of row positions for each group, so every
# shape prints as (1, number_of_matching_rows)
for row_positions in (precip, no_precip):
    print(np.array(row_positions).shape)

(1, 3466255)
(1, 51950185)


In [7]:
# add differences to df
# values_diff has one entry per row of df (a leading sentinel followed by the
# consecutive differences), so it lines up with the frame row for row.
# This adds the column to df in place.
df['precip_diff'] = values_diff

## Convert to DFs

In [8]:
# create a precip df: rows whose 'precip' value is larger than the previous row's
# .copy() makes the subset an independent frame, so the column assignment
# below is not a write onto a slice of df (SettingWithCopyWarning).
precip_df = df[df['precip_diff'] > 0].copy()

# change index
# Overwrites the existing 'index' column with a 0..n-1 counter for this
# subset; the DataFrame's own row labels are left as they were in df.
precip_df['index'] = range(len(precip_df))

In [10]:
# preview the first 30 rows of the precip subset
precip_df.head(30)

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff
411,0,b'SCHU',2015-08-11 10:15:00,,,0.054,0.0,0.0,,0.054
412,1,b'SCHU',2015-08-11 10:20:00,,,0.196,0.054,0.0,,0.142
413,2,b'SCHU',2015-08-11 10:25:00,,,0.441,0.196,0.0,,0.245
414,3,b'SCHU',2015-08-11 10:30:00,,,0.702,0.441,0.0,,0.261
415,4,b'SCHU',2015-08-11 10:35:00,,,0.822,0.702,0.0,,0.12
416,5,b'SCHU',2015-08-11 10:40:00,,,0.978,0.822,0.0,,0.156
417,6,b'SCHU',2015-08-11 10:45:00,,,1.26,0.978,0.0,,0.282
418,7,b'SCHU',2015-08-11 10:50:00,,,1.56,1.26,0.0,,0.3
419,8,b'SCHU',2015-08-11 10:55:00,,,1.773,1.56,0.0,,0.213
420,9,b'SCHU',2015-08-11 11:00:00,,,2.161,1.773,0.0,,0.388


In [13]:
# save precip df
# Written as a pickle file; the relative path resolves against the current
# working directory.
precip_df.to_pickle("precip_df.pkl")

In [11]:
# create a no_precip df: rows whose 'precip' value equals the previous row's
# .copy() makes the subset an independent frame, so the column assignment
# below is not a write onto a slice of df (SettingWithCopyWarning).
no_precip_df = df[df['precip_diff'] == 0].copy()

# change index
# Overwrites the existing 'index' column with a 0..n-1 counter for this
# subset; the DataFrame's own row labels are left as they were in df.
no_precip_df['index'] = range(len(no_precip_df))

In [12]:
# preview the first 30 rows of the no-precip subset
no_precip_df.head(30)

Unnamed: 0,index,station,time_5M,tair,ta9m,precip,precip_total,precip_max_intensity,snow_depth,precip_diff
291,0,b'SCHU',2015-08-11 00:15:00,,,0.0,0.0,0.0,,0.0
292,1,b'SCHU',2015-08-11 00:20:00,,,0.0,0.0,0.0,,0.0
293,2,b'SCHU',2015-08-11 00:25:00,,,0.0,0.0,0.0,,0.0
294,3,b'SCHU',2015-08-11 00:30:00,,,0.0,0.0,0.0,,0.0
295,4,b'SCHU',2015-08-11 00:35:00,,,0.0,0.0,0.0,,0.0
296,5,b'SCHU',2015-08-11 00:40:00,,,0.0,0.0,0.0,,0.0
297,6,b'SCHU',2015-08-11 00:45:00,,,0.0,0.0,0.0,,0.0
298,7,b'SCHU',2015-08-11 00:50:00,,,0.0,0.0,0.0,,0.0
299,8,b'SCHU',2015-08-11 00:55:00,,,0.0,0.0,0.0,,0.0
300,9,b'SCHU',2015-08-11 01:00:00,,,0.0,0.0,0.0,,0.0


In [14]:
# save no precip df
# Written as a pickle file; the relative path resolves against the current
# working directory.
no_precip_df.to_pickle("no_precip_df.pkl")