In [1]:
import pandas as pd
url = "https://raw.githubusercontent.com/mattharrison/datasets/master/data/alta-noaa-1980-2019.csv"

In [2]:
alta_df = pd.read_csv(url, engine='pyarrow', dtype_backend='pyarrow')
dates = pd.to_datetime(alta_df.DATE)

In [3]:
snow = (alta_df # take the df you want to work from
        .SNOW # .heading that you want to use
        .rename(dates))

In [4]:
snow

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [5]:
snow.isna().any()

np.True_

In [6]:
snow[snow.isna()]

1985-07-30    <NA>
1985-09-12    <NA>
1985-09-19    <NA>
1986-02-07    <NA>
1986-06-26    <NA>
              ... 
2017-04-26    <NA>
2017-09-20    <NA>
2017-10-02    <NA>
2017-12-23    <NA>
2018-12-03    <NA>
Name: SNOW, Length: 365, dtype: double[pyarrow]

In [7]:
snow.loc['1985-09':'1985-09-20']

1985-09-01     0.0
1985-09-02     0.0
1985-09-03     0.0
1985-09-04     0.0
1985-09-05     0.0
1985-09-06     0.0
1985-09-07     0.0
1985-09-08     0.0
1985-09-09     0.0
1985-09-10     0.0
1985-09-11     0.0
1985-09-12    <NA>
1985-09-13     0.0
1985-09-14     0.0
1985-09-15     0.0
1985-09-16     0.0
1985-09-17     0.0
1985-09-18     0.0
1985-09-19    <NA>
1985-09-20     0.0
Name: SNOW, dtype: double[pyarrow]

In [8]:
(snow
 .loc['1985-09':'1985-09-20']
 .fillna(0))

1985-09-01    0.0
1985-09-02    0.0
1985-09-03    0.0
1985-09-04    0.0
1985-09-05    0.0
1985-09-06    0.0
1985-09-07    0.0
1985-09-08    0.0
1985-09-09    0.0
1985-09-10    0.0
1985-09-11    0.0
1985-09-12    0.0
1985-09-13    0.0
1985-09-14    0.0
1985-09-15    0.0
1985-09-16    0.0
1985-09-17    0.0
1985-09-18    0.0
1985-09-19    0.0
1985-09-20    0.0
Name: SNOW, dtype: double[pyarrow]

In [9]:
snow.loc['1987-12-30':'1988-01-10']

1987-12-30     6.0
1987-12-31     5.0
1988-01-01    <NA>
1988-01-02     0.0
1988-01-03     0.0
1988-01-04    <NA>
1988-01-05     2.0
1988-01-06     6.0
1988-01-07     4.0
1988-01-08     9.0
1988-01-09     5.0
1988-01-10     2.0
Name: SNOW, dtype: double[pyarrow]

In [10]:
# Forward fill
(snow
 .loc['1987-12-30':'1988-01-10']
 .ffill()
 )



1987-12-30    6.0
1987-12-31    5.0
1988-01-01    5.0
1988-01-02    0.0
1988-01-03    0.0
1988-01-04    0.0
1988-01-05    2.0
1988-01-06    6.0
1988-01-07    4.0
1988-01-08    9.0
1988-01-09    5.0
1988-01-10    2.0
Name: SNOW, dtype: double[pyarrow]

In [11]:
(snow
 .loc['1987-12-30':'1988-01-10']
 .bfill())

1987-12-30    6.0
1987-12-31    5.0
1988-01-01    0.0
1988-01-02    0.0
1988-01-03    0.0
1988-01-04    2.0
1988-01-05    2.0
1988-01-06    6.0
1988-01-07    4.0
1988-01-08    9.0
1988-01-09    5.0
1988-01-10    2.0
Name: SNOW, dtype: double[pyarrow]

In [12]:
(snow
 .loc['1987-12-30':'1988-01-10']
 .interpolate()
 )

1987-12-30    6.0
1987-12-31    5.0
1988-01-01    2.5
1988-01-02    0.0
1988-01-03    0.0
1988-01-04    1.0
1988-01-05    2.0
1988-01-06    6.0
1988-01-07    4.0
1988-01-08    9.0
1988-01-09    5.0
1988-01-10    2.0
Name: SNOW, dtype: double[pyarrow]

In [13]:
winter = (snow.index.quarter == 1) | (snow.index.quarter == 4) # symbol | is an OR operator, showing that winter is in the first and last quarter of the year

In [14]:
(snow.case_when(caselist=[(winter & snow.isna(), snow.interpolate()), # when winter and no data, interpolate
                          (~winter & snow.isna(), 0)])) # when NOT winter and no data, replace with 0

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [15]:
(snow
 .where(~(winter & snow.isna()), snow.interpolate())
 .where(~(~winter & snow.isna()), 0))

# very hard to read using .where compared to previous example

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [16]:
(snow.loc[['1985-09-19','1988-01-01']])

1985-09-19    <NA>
1988-01-01    <NA>
Name: SNOW, dtype: double[pyarrow]

In [17]:
(snow.case_when(caselist=[
    (winter & snow.isna(), snow.interpolate()),
    (~winter & snow.isna(), 0)
])
.loc[['1985-09-19','1988-01-01']])

1985-09-19    0.0
1988-01-01    2.5
Name: SNOW, dtype: double[pyarrow]

In [18]:
snow.shift(1)

1980-01-01    <NA>
1980-01-02     2.0
1980-01-03     3.0
1980-01-04     1.0
1980-01-05     0.0
              ... 
2019-09-03     0.0
2019-09-04     0.0
2019-09-05     0.0
2019-09-06     0.0
2019-09-07     0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [19]:
snow.shift(-1)

1980-01-01     3.0
1980-01-02     1.0
1980-01-03     0.0
1980-01-04     0.0
1980-01-05     1.0
              ... 
2019-09-03     0.0
2019-09-04     0.0
2019-09-05     0.0
2019-09-06     0.0
2019-09-07    <NA>
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [20]:
(snow
 .add(snow.shift(1)) # changes the allignment +1, then on top of that number, it adds that shifted value
 .add(snow.shift(2)) # same here
 .add(snow.shift(3)) # etc
 .add(snow.shift(4))
 .div(5) # creates the mean
 ).head(20)

1980-01-01    <NA>
1980-01-02    <NA>
1980-01-03    <NA>
1980-01-04    <NA>
1980-01-05     1.2
1980-01-06     1.0
1980-01-07     1.2
1980-01-08     2.2
1980-01-09     4.0
1980-01-10     7.8
1980-01-11    10.0
1980-01-12    10.6
1980-01-13    10.6
1980-01-14     9.6
1980-01-15     9.2
1980-01-16     7.0
1980-01-17     6.0
1980-01-18     5.8
1980-01-19     6.0
1980-01-20     2.6
Name: SNOW, dtype: double[pyarrow]

In [21]:
# more elegant version
(snow
 .rolling(5)
 .mean()
)

1980-01-01    NaN
1980-01-02    NaN
1980-01-03    NaN
1980-01-04    NaN
1980-01-05    1.2
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

In [22]:
(snow
 .resample('ME')
 .max()
 )

1980-01-31    20.0
1980-02-29    25.0
1980-03-31    16.0
1980-04-30    10.0
1980-05-31     9.0
              ... 
2019-05-31     5.1
2019-06-30     0.0
2019-07-31     0.0
2019-08-31     0.0
2019-09-30     0.0
Freq: ME, Name: SNOW, Length: 477, dtype: double[pyarrow]

In [23]:
(snow
 .resample('2ME')
 .max()
)

1980-01-31    20.0
1980-03-31    25.0
1980-05-31    10.0
1980-07-31     1.0
1980-09-30     0.0
              ... 
2019-01-31    19.0
2019-03-31    20.7
2019-05-31    18.0
2019-07-31     0.0
2019-09-30     0.0
Freq: 2ME, Name: SNOW, Length: 239, dtype: double[pyarrow]

In [24]:
(snow
 .resample('YE-MAY')
 .sum()
 )





1980-05-31    456.5
1981-05-31    495.0
1982-05-31    794.5
1983-05-31    855.0
1984-05-31    793.0
1985-05-31    564.3
1986-05-31    716.0
1987-05-31    272.1
1988-05-31    312.3
1989-05-31    431.7
1990-05-31    331.5
1991-05-31    504.7
1992-05-31    333.8
1993-05-31    673.5
1994-05-31    338.0
1995-05-31    627.0
1996-05-31    530.5
1997-05-31    576.6
1998-05-31    555.6
1999-05-31    454.7
2000-05-31    448.0
2001-05-31    471.0
2002-05-31    464.8
2003-05-31    353.4
2004-05-31    520.0
2005-05-31    476.0
2006-05-31    596.6
2007-05-31    289.1
2008-05-31    625.6
2009-05-31    475.3
2010-05-31    403.0
2011-05-31    534.2
2012-05-31    293.6
2013-05-31    352.8
2014-05-31    351.7
2015-05-31    301.3
2016-05-31    348.6
2017-05-31    507.4
2018-05-31    331.4
2019-05-31    504.5
2020-05-31      0.0
Freq: YE-MAY, Name: SNOW, dtype: double[pyarrow]

In [25]:
(snow
 .div(snow
      .resample('QE')
      .transform('sum'))
      .mul(100)
      .fillna(0)
      )

1980-01-01    0.527009
1980-01-02    0.790514
1980-01-03    0.263505
1980-01-04         0.0
1980-01-05         0.0
                ...   
2019-09-03         NaN
2019-09-04         NaN
2019-09-05         NaN
2019-09-06         NaN
2019-09-07         NaN
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [26]:
(snow
 .resample('ME')
 .sum()
)

1980-01-31    144.0
1980-02-29    112.5
1980-03-31    123.0
1980-04-30     30.0
1980-05-31     47.0
              ...  
2019-05-31     26.0
2019-06-30      0.0
2019-07-31      0.0
2019-08-31      0.0
2019-09-30      0.0
Freq: ME, Name: SNOW, Length: 477, dtype: double[pyarrow]

In [27]:
(snow
 .resample('ME')
 .transform('sum')
 )

1980-01-01    144.0
1980-01-02    144.0
1980-01-03    144.0
1980-01-04    144.0
1980-01-05    144.0
              ...  
2019-09-03      0.0
2019-09-04      0.0
2019-09-05      0.0
2019-09-06      0.0
2019-09-07      0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [28]:
season2017 = snow.loc['2016-10':'2017-05']

In [29]:
(season2017
 .resample('ME')
 .sum()
 .div(season2017
      .sum())
      .mul(100))


2016-10-31     2.153969
2016-11-30     9.772637
2016-12-31    15.715995
2017-01-31    25.468688
2017-02-28    21.041085
2017-03-31     9.274033
2017-04-30    14.738732
2017-05-31     1.834862
Freq: ME, Name: SNOW, dtype: double[pyarrow]

In [30]:

(season2017
 .resample('ME')
 .sum()
 .div(season2017
      .sum())
      .mul(100)
      )


2016-10-31     2.153969
2016-11-30     9.772637
2016-12-31    15.715995
2017-01-31    25.468688
2017-02-28    21.041085
2017-03-31     9.274033
2017-04-30    14.738732
2017-05-31     1.834862
Freq: ME, Name: SNOW, dtype: double[pyarrow]

In [58]:
def season(idx):
    year = idx.year
    month = idx.month
    return year.where((month < 10), year + 1)

In [57]:
(snow
 .groupby(season)
 .sum()
 )

1980    457.5
1981    503.0
1982    842.5
1983    807.5
1984    816.0
1985    536.0
1986    740.8
1987    243.1
1988    314.5
1989    429.5
1990    331.5
1991    504.7
1992    340.8
1993    683.5
1994    321.0
1995    645.0
1996    525.5
1997    563.6
1998    579.6
1999    435.7
2000    453.0
2001    468.0
2002    457.8
2003    365.4
2004    514.0
2005    472.0
2006    594.6
2007    319.7
2008    606.0
2009    476.8
2010    391.0
2011    533.8
2012    293.5
2013    362.8
2014    358.7
2015    284.3
2016    354.6
2017    524.0
2018    308.8
2019    504.5
Name: SNOW, dtype: double[pyarrow]

In [33]:
def calc_pct(s):
    return s.div(s.sum()).mul(100) 

In [34]:
(snow
 .resample('ME')
 .sum()
 .groupby(season)
 .apply(calc_pct)
 )

1980  1980-01-31     31.47541
      1980-02-29    24.590164
      1980-03-31    26.885246
      1980-04-30     6.557377
      1980-05-31    10.273224
                      ...    
2019  2019-05-31     5.153617
      2019-06-30          0.0
      2019-07-31          0.0
      2019-08-31          0.0
      2019-09-30          0.0
Name: SNOW, Length: 477, dtype: double[pyarrow]

In [35]:
(snow
 .resample('YE-SEP')
 .transform('cumsum')
 )

1980-01-01      2.0
1980-01-02      5.0
1980-01-03      6.0
1980-01-04      6.0
1980-01-05      6.0
              ...  
2019-09-03    504.5
2019-09-04    504.5
2019-09-05    504.5
2019-09-06    504.5
2019-09-07    504.5
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [36]:
# Exercises

In [37]:
# Convert a column with date information to a date
snow_with_year = snow.copy()
snow_with_year.index = snow_with_year.index.year
snow_with_year

1980    2.0
1980    3.0
1980    1.0
1980    0.0
1980    0.0
       ... 
2019    0.0
2019    0.0
2019    0.0
2019    0.0
2019    0.0
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [38]:
# Put the date information into the index for a numeric column
pd.set_option('display.max_columns', 22)

date_as_index = pd.to_datetime(alta_df.DATE)

location = ((alta_df[['LATITUDE','LONGITUDE','ELEVATION']])
            .rename(date_as_index)
            )
location


Unnamed: 0,LATITUDE,LONGITUDE,ELEVATION
1980-01-01,40.5905,-111.6369,2660.9
1980-01-02,40.5905,-111.6369,2660.9
1980-01-03,40.5905,-111.6369,2660.9
1980-01-04,40.5905,-111.6369,2660.9
1980-01-05,40.5905,-111.6369,2660.9
...,...,...,...
2019-09-03,40.5905,-111.6369,2660.9
2019-09-04,40.5905,-111.6369,2660.9
2019-09-05,40.5905,-111.6369,2660.9
2019-09-06,40.5905,-111.6369,2660.9


In [39]:
# Calculate the average value of the column for each month

(snow
 .resample('ME')
 .mean()
 )


1980-01-31    4.645161
1980-02-29     3.87931
1980-03-31    3.967742
1980-04-30         1.0
1980-05-31    1.516129
                ...   
2019-05-31     0.83871
2019-06-30         0.0
2019-07-31         0.0
2019-08-31         0.0
2019-09-30         0.0
Freq: ME, Name: SNOW, Length: 477, dtype: double[pyarrow]

In [40]:
# Calculate the average value of the column for every two months.

(snow
 .resample('2ME', closed='left')
 .mean()
 )


1980-02-29    4.313559
1980-04-30    2.540984
1980-06-30    0.786885
1980-08-31         0.0
1980-10-31    0.606557
                ...   
2019-02-28    3.461017
2019-04-30    2.406557
2019-06-30    0.496721
2019-08-31         0.0
2019-10-31         0.0
Freq: 2ME, Name: SNOW, Length: 239, dtype: double[pyarrow]

In [262]:
# Calculate the percentage of the column, out of the total, for each month.

snow2 = snow.copy()

(snow2
.resample('ME')
.sum()
.div((snow2
 .resample('YE')
 .sum()))).dropna().head(60)

1980-12-31    0.059807
1981-12-31    0.267248
1982-12-31    0.194232
1983-12-31    0.270315
1984-12-31    0.154457
1985-12-31    0.126453
1986-12-31    0.014013
1987-12-31    0.235741
1988-12-31    0.222914
1989-12-31    0.076119
1990-12-31    0.138458
1991-12-31    0.069918
1992-12-31    0.266247
1993-12-31    0.086806
1994-12-31    0.128755
1995-12-31    0.069856
1996-12-31    0.184926
1997-12-31    0.147071
1998-12-31    0.070721
1999-12-31    0.191465
2000-12-31     0.13475
2001-12-31    0.160026
2002-12-31    0.121125
2003-12-31    0.260104
2004-12-31         0.0
2005-12-31    0.237833
2006-12-31    0.088663
2007-12-31    0.349919
2008-12-31    0.169565
2009-12-31    0.159838
2010-12-31     0.19467
2011-12-31    0.037347
2012-12-31     0.25426
2013-12-31     0.15706
2014-12-31    0.188432
2015-12-31    0.354478
2016-12-31    0.211714
2017-12-31    0.084324
2018-12-31    0.162569
Name: SNOW, dtype: double[pyarrow]

In [None]:
(snow2
 .resample('YE')
 .transform('sum'))

1980-01-01    568.5
1980-01-02    568.5
1980-01-03    568.5
1980-01-04    568.5
1980-01-05    568.5
              ...  
2019-09-03    374.5
2019-09-04    374.5
2019-09-05    374.5
2019-09-06    374.5
2019-09-07    374.5
Name: SNOW, Length: 14160, dtype: double[pyarrow]

In [271]:
# Calculate the average value of the column for a rolling window of size 7.

(snow2
 .interpolate()
 .rolling(7)
 .mean()).dropna().head(60)

1980-01-07     1.571429
1980-01-08     2.142857
1980-01-09     3.000000
1980-01-10     5.571429
1980-01-11     7.285714
1980-01-12     8.285714
1980-01-13     9.000000
1980-01-14     9.000000
1980-01-15    10.571429
1980-01-16     9.428571
1980-01-17     7.000000
1980-01-18     6.000000
1980-01-19     5.714286
1980-01-20     4.857143
1980-01-21     4.285714
1980-01-22     1.857143
1980-01-23     1.714286
1980-01-24     1.428571
1980-01-25     0.714286
1980-01-26     0.000000
1980-01-27     0.000000
1980-01-28     1.714286
1980-01-29     2.857143
1980-01-30     5.714286
1980-01-31     5.714286
1980-02-01     5.714286
1980-02-02     5.714286
1980-02-03     5.714286
1980-02-04     4.142857
1980-02-05     3.000000
1980-02-06     0.142857
1980-02-07     1.428571
1980-02-08     1.428571
1980-02-09     1.428571
1980-02-10     1.428571
1980-02-11     1.285714
1980-02-12     1.285714
1980-02-13     1.285714
1980-02-14     0.214286
1980-02-15     3.785714
1980-02-16     4.500000
1980-02-17     4

In [275]:
# Using .loc pull out the first three months of a year.

(snow2.loc['1980-01':'1980-03'])


1980-01-01     2.0
1980-01-02     3.0
1980-01-03     1.0
1980-01-04     0.0
1980-01-05     0.0
              ... 
1980-03-27     0.0
1980-03-28    12.0
1980-03-29     0.0
1980-03-30     0.0
1980-03-31    13.0
Name: SNOW, Length: 91, dtype: double[pyarrow]

In [276]:
# Using .loc pull out the last four months of a year.

(snow2
 .loc['1980-08':'1980-12'])

1980-08-01    0.0
1980-08-02    0.0
1980-08-03    0.0
1980-08-04    0.0
1980-08-05    0.0
             ... 
1980-12-27    0.0
1980-12-28    0.0
1980-12-29    0.0
1980-12-30    0.0
1980-12-31    0.0
Name: SNOW, Length: 153, dtype: double[pyarrow]

In [280]:
snow2.loc[snow2.index.month >= 8]

# used for showing the last 4 months of every year, compared to specific year

1980-08-01    0.0
1980-08-02    0.0
1980-08-03    0.0
1980-08-04    0.0
1980-08-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 5842, dtype: double[pyarrow]