# Chapter 13: Dates in the Index

In [3]:
import pandas as pd
import numpy as np

In [6]:
# Weather observations CSV (per the file name: Alta, NOAA, 1980-2019)
url = 'https://github.com/mattharrison/datasets/raw/master/data/alta-noaa-1980-2019.csv'
alta_df = pd.read_csv(url)
# Parse the DATE column into datetimes; used below as the index labels
dates = pd.to_datetime(alta_df['DATE'])

In [7]:
# Passing a Series to ``.rename`` treats it as a mapping: every existing
# index label is looked up in ``dates``, so the SNOW column ends up
# indexed by date while keeping its name.
snow = (alta_df.SNOW.rename(dates))

In [9]:
snow

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

## 13.1 Finding Missing Data

- Check whether any values are missing by chaining ``.isna`` with ``.any``

In [10]:
snow.isna().any()

True

In [11]:
snow[snow.isna()]

1985-07-30   NaN
1985-09-12   NaN
1985-09-19   NaN
1986-02-07   NaN
1986-06-26   NaN
              ..
2017-04-26   NaN
2017-09-20   NaN
2017-10-02   NaN
2017-12-23   NaN
2018-12-03   NaN
Name: SNOW, Length: 365, dtype: float64

In [12]:
snow.loc['1985-09':'1985-09-20']

1985-09-01    0.0
1985-09-02    0.0
1985-09-03    0.0
1985-09-04    0.0
1985-09-05    0.0
1985-09-06    0.0
1985-09-07    0.0
1985-09-08    0.0
1985-09-09    0.0
1985-09-10    0.0
1985-09-11    0.0
1985-09-12    NaN
1985-09-13    0.0
1985-09-14    0.0
1985-09-15    0.0
1985-09-16    0.0
1985-09-17    0.0
1985-09-18    0.0
1985-09-19    NaN
1985-09-20    0.0
Name: SNOW, dtype: float64

## 13.2 Filling in Missing Data

In [13]:
(snow
.loc['1985-09': '1985-09-20']
.fillna(0))

1985-09-01    0.0
1985-09-02    0.0
1985-09-03    0.0
1985-09-04    0.0
1985-09-05    0.0
1985-09-06    0.0
1985-09-07    0.0
1985-09-08    0.0
1985-09-09    0.0
1985-09-10    0.0
1985-09-11    0.0
1985-09-12    0.0
1985-09-13    0.0
1985-09-14    0.0
1985-09-15    0.0
1985-09-16    0.0
1985-09-17    0.0
1985-09-18    0.0
1985-09-19    0.0
1985-09-20    0.0
Name: SNOW, dtype: float64

## 13.3 Interpolation

In [14]:
(snow
.loc['1987-12-30':'1988-01-10']
.interpolate())

1987-12-30    6.0
1987-12-31    5.0
1988-01-01    2.5
1988-01-02    0.0
1988-01-03    0.0
1988-01-04    1.0
1988-01-05    2.0
1988-01-06    6.0
1988-01-07    4.0
1988-01-08    9.0
1988-01-09    5.0
1988-01-10    2.0
Name: SNOW, dtype: float64

- Interpolate when it is winter and we are missing snow values
- This corresponds to quarter 1 and 4
- The ``.where`` method keeps values where the first parameter is True and replaces the remaining values with the second parameter

In [15]:
# Boolean mask: True for dates falling in quarter 1 or quarter 4
winter = snow.index.quarter.isin([1, 4])

In [16]:
winter

array([ True,  True,  True, ..., False, False, False])

In [17]:
# Missing readings in winter are interpolated; missing readings outside
# winter are replaced with 0.
winter_gap = winter & snow.isna()
off_season_gap = ~winter & snow.isna()
(snow
 .where(~winter_gap, snow.interpolate())
 .where(~off_season_gap, 0)
)

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

## 13.4 Dropping Missing Values

In [18]:
(snow.loc['1987-12-30':'1988-01-10']
.dropna())

1987-12-30    6.0
1987-12-31    5.0
1988-01-02    0.0
1988-01-03    0.0
1988-01-05    2.0
1988-01-06    6.0
1988-01-07    4.0
1988-01-08    9.0
1988-01-09    5.0
1988-01-10    2.0
Name: SNOW, dtype: float64

## 13.5 Shifting Data

- Shift the data up or down

In [19]:
snow.shift(1)

1980-01-01    NaN
1980-01-02    2.0
1980-01-03    3.0
1980-01-04    1.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

## 13.6 Rolling Average

- We can calculate a five-day moving average in two ways

In [20]:
# method 1
(snow
.add(snow.shift(1))
.add(snow.shift(2))
.add(snow.shift(3))
.add(snow.shift(4))
.div(5))

1980-01-01    NaN
1980-01-02    NaN
1980-01-03    NaN
1980-01-04    NaN
1980-01-05    1.2
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

In [21]:
# method 2
(snow
.rolling(5)
.mean())

1980-01-01    NaN
1980-01-02    NaN
1980-01-03    NaN
1980-01-04    NaN
1980-01-05    1.2
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

## 13.7 Resampling

- We use the ``.resample`` method to aggregate values at different levels
- If we have dates in the index, we can use the ``.resample`` method to aggregate at date frequencies
- At a high level, we group date entries by some interval (yearly, monthly, weekly) and then aggregate the values within each interval
- The 'M' string in the ``.resample`` call is called an offset alias. Using 'M' means group all values by the end of the month
- If we want to aggregate at the end of every two months, we can use '2M' as the offset alias
- If we want to aggregate the maximum value for each ski season, which normally ends in May, we can use 'A-MAY'. It indicates an annual grouping that ends in May of each year

In [22]:
# group all values by the end of the month
(snow
.resample('M')
.max())

1980-01-31    20.0
1980-02-29    25.0
1980-03-31    16.0
1980-04-30    10.0
1980-05-31     9.0
              ... 
2019-05-31     5.1
2019-06-30     0.0
2019-07-31     0.0
2019-08-31     0.0
2019-09-30     0.0
Freq: M, Name: SNOW, Length: 477, dtype: float64

In [23]:
# group all values at the end of every two months
(snow
.resample('2M')
.max())

1980-01-31    20.0
1980-03-31    25.0
1980-05-31    10.0
1980-07-31     1.0
1980-09-30     0.0
              ... 
2019-01-31    19.0
2019-03-31    20.7
2019-05-31    18.0
2019-07-31     0.0
2019-09-30     0.0
Freq: 2M, Name: SNOW, Length: 239, dtype: float64

In [24]:
# group annually but ending in May
(snow
.resample('A-MAY')
.max())

1980-05-31    25.0
1981-05-31    26.0
1982-05-31    34.0
1983-05-31    38.0
1984-05-31    25.0
1985-05-31    22.0
1986-05-31    34.0
1987-05-31    16.0
1988-05-31    23.0
1989-05-31    30.0
1990-05-31    32.0
1991-05-31    28.0
1992-05-31    22.0
1993-05-31    30.0
1994-05-31    36.0
1995-05-31    25.0
1996-05-31    34.0
1997-05-31    22.0
1998-05-31    29.0
1999-05-31    26.0
2000-05-31    23.0
2001-05-31    19.0
2002-05-31    28.0
2003-05-31    14.0
2004-05-31    24.0
2005-05-31    31.0
2006-05-31    27.0
2007-05-31    15.0
2008-05-31    21.0
2009-05-31    23.0
2010-05-31    32.0
2011-05-31    22.0
2012-05-31    18.0
2013-05-31    19.0
2014-05-31    11.0
2015-05-31    25.0
2016-05-31    15.0
2017-05-31    26.0
2018-05-31    21.8
2019-05-31    20.7
2020-05-31     0.0
Freq: A-MAY, Name: SNOW, dtype: float64

## 13.8 Gathering Aggregate Values (But Keeping Index)

- Instead of performing aggregation with ``.resample``, we leverage the ``.transform`` method
- It works on aggregation groups but returns a series with original index

In [27]:
# Each day's snowfall as a percentage of the total for its quarter.
# ``.transform('sum')`` returns the quarterly total aligned to the daily index.
quarter_total = snow.resample('Q').transform('sum')
(snow
 .div(quarter_total)
 .mul(100)
 .fillna(0)  # missing readings and 0/0 (snow-free quarters) are NaN; show them as 0
)

1980-01-01    0.527009
1980-01-02    0.790514
1980-01-03    0.263505
1980-01-04    0.000000
1980-01-05    0.000000
                ...   
2019-09-03    0.000000
2019-09-04    0.000000
2019-09-05    0.000000
2019-09-06    0.000000
2019-09-07    0.000000
Name: SNOW, Length: 14160, dtype: float64

- To compute the percentage of a season's snowfall that fell during each month, select the season, sum by month, and divide by the season total

In [31]:
season2017 = snow.loc['2016-10':'2017-05']
season2017

2016-10-01    0.0
2016-10-02    0.0
2016-10-03    4.9
2016-10-04    0.0
2016-10-05    0.6
             ... 
2017-05-27    0.0
2017-05-28    0.0
2017-05-29    0.0
2017-05-30    0.0
2017-05-31    0.0
Name: SNOW, Length: 243, dtype: float64

In [29]:
(season2017
.resample('M')
.sum()
.div(season2017
     .sum())
.mul(100)
)

2016-10-31     2.153969
2016-11-30     9.772637
2016-12-31    15.715995
2017-01-31    25.468688
2017-02-28    21.041085
2017-03-31     9.274033
2017-04-30    14.738732
2017-05-31     1.834862
Freq: M, Name: SNOW, dtype: float64

## 13.9 Groupby Operations

In [32]:
def season(idx):
    """
    Return the ski season that each date belongs to.

    A season is labelled by the calendar year in which it ends:
    January-September dates keep their own year, while
    October-December dates are assigned to the following year.

    Parameters
    ----------
    idx : pandas.DatetimeIndex or pandas.Timestamp
        Either a whole datetime index or a single index value.
        ``.groupby`` is documented to call a function on each value
        of the index, so both forms are supported.

    Returns
    -------
    pandas.Index or int
        The season year(s), in the same form (index or scalar) as ``idx``.
    """
    year = idx.year
    month = idx.month
    if hasattr(year, 'where'):
        # Index input: keep the year where month < 10, otherwise use year + 1
        return year.where((month < 10), year+1)
    # Scalar input: ``year`` and ``month`` are plain numbers without ``.where``
    return year if month < 10 else year + 1

In [51]:
(snow
.groupby(season)
.sum())

1980    457.5
1981    503.0
1982    842.5
1983    807.5
1984    816.0
1985    536.0
1986    740.8
1987    243.1
1988    314.5
1989    429.5
1990    331.5
1991    504.7
1992    340.8
1993    683.5
1994    321.0
1995    645.0
1996    525.5
1997    563.6
1998    579.6
1999    435.7
2000    453.0
2001    468.0
2002    457.8
2003    365.4
2004    514.0
2005    472.0
2006    594.6
2007    319.7
2008    606.0
2009    476.8
2010    391.0
2011    533.8
2012    293.5
2013    362.8
2014    358.7
2015    284.3
2016    354.6
2017    524.0
2018    308.8
2019    504.5
Name: SNOW, dtype: float64

- If we have dates in the index, we can use the ``.resample`` method to aggregate at date frequencies
- The ``.transform`` method will take the resulting aggregates and place them back in the cell that contributed to the value

In [52]:
(snow
.resample('M')
.sum())

1980-01-31    144.0
1980-02-29    112.5
1980-03-31    123.0
1980-04-30     30.0
1980-05-31     47.0
              ...  
2019-05-31     26.0
2019-06-30      0.0
2019-07-31      0.0
2019-08-31      0.0
2019-09-30      0.0
Freq: M, Name: SNOW, Length: 477, dtype: float64

In [54]:
(snow
.resample('M')
.transform('sum')
)

1980-01-01    144.0
1980-01-02    144.0
1980-01-03    144.0
1980-01-04    144.0
1980-01-05    144.0
              ...  
2019-09-03      0.0
2019-09-04      0.0
2019-09-05      0.0
2019-09-06      0.0
2019-09-07      0.0
Name: SNOW, Length: 14160, dtype: float64

## 13.10 Cumulative Operations

- ``.cummin``: returns cumulative minimum
- ``.cummax``: returns cumulative maximum
- ``.cumprod``: returns cumulative product
- ``.cumsum``: returns cumulative sum

In [55]:
(snow
.loc['2016-10':'2017-09']
.cumsum())

2016-10-01      0.0
2016-10-02      0.0
2016-10-03      4.9
2016-10-04      4.9
2016-10-05      5.5
              ...  
2017-09-26    524.0
2017-09-27    524.0
2017-09-28    524.0
2017-09-29    524.0
2017-09-30    524.0
Name: SNOW, Length: 364, dtype: float64

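- If we want the running total for every season (each year ending in September), we can combine ``.resample`` with ``.transform('cumsum')``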

In [57]:
(snow
.resample('A-SEP')
.transform('cumsum'))

1980-01-01      2.0
1980-01-02      5.0
1980-01-03      6.0
1980-01-04      6.0
1980-01-05      6.0
              ...  
2019-09-03    504.5
2019-09-04    504.5
2019-09-05    504.5
2019-09-06    504.5
2019-09-07    504.5
Name: SNOW, Length: 14160, dtype: float64

In [58]:
snow

1980-01-01    2.0
1980-01-02    3.0
1980-01-03    1.0
1980-01-04    0.0
1980-01-05    0.0
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64

## Exercises

- Calculate the average value of the column for each month

In [60]:
(snow
.resample('M')
.mean())

1980-01-31    4.645161
1980-02-29    3.879310
1980-03-31    3.967742
1980-04-30    1.000000
1980-05-31    1.516129
                ...   
2019-05-31    0.838710
2019-06-30    0.000000
2019-07-31    0.000000
2019-08-31    0.000000
2019-09-30    0.000000
Freq: M, Name: SNOW, Length: 477, dtype: float64

- Calculate the average value of the column for every 2 months

In [61]:
(snow
.resample('2M')
.mean())

1980-01-31    4.645161
1980-03-31    3.925000
1980-05-31    1.262295
1980-07-31    0.016393
1980-09-30    0.000000
                ...   
2019-01-31    2.459016
2019-03-31    3.459322
2019-05-31    1.342623
2019-07-31    0.000000
2019-09-30    0.000000
Freq: 2M, Name: SNOW, Length: 239, dtype: float64

- Calculate the percentage of the column out of the total for each month

In [64]:
(snow
.resample('M')
.sum()
.div(snow
     .sum())
.mul(100)
)

1980-01-31    0.747485
1980-02-29    0.583973
1980-03-31    0.638477
1980-04-30    0.155726
1980-05-31    0.243971
                ...   
2019-05-31    0.134963
2019-06-30    0.000000
2019-07-31    0.000000
2019-08-31    0.000000
2019-09-30    0.000000
Freq: M, Name: SNOW, Length: 477, dtype: float64

- Calculate the average value of the column for a rolling window of size 7

In [67]:
# seven-day rolling mean; leading rows without a full window are NaN
(snow
.rolling(7)
.mean())

1980-01-01    NaN
1980-01-02    NaN
1980-01-03    NaN
1980-01-04    NaN
1980-01-05    NaN
             ... 
2019-09-03    0.0
2019-09-04    0.0
2019-09-05    0.0
2019-09-06    0.0
2019-09-07    0.0
Name: SNOW, Length: 14160, dtype: float64