In [1]:
import pandas as pd
import numpy as np

In [4]:
# Seed NumPy's global generator so the random series (and everything derived
# from it further down) is identical on every Restart & Run All.
np.random.seed(0)

# 72 hourly timestamps = three full days starting 2011-01-01 00:00.
# 'h' is the current alias for hours; the upper-case 'H' is deprecated in pandas >= 2.2.
rng = pd.date_range('1/1/2011', periods=72, freq='h')

# One standard-normal draw per timestamp.
ts = pd.Series(np.random.randn(len(rng)), index=rng)
ts.head()

2011-01-01 00:00:00    2.116945
2011-01-01 01:00:00   -1.691962
2011-01-01 02:00:00    0.808121
2011-01-01 03:00:00    0.135274
2011-01-01 04:00:00   -1.060183
Freq: H, dtype: float64

## Resampling

There are two methods to resample the data.
1. Asfreq: Returns the original data sampled at the new index. If no data exists at a new index value, NaN is returned for it.
1. Resample: Returns a summarised version of the data sampled at a new index. For example, if a new interval runs from 1 second to 3 seconds, its value could be the sum or the mean of the original values that fall within that interval.

In [12]:
## Example asfreq
# Four one-minute timestamps. 'min' is the current alias for minutes; the
# single-letter 'T' is deprecated in pandas >= 2.2.
index = pd.date_range(start='1/1/2000', periods=4, freq='min')
original_df = pd.DataFrame(data=[0.0, None, 2.0, 3.0], index=index)
print(original_df)

# asfreq only looks up the values sitting at the new timestamps (every 90 seconds here).
resampled_df = original_df.asfreq(freq='1.5min')
print(resampled_df)  # NaN at 00:01:30 because the original index has no entry for that timestamp.

                       0
2000-01-01 00:00:00  0.0
2000-01-01 00:01:00  NaN
2000-01-01 00:02:00  2.0
2000-01-01 00:03:00  3.0
                       0
2000-01-01 00:00:00  0.0
2000-01-01 00:01:30  NaN
2000-01-01 00:03:00  3.0


In [18]:
## Example resample.
# Nine one-minute timestamps carrying the values 0..8. 'min' is the current
# alias for minutes; the single-letter 'T' is deprecated in pandas >= 2.2.
index = pd.date_range('1/1/2000', periods=9, freq='min')
series = pd.Series(range(9), index=index)
print(series)

# Group the values into 75-second bins and sum the values that fall inside each bin.
series.resample('1.25min').sum()

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64


2000-01-01 00:00:00     1
2000-01-01 00:01:15     2
2000-01-01 00:02:30     3
2000-01-01 00:03:45     4
2000-01-01 00:05:00    11
2000-01-01 00:06:15     7
2000-01-01 00:07:30     8
Freq: 75S, dtype: int64

## Methods for filling missing data points.
1. bfill: Uses the data from the next existing point in the original dataframe/series.
1. ffill: Uses the data from the previous existing point in the original dataframe/series.
1. None: Shows NaN for time indexes where no data exists.
1. pad: same as ffill

In [23]:
# Reference view: the first five rows of the hourly series.
ts.head(n=5)

2011-01-01 00:00:00    2.116945
2011-01-01 01:00:00   -1.691962
2011-01-01 02:00:00    0.808121
2011-01-01 03:00:00    0.135274
2011-01-01 04:00:00   -1.060183
Freq: H, dtype: float64

In [21]:
# Re-index the hourly series onto a 45-minute grid. With method='pad', a grid
# point without its own observation takes the most recent earlier one.
converted = ts.asfreq(freq='45Min', method='pad')
converted.head(n=5)

2011-01-01 00:00:00    2.116945
2011-01-01 00:45:00    2.116945
2011-01-01 01:30:00   -1.691962
2011-01-01 02:15:00    0.808121
2011-01-01 03:00:00    0.135274
Freq: 45T, dtype: float64

In [None]:
# Does asfreq change the # of rows?

In [None]:
# What do the different methods do?
# method : {'backfill', 'bfill', 'pad', 'ffill', None}

In [None]:
# Might any of these methods have pitfalls from a logical point of view?

In [25]:
# What's the difference between going to a higher frequency and a lower frequency?
# The first five rows of the hourly series, shown again for reference.
ts.head(n=5)

2011-01-01 00:00:00    2.116945
2011-01-01 01:00:00   -1.691962
2011-01-01 02:00:00    0.808121
2011-01-01 03:00:00    0.135274
2011-01-01 04:00:00   -1.060183
Freq: H, dtype: float64

In [27]:
# Re-index the hourly series onto a 90-minute grid. With method='bfill', a grid
# point without its own observation takes the next later one.
converted = ts.asfreq(freq='90Min', method='bfill')
converted.head(n=5)

2011-01-01 00:00:00    2.116945
2011-01-01 01:30:00    0.808121
2011-01-01 03:00:00    0.135274
2011-01-01 04:30:00    1.293753
2011-01-01 06:00:00   -0.072679
Freq: 90T, dtype: float64

In [None]:
# What's different logically about going to a higher frequency vs a lower frequency? 
# What do you want to do when switching to a lower frequency that would not be logical when switching to a higher frequency?

In [28]:
# Collapse the hourly values into one total per calendar day.
ts.resample(rule='D').sum()

2011-01-01    1.549852
2011-01-02    1.012978
2011-01-03    0.130671
Freq: D, dtype: float64

In [None]:
# What if you want to downsample and you don't want to ffill or bfill?

In [None]:
# What is the difference between .resample() and .asfreq()?

In [29]:
# What are some special things you can do with .resample() you can't do with .asfreq()?
# Build an irregular series: draw 10 distinct row positions at random. The rows
# are kept in draw order, so the resulting index is not sorted.
sample_positions = np.random.choice(a=len(ts), size=10, replace=False)

# .iloc makes the positional lookup explicit; plain ts[...] with integers on a
# DatetimeIndex relies on a positional fallback that pandas has deprecated.
irreg_ts = ts.iloc[sample_positions]
irreg_ts

2011-01-02 07:00:00    0.972082
2011-01-02 15:00:00   -0.645808
2011-01-01 00:00:00    2.116945
2011-01-02 16:00:00    1.619889
2011-01-02 02:00:00   -1.819982
2011-01-02 22:00:00   -1.024537
2011-01-01 01:00:00   -1.691962
2011-01-03 02:00:00    1.315676
2011-01-03 21:00:00    0.563186
2011-01-03 03:00:00    0.866676
dtype: float64

In [30]:
# Convert the irregular sample to daily frequency before its index has been sorted.
irreg_ts.asfreq('D')

2011-01-02 07:00:00    0.972082
Freq: D, dtype: float64

In [31]:
# Put the sampled timestamps into chronological order.
irreg_ts = irreg_ts.sort_index(ascending=True)

In [34]:
# Three daily views of the sorted irregular sample, printed one after another.
daily_views = (
    irreg_ts.asfreq(freq='D'),                  # exact lookups only, no fill method
    irreg_ts.asfreq(freq='D', method='bfill'),  # gaps take the next later observation
    irreg_ts.resample(rule='D').mean(),         # average of each day's observations
)
for view in daily_views:
    print(view)

2011-01-01    2.116945
2011-01-02         NaN
2011-01-03         NaN
Freq: D, dtype: float64
2011-01-01    2.116945
2011-01-02   -1.819982
2011-01-03    1.315676
Freq: D, dtype: float64
2011-01-01    0.212491
2011-01-02   -0.179671
2011-01-03    0.915179
Freq: D, dtype: float64
