# Working With Time Series Data in Pandas

## Converting to DateTime Type

In [1]:
import pandas as pd

pd.to_datetime('Jan 1 1970')

Timestamp('1970-01-01 00:00:00')

In [12]:
#pd.to_datetime('Jan:1:1970')

In [3]:
pd.to_datetime('Jan:1:1970', format='%b:%d:%Y')

Timestamp('1970-01-01 00:00:00')

In [4]:
df = pd.read_csv('https://gist.githubusercontent.com/zgulde/c60209f379155744ced4dfc57068e55b/raw/97fc3ab1e2f6fff6ffd409d60029c53954d5784e/coffee_consumption.csv')

In [5]:
df.dtypes

date                   object
coffee_consumption    float64
dtype: object

In [6]:
df.head()

Unnamed: 0,date,coffee_consumption
0,2019-01-01,14.301915
1,2019-01-02,12.9059
2,2019-01-03,10.046015
3,2019-01-04,6.354805
4,2019-01-07,8.545563


In [7]:
# convert our date column to datetime type
df.date = pd.to_datetime(df.date)
df.dtypes

date                  datetime64[ns]
coffee_consumption           float64
dtype: object

## Working with DateTime Series

In [8]:
df['year'] = df.date.dt.year
df['month'] = df.date.dt.month
df['day'] = df.date.dt.month
df['weekday'] = df.date.dt.day_name()
df.head()

Unnamed: 0,date,coffee_consumption,year,month,day,weekday
0,2019-01-01,14.301915,2019,1,1,Tuesday
1,2019-01-02,12.9059,2019,1,1,Wednesday
2,2019-01-03,10.046015,2019,1,1,Thursday
3,2019-01-04,6.354805,2019,1,1,Friday
4,2019-01-07,8.545563,2019,1,1,Monday


In [11]:
df.weekday.value_counts()
df.day.value_counts()
df.year.value_counts()

2019    261
Name: year, dtype: int64

In [13]:
# cleanup demonstration columns
del df['year']
del df['day']
del df['month']
del df['weekday']

## DateTime Indexes

In [14]:
# this is a very important step!
df = df.set_index('date').sort_index()
df

Unnamed: 0_level_0,coffee_consumption
date,Unnamed: 1_level_1
2019-01-01,14.301915
2019-01-02,12.905900
2019-01-03,10.046015
2019-01-04,6.354805
2019-01-07,8.545563
...,...
2019-12-25,12.250875
2019-12-26,7.513206
2019-12-27,9.464345
2019-12-30,14.623106


In [16]:
# let's us see where the data starts and where it ends
df.index.min(), df.index.max()

(Timestamp('2019-01-01 00:00:00'), Timestamp('2019-12-31 00:00:00'))

In [None]:
df['2019-05']

In [None]:
df['2019-05-06':'2019-05-16'] # NB. inclusive

### Changing the Period

Vocab: **upsampling** and **downsampling**

In [None]:
by_day = df.asfreq('D')
by_day

### Filling Missing Values

In [None]:
by_day.assign(
    ffill=lambda df: df.coffee_consumption.ffill(),
    bfill=lambda df: df.coffee_consumption.bfill()
).head(15)

In [None]:
df = df.fillna(0)

### Resampling

In [None]:
df.resample('3W').mean() # Every 3 weeks

In [None]:
df.resample('M').sum()

## Plotting

In [None]:
df.plot()

In [None]:
df.resample('3M').mean().plot(marker='o')

### Rolling Windows

In [None]:
rolling_df = df.resample('W').mean().assign(
    rolling_3=lambda df: df.coffee_consumption.rolling(3).mean(),
    rolling_5=lambda df: df.coffee_consumption.rolling(5).mean(),
    rolling_7=lambda df: df.coffee_consumption.rolling(7).mean(),
)
rolling_df.plot()
rolling_df.head(20)

In [None]:
df.resample('W').mean().rolling(4).sum()

### Lagging and Leading

In [None]:
df['shift(-1)'] = df.coffee_consumption.shift(-1)
df['shift(1)'] = df.coffee_consumption.shift(1)
df['shift(3)'] = df.coffee_consumption.shift(3)
df['diff(1)'] = df.coffee_consumption.diff(1)
df['diff(3)'] = df.coffee_consumption.diff(3)
df.head(25)

In [None]:
# cleanup the additional columns
for col in ['shift(-1)', 'shift(1)', 'shift(3)', 'diff(1)', 'diff(3)']:
    del df[col]

## Strftime

In [None]:
df.index.strftime('%B %d, %Y')[:4]

## Timedeltas

In [None]:
start_date = pd.to_datetime('20190204')
graduation_date = pd.to_datetime('20190614')

days_between = graduation_date - start_date
days_between

In [None]:
days_between // pd.Timedelta('1d')

In [None]:
df['days_since_max'] = (df.index.max() - df.index) // pd.Timedelta('1d')
df.tail(10)