In [4]:
"""
Chapter 11 - Time Series:
Anything that is observed, measured at many points in time forms a time series.
Many time series are fixed frequency,
which is to say that data points coour at regular intervals according to some rule,
such as every 15 secs, 5 mins, or per month. Time Series can also be irregular without fixed unit
of time or offset between units. How to mark and refer to time series data depends on the application
and you may have one of the following:
    * Timestamps specific instants in time
    * Fixed periods, such as the month January 2007 of the  full year 2010
    * Intervals of time, indicated by a start and end timestamp.
        Periods can be thought of as special cases of intervals
    * Experiment or elapsed time;
        each timestamp is a measure of time relative to a particular start time:
            (the diameter of a cookie baking each second sicne being placed in the oven)
"""
# Date and Time Data Types and Tools 
import pandas as pd
import numpy as np

from datetime import datetime

now = datetime.now()

display(now)

print(now.year, now.month, now.day)

# datime stores both the date and time down to the microsecond.
# timedelta represents the temporal difference between two datetime objects:

delta = datetime(2011, 1, 7) - datetime(2008, 6, 24, 8, 15)

display(delta)

display(delta.days)

display(delta.seconds)

# add / subtract a timedelta | multiple thereof to datetime object to yield a new shifed object

from datetime import timedelta

start = datetime.now()

start + timedelta(12)

print('Types in datetime module page 325')

datetime.datetime(2021, 10, 8, 14, 39, 29, 932871)

2021 10 8


datetime.timedelta(days=926, seconds=56700)

926

56700

Types in datetime module page 325


In [5]:
# Converting Between String and Datetime 
# format datetime object & pd Timestamp object,using srt|strftime method, pass f specification

stamp = datetime(2011, 1, 3)

display(str(stamp))

display(stamp.strftime("%Y-%m-%d"))

print('Datetime format specification page 325')

'2011-01-03 00:00:00'

'2011-01-03'

Datetime format specification page 325


In [8]:
# Use many of the same format codes to conver strings to dates using datetime.strptime
    # some codes like %F cannot be used
    
value = '2011-01-03'
display(datetime.strptime(value, '%Y-%m-%d'))

display(datetime(2011, 1, 3, 0, 0))

daatestrs = ['7/6/2011', '8/6/2011']

display(daatestrs)

display([datetime.strptime(x, '%m/%d/%Y') for x in daatestrs])

# datetime.strptime is a good way to parse a date with a known format.
# avaoid writing format sects eachtime use the parser.parse method in 3rd-party dateutil package

from dateutil.parser import parse

display(parse('2011-01-03'))

# dateutil is capable of parsing most human -intelligible date representaions

display(parse('Jan 31, 1997 10:45 PM'))

# for day appearing before month pass dayfirst=True to indicate this

display(parse('6/12/2021', dayfirst=True))

# pd uses arr as axis idx|col in df.to_datetime parses different kinds of date representations.

datesrts = ['2011-07-06 12:00:00', '2011-08-06 00:00:00']

display(pd.to_datetime(datesrts))

print('Locale-specific date formatting')

datetime.datetime(2011, 1, 3, 0, 0)

datetime.datetime(2011, 1, 3, 0, 0)

['7/6/2011', '8/6/2011']

[datetime.datetime(2011, 7, 6, 0, 0), datetime.datetime(2011, 8, 6, 0, 0)]

datetime.datetime(2011, 1, 3, 0, 0)

datetime.datetime(1997, 1, 31, 22, 45)

datetime.datetime(2021, 12, 6, 0, 0)

DatetimeIndex(['2011-07-06 12:00:00', '2011-08-06 00:00:00'], dtype='datetime64[ns]', freq=None)

Locale-specific date formatting


In [None]:
"""
# Time Series Basics:
    A basic kind of time series object in pd is a series indexed by timestamps,
    which is often represented external to pd as py strings or datetime objects:
"""

from datetime import datetime

dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]

ts = pd.Series(np.random.randn(6), index=dates)

display(ts)

# Under the hood, these satetime objects have been put in a datetimeIndex:

display(ts.index)

# Like other Series, arithmetic operations between differently indexed time series automatically 
# aling on the dates:
ts + ts[::2]

# Scalar values from a DatietimeIndex are pd timsestamp objects:

stamp = ts.index[0]

display(stamp)

# A timestamp can be substitues anuwhere you would use a datetime object. Also it can store 
# frequency information and usntands how to do time zone conversions and other manipulations.

In [32]:
# Indexing, Selection, Subsetting
# Time series behaves like any other pd.Series when you are indexing and selecting data based on label:

stamp = ts.index[2]

display(ts[stamp])

# As a convenience, you can also pass s string that is interpretable as a date:

display(ts['1/10/2011'])

display(ts['20110110'])

# For longer time series, a year or only a year and month can be passed to easily selct slices of data:

longer_ts = pd.Series(np.random.randn(1000),
                      index=pd.date_range('1/1/2000', periods=1000))

display(longer_ts)

display(longer_ts['2001'][:5])

# Here the string "2001" is interpreted as a year and selects that time period. 
# This works if you specify the month:

display(longer_ts['2001-05'][:5])

# Slicing with datime onjects works as well:

display(ts[datetime(2011, 1, 7):])

# You can slice with timestamps not contained in a time series to perform a range query:

display(ts['1/6/2011':'1/11/2011'])

# Modifications on the slice will be reflactied in ther original data

# These is an equivalent instance methos , trucate, that slices a Series betweeen two dates:

display(ts.truncate(after='1/9/2011'))

# All this holds for the df as well, indexing on its rows:

0.183121269751764

-0.8300213734073932

-0.8300213734073932

2000-01-01    1.708890
2000-01-02   -1.142517
2000-01-03   -2.400360
2000-01-04    0.062005
2000-01-05   -2.671679
                ...   
2002-09-22   -0.292662
2002-09-23    1.157317
2002-09-24    1.185101
2002-09-25    2.368851
2002-09-26   -0.044699
Freq: D, Length: 1000, dtype: float64

2001-01-01    0.421043
2001-01-02   -0.690356
2001-01-03   -1.273559
2001-01-04    0.167545
2001-01-05   -2.102238
Freq: D, dtype: float64

2001-05-01   -0.548813
2001-05-02   -0.479556
2001-05-03    0.915147
2001-05-04    0.333357
2001-05-05   -0.431232
Freq: D, dtype: float64

2011-01-07    0.183121
2011-01-08   -0.463912
2011-01-10   -0.830021
2011-01-12    1.088414
dtype: float64

2011-01-07    0.183121
2011-01-08   -0.463912
2011-01-10   -0.830021
dtype: float64

2011-01-02   -1.045764
2011-01-05    0.310491
2011-01-07    0.183121
2011-01-08   -0.463912
dtype: float64

In [None]:
# Time Series with Duplucate Indices 
# In some apllications, there may be multiple data obsevations falling on a particualr timmestamp

In [39]:
"""
# 11.3: Date Ranges, Frequencies, and Shifting

You can convert the sample time series to be fixed daily frequency by calling resample:
"""

ts

resampler = ts.resample("D") # D daily frequency; Here we use base frequencies and multiples thereof

# Generating Date Ranges: pd.date_range is responsible for generating a DatatimeIndex 

index = pd.date_range('2012-04-01', '2012-06-01')

display(index)

# By default, date_range genarates daily timestamps. If you pass only a strat or end date
# You must pass a number of periods to generate:

may = pd.date_range(start='2012-04-01', periods=20)
        
june = pd.date_range(end='2012-06-01', periods=20)

display(may)
                     
display(june)

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20',
               '2012-04-21', '2012-04-22', '2012-04-23', '2012-04-24',
               '2012-04-25', '2012-04-26', '2012-04-27', '2012-04-28',
               '2012-04-29', '2012-04-30', '2012-05-01', '2012-05-02',
               '2012-05-03', '2012-05-04', '2012-05-05', '2012-05-06',
               '2012-05-07', '2012-05-08', '2012-05-09', '2012-05-10',
               '2012-05-11', '2012-05-12', '2012-05-13', '2012-05-14',
               '2012-05-15', '2012-05-16', '2012-05-17', '2012-05-18',
               '2012-05-19', '2012-05-20', '2012-05-21', '2012-05-22',
               '2012-05-23', '2012-05-24', '2012-05-25', '2012-05-26',
      

DatetimeIndex(['2012-04-01', '2012-04-02', '2012-04-03', '2012-04-04',
               '2012-04-05', '2012-04-06', '2012-04-07', '2012-04-08',
               '2012-04-09', '2012-04-10', '2012-04-11', '2012-04-12',
               '2012-04-13', '2012-04-14', '2012-04-15', '2012-04-16',
               '2012-04-17', '2012-04-18', '2012-04-19', '2012-04-20'],
              dtype='datetime64[ns]', freq='D')

DatetimeIndex(['2012-05-13', '2012-05-14', '2012-05-15', '2012-05-16',
               '2012-05-17', '2012-05-18', '2012-05-19', '2012-05-20',
               '2012-05-21', '2012-05-22', '2012-05-23', '2012-05-24',
               '2012-05-25', '2012-05-26', '2012-05-27', '2012-05-28',
               '2012-05-29', '2012-05-30', '2012-05-31', '2012-06-01'],
              dtype='datetime64[ns]', freq='D')

In [None]:
"""
The start and end dates define strict boundaries for the generated date index.

Base time series frequencies (not comphensive page 355)

date_range by default preserves the time (in any) of the start or end stomestamp:
"""                     
non_mod = pd.date_range('2012-05-02 12:56:31',
              periods=5)

# To mormalized to mifnight as a convention. Use the normalize option:

norm = pd.date_range('2012-05-02 12:56:31',
              periods=5,
              normalize=True)

display(non_mod)

display(norm)

In [60]:
# Frequencies and Date Offsets 

from pandas.tseries.offsets import Hour, Minute

hour = Hour()

display(hour)

# You can define a multipkle of an offset by passing an integer:

four_hours = Hour(4)

display(four_hours)

# Putting an interger before the base frequency creates a mulitple

multiple = pd.date_range("2000-01-01", '2000-01-03', freq='4h')

display(multiple)

# Many offsets can be combined together by addition:

Hour(2) + Minute(30)

# Pass a frequency strings, like '1h30min', that will effectively be parsed to the expressing:

custom_mod = pd.date_range('2021-01-01', periods=10, freq='1h30min')

display(custom_mod)

<Hour>

<4 * Hours>

DatetimeIndex(['2000-01-01 00:00:00', '2000-01-01 04:00:00',
               '2000-01-01 08:00:00', '2000-01-01 12:00:00',
               '2000-01-01 16:00:00', '2000-01-01 20:00:00',
               '2000-01-02 00:00:00', '2000-01-02 04:00:00',
               '2000-01-02 08:00:00', '2000-01-02 12:00:00',
               '2000-01-02 16:00:00', '2000-01-02 20:00:00',
               '2000-01-03 00:00:00'],
              dtype='datetime64[ns]', freq='4H')

DatetimeIndex(['2021-01-01 00:00:00', '2021-01-01 01:30:00',
               '2021-01-01 03:00:00', '2021-01-01 04:30:00',
               '2021-01-01 06:00:00', '2021-01-01 07:30:00',
               '2021-01-01 09:00:00', '2021-01-01 10:30:00',
               '2021-01-01 12:00:00', '2021-01-01 13:30:00'],
              dtype='datetime64[ns]', freq='90T')

In [78]:
# Week of month dates: WOM enables you to get dates like the third Friday of each month:

rng = pd.date_range('2020-01-01', '2020-09-01', freq='WOM-3FRI')

display(rng)

# Shifting (Leading and Lagging) Data
# "shifting" refers to moving data backward and foward through time.
# Both Series and df have shift method for doing naive shifts forward or abckeard
# Leaving index unmodified:

ts = pd.Series(np.random.randn(4),
              index=pd.date_range('1/1/2000', periods=4, freq='M'))

display(ts)

display(ts.shift(2))

display(ts.shift(-2))

# A common use of shift is computing % change in a time series or multiple series as df columns.

display(ts / ts.shift(1) -1)

# Frequency is known, it can passed to shitf to advance the timestamps instead of simple the data

display(ts.shift(2, freq='M'))

# Other frequencies can be apssed, too, giving you some flaexibility in how to lead and lag data

display(ts.shift(3, freq='D'))

display(ts.shift(1, freq='90T'))

# The T stands for minutes. Note that the freq parameter here indicates the offset to apply to the
# timestamps, but it does not change the underlying frequncy of the data, if any

# Shifting, dates wiiht offsets; The pandas date ofsets can also be used with datetime|Timestamp

from pandas.tseries.offsets import Day, MonthEnd

now = datetime(2011, 11, 17)

display(now + 3 * Day())

# If you add an anchored offset like MonthEnd, the first increment will 'roll forward' 
# a date to the next date according to the frequency rule:

display(now + MonthEnd())

display(now + MonthEnd(2))

# Anchored offets can explicitly 'roll' dates forward or backward by simply suning there 
# rollfowaed and rollback methods,respectively:

offset = MonthEnd()

display(offset.rollforward(now))

display(offset.rollback(now))

# A creative use of date offsets in to use these methods with groupby:

ts = pd.Series(np.random.randn(20),
              index=pd.date_range('1/15/2000', periods=20, freq='4d'))

display(ts)

display(ts.groupby(offset.rollback).mean())

DatetimeIndex(['2020-01-17', '2020-02-21', '2020-03-20', '2020-04-17',
               '2020-05-15', '2020-06-19', '2020-07-17', '2020-08-21'],
              dtype='datetime64[ns]', freq='WOM-3FRI')

2000-01-31   -0.001516
2000-02-29   -0.759315
2000-03-31    0.747325
2000-04-30   -0.160883
Freq: M, dtype: float64

2000-01-31         NaN
2000-02-29         NaN
2000-03-31   -0.001516
2000-04-30   -0.759315
Freq: M, dtype: float64

2000-01-31    0.747325
2000-02-29   -0.160883
2000-03-31         NaN
2000-04-30         NaN
Freq: M, dtype: float64

2000-01-31           NaN
2000-02-29    499.915079
2000-03-31     -1.984209
2000-04-30     -1.215279
Freq: M, dtype: float64

2000-03-31   -0.001516
2000-04-30   -0.759315
2000-05-31    0.747325
2000-06-30   -0.160883
Freq: M, dtype: float64

2000-02-03   -0.001516
2000-03-03   -0.759315
2000-04-03    0.747325
2000-05-03   -0.160883
dtype: float64

2000-01-31 01:30:00   -0.001516
2000-02-29 01:30:00   -0.759315
2000-03-31 01:30:00    0.747325
2000-04-30 01:30:00   -0.160883
dtype: float64

Timestamp('2011-11-20 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-12-31 00:00:00')

Timestamp('2011-11-30 00:00:00')

Timestamp('2011-10-31 00:00:00')

2000-01-15   -0.823032
2000-01-19    0.265913
2000-01-23    0.782343
2000-01-27   -0.994325
2000-01-31   -0.321077
2000-02-04   -0.373295
2000-02-08   -0.290048
2000-02-12    0.137106
2000-02-16    0.830140
2000-02-20    0.763444
2000-02-24    0.910139
2000-02-28    1.474334
2000-03-03    0.298148
2000-03-07    0.398748
2000-03-11   -0.151270
2000-03-15   -1.421740
2000-03-19    0.581271
2000-03-23    0.647762
2000-03-27   -1.433520
2000-03-31   -0.324535
Freq: 4D, dtype: float64

1999-12-31   -0.192275
2000-01-31    0.391343
2000-02-29   -0.154371
2000-03-31   -0.324535
dtype: float64