In [1]:
import numpy as np
import pandas as pd
import pytz
from datetime import datetime, timedelta
from pandas.tseries.offsets import Hour, Minute, Second
from pandas.tseries.offsets import Day, MonthEnd

### Basic date manipulations in Python

Python supports a datetime module which has several functionalitites:
1. Datetime objects representing a specific date in time with years, months and days
2. Addition and substraction of datetime objects
3. Format datetime objects based on different format specification

Pandas supports handling of null value timestampts with its NaT which stands for Not a Time.

In [2]:
current_date = datetime.now()
current_date

datetime.datetime(2023, 10, 2, 22, 16, 18, 206782)

In [3]:
current_date.year, current_date.month, current_date.day

(2023, 10, 2)

In [4]:
delta = datetime(2019, 1, 7) - datetime(2003, 10, 2, 8, 15)
delta

datetime.timedelta(days=5575, seconds=56700)

In [5]:
delta = delta + timedelta(10)
delta

datetime.timedelta(days=5585, seconds=56700)

In [6]:
stamp = datetime(2011,1,3)
print(str(stamp))

2011-01-03 00:00:00


In [7]:
stamp = stamp.strftime("%D")
stamp

'01/03/11'

In [8]:
value = "2011-01-03"
datetime.strptime(value, "%Y-%m-%d")

datetime.datetime(2011, 1, 3, 0, 0)

In [9]:
dates_arr = ["2015-03-18", "1996-07-03", None]
datetime_index = pd.to_datetime(dates_arr)
datetime_index

DatetimeIndex(['2015-03-18', '1996-07-03', 'NaT'], dtype='datetime64[ns]', freq=None)

### Time Series in Pandas

Times series are essentially an a Pandas series with its index set to the timestamps. 

Timestamps show the specific value of something in that time. For now we will just fill the values with random ones

In [10]:
dates = [datetime(2011, 1, 2), datetime(2011, 1, 5),
         datetime(2011, 1, 7), datetime(2011, 1, 8),
         datetime(2011, 1, 10), datetime(2011, 1, 12)]
ts = pd.Series(np.random.standard_normal(6), index = dates)
ts

2011-01-02    1.220327
2011-01-05   -1.057032
2011-01-07    0.229131
2011-01-08    0.930194
2011-01-10   -1.711029
2011-01-12    2.300997
dtype: float64

In [11]:
ts.index

DatetimeIndex(['2011-01-02', '2011-01-05', '2011-01-07', '2011-01-08',
               '2011-01-10', '2011-01-12'],
              dtype='datetime64[ns]', freq=None)

The same things for regular Pandas series apply here, operations are broadcast and we can also select a value by using the date as an index in our timeseries.

In [12]:
ts = ts*2
ts

2011-01-02    2.440654
2011-01-05   -2.114065
2011-01-07    0.458263
2011-01-08    1.860388
2011-01-10   -3.422058
2011-01-12    4.601994
dtype: float64

In [13]:
ts["2011-01-12"]

4.601993738973464

Something more interesting is when we are manipulating a big timeseries. We will use ```pd.date_range``` for the creation of our timeseries. Let's demonstrate selecting months and slicing.

In [14]:
longer_ts = pd.Series(np.random.standard_normal(1000),index = pd.date_range("2000-01-01", periods = 1000))


In [15]:
longer_ts["2001-03-04" : "2002-03-04"]

2001-03-04   -0.527597
2001-03-05   -1.211596
2001-03-06    0.622887
2001-03-07    0.292985
2001-03-08    1.321900
                ...   
2002-02-28   -0.627686
2002-03-01    1.264150
2002-03-02    1.028213
2002-03-03    1.601282
2002-03-04    0.088177
Freq: D, Length: 366, dtype: float64

In [16]:
longer_ts[datetime(2000,3,4): datetime(2002,1,1)]

2000-03-04   -0.975578
2000-03-05    0.509179
2000-03-06    0.624847
2000-03-07    0.130119
2000-03-08   -0.394621
                ...   
2001-12-28    0.479291
2001-12-29    0.140200
2001-12-30   -0.712504
2001-12-31    1.391901
2002-01-01    0.522234
Freq: D, Length: 669, dtype: float64

We can also slice with timestamps that are not present in our timeseries.

In [17]:
ts

2011-01-02    2.440654
2011-01-05   -2.114065
2011-01-07    0.458263
2011-01-08    1.860388
2011-01-10   -3.422058
2011-01-12    4.601994
dtype: float64

In [18]:
ts["2011-01-06":"2011-01-11"]

2011-01-07    0.458263
2011-01-08    1.860388
2011-01-10   -3.422058
dtype: float64

In [19]:
ts.truncate(after = "2011-01-04") #truncate based on the date provided, here we select everything after the provided date

2011-01-02    2.440654
dtype: float64

In [20]:
dates = pd.date_range("2000-01-01", periods = 100, freq = "W-WED")
df_towns = pd.DataFrame(np.random.standard_normal((100,4)), index = dates, columns = ["Tokyo", "Montreal", "Madrid", "London"])
df_towns

Unnamed: 0,Tokyo,Montreal,Madrid,London
2000-01-05,-0.646853,-0.343833,-1.926051,-0.574718
2000-01-12,-0.956370,0.393256,-0.784467,0.957365
2000-01-19,-1.164328,-1.514412,0.829060,0.211351
2000-01-26,-0.272980,2.260282,1.501285,2.419184
2000-02-02,0.643508,-0.610025,-1.860355,-0.655078
...,...,...,...,...
2001-10-31,-0.650916,0.242820,-0.050783,0.665412
2001-11-07,-0.045285,-1.080848,1.819835,1.092263
2001-11-14,-1.289246,0.441228,0.152442,-0.793646
2001-11-21,0.436082,-0.526629,0.107992,-0.464209


In [21]:
df_towns.loc["2001-10-31"]

Tokyo      -0.650916
Montreal    0.242820
Madrid     -0.050783
London      0.665412
Name: 2001-10-31 00:00:00, dtype: float64

### Working with duplicates

In [22]:
dates = pd.DatetimeIndex(["2000-01-01", "2000-01-02", "2000-01-02","2000-01-02", "2000-01-03"])
duplicates = pd.Series(data = [1,2,3,3,4], index = dates)
duplicates

2000-01-01    1
2000-01-02    2
2000-01-02    3
2000-01-02    3
2000-01-03    4
dtype: int64

In [23]:
duplicates["2000-01-02"]

2000-01-02    2
2000-01-02    3
2000-01-02    3
dtype: int64

In [24]:
duplicates.groupby(level = 0).mean()

2000-01-01    1.000000
2000-01-02    2.666667
2000-01-03    4.000000
dtype: float64

### Working with frequencies
When working with time series, we have many options for frequencies. For example once a week, every hour or different frequencies. Pandas has a ```resample``` method for this purpose.

In [25]:
ts

2011-01-02    2.440654
2011-01-05   -2.114065
2011-01-07    0.458263
2011-01-08    1.860388
2011-01-10   -3.422058
2011-01-12    4.601994
dtype: float64

In [26]:
ts = ts.resample("D") #parameter indicates how to resample
ts

<pandas.core.resample.DatetimeIndexResampler object at 0x000002C2D14E0640>

```pd.date_range``` can be used to generate large sequences of data starting or ending on a particular date. Also these periods can be a specific number of days or other.

In [27]:
dates = pd.date_range("2016-12-12", "2017-12-12")
dates

DatetimeIndex(['2016-12-12', '2016-12-13', '2016-12-14', '2016-12-15',
               '2016-12-16', '2016-12-17', '2016-12-18', '2016-12-19',
               '2016-12-20', '2016-12-21',
               ...
               '2017-12-03', '2017-12-04', '2017-12-05', '2017-12-06',
               '2017-12-07', '2017-12-08', '2017-12-09', '2017-12-10',
               '2017-12-11', '2017-12-12'],
              dtype='datetime64[ns]', length=366, freq='D')

In [28]:
date_quarterly = pd.date_range("2016-12-01", "2017-12-12", freq = "Q-JAN")
date_quarterly

DatetimeIndex(['2017-01-31', '2017-04-30', '2017-07-31', '2017-10-31'], dtype='datetime64[ns]', freq='Q-JAN')

In [29]:
date_20days = pd.date_range("2018-01-03 08:46:21", periods = 20, normalize = True)
date_20days

DatetimeIndex(['2018-01-03', '2018-01-04', '2018-01-05', '2018-01-06',
               '2018-01-07', '2018-01-08', '2018-01-09', '2018-01-10',
               '2018-01-11', '2018-01-12', '2018-01-13', '2018-01-14',
               '2018-01-15', '2018-01-16', '2018-01-17', '2018-01-18',
               '2018-01-19', '2018-01-20', '2018-01-21', '2018-01-22'],
              dtype='datetime64[ns]', freq='D')

In [30]:
hour = Hour()
minute = Minute()
second = Second()
fifteen_seconds = Second(15)
high_freq = pd.date_range("2023-01-01", "2023-01-01 23:59",freq = fifteen_seconds)
high_freq

DatetimeIndex(['2023-01-01 00:00:00', '2023-01-01 00:00:15',
               '2023-01-01 00:00:30', '2023-01-01 00:00:45',
               '2023-01-01 00:01:00', '2023-01-01 00:01:15',
               '2023-01-01 00:01:30', '2023-01-01 00:01:45',
               '2023-01-01 00:02:00', '2023-01-01 00:02:15',
               ...
               '2023-01-01 23:56:45', '2023-01-01 23:57:00',
               '2023-01-01 23:57:15', '2023-01-01 23:57:30',
               '2023-01-01 23:57:45', '2023-01-01 23:58:00',
               '2023-01-01 23:58:15', '2023-01-01 23:58:30',
               '2023-01-01 23:58:45', '2023-01-01 23:59:00'],
              dtype='datetime64[ns]', length=5757, freq='15S')

In [31]:
print("Shifting provides us with a way to move data backwards or forwards in time.")

Shifting provides us with a way to move data backwards or forwards in time.


In [32]:
shifting_example = pd.date_range("2015-01-01", "2015-01-05")
shifting_example

DatetimeIndex(['2015-01-01', '2015-01-02', '2015-01-03', '2015-01-04',
               '2015-01-05'],
              dtype='datetime64[ns]', freq='D')

In [33]:
shifting_example.shift(5)

DatetimeIndex(['2015-01-06', '2015-01-07', '2015-01-08', '2015-01-09',
               '2015-01-10'],
              dtype='datetime64[ns]', freq='D')

In [34]:
shifting_example.shift(-5)

DatetimeIndex(['2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
               '2014-12-31'],
              dtype='datetime64[ns]', freq='D')

In [35]:
series_shifting = pd.Series([1,2,3,4,5], index = shifting_example)
series_shifting

2015-01-01    1
2015-01-02    2
2015-01-03    3
2015-01-04    4
2015-01-05    5
Freq: D, dtype: int64

An interesting trick is to compute how much percentage wise has a timeseries valu changed, based on a shift.

In [36]:
percentage = (series_shifting / series_shifting.shift(1) - 1) * 100
percentage

2015-01-01           NaN
2015-01-02    100.000000
2015-01-03     50.000000
2015-01-04     33.333333
2015-01-05     25.000000
Freq: D, dtype: float64

Roll forward "rolls" a date to the end of the specific month.

Roll backward "rolls" a date to the end of the previous month.

In [37]:
offset = MonthEnd()
now = datetime(2015,10,5)
now

datetime.datetime(2015, 10, 5, 0, 0)

In [38]:
offset.rollforward(now)

Timestamp('2015-10-31 00:00:00')

In [39]:
offset.rollback(now)

Timestamp('2015-09-30 00:00:00')

Timezones can be tricky in real life, let's see how pandas handles them.

In [45]:
pytz.common_timezones[-10:]

['Pacific/Wake',
 'Pacific/Wallis',
 'US/Alaska',
 'US/Arizona',
 'US/Central',
 'US/Eastern',
 'US/Hawaii',
 'US/Mountain',
 'US/Pacific',
 'UTC']

In [52]:
dates = pd.date_range("2018-01-01", periods = 5)
ts = pd.Series(data = np.random.standard_normal(5), index = dates)
ts

2018-01-01   -0.444583
2018-01-02    0.315528
2018-01-03   -1.706139
2018-01-04    1.574981
2018-01-05   -0.183472
Freq: D, dtype: float64

In [53]:
print(ts.index.tz)

None


In [54]:
ts_utc = ts.tz_localize("UTC")
ts_utc

2018-01-01 00:00:00+00:00   -0.444583
2018-01-02 00:00:00+00:00    0.315528
2018-01-03 00:00:00+00:00   -1.706139
2018-01-04 00:00:00+00:00    1.574981
2018-01-05 00:00:00+00:00   -0.183472
Freq: D, dtype: float64

In [56]:
ts_utc.tz_convert("Pacific/Wake")

2018-01-01 12:00:00+12:00   -0.444583
2018-01-02 12:00:00+12:00    0.315528
2018-01-03 12:00:00+12:00   -1.706139
2018-01-04 12:00:00+12:00    1.574981
2018-01-05 12:00:00+12:00   -0.183472
Freq: D, dtype: float64

In [57]:
period = pd.Period("2016", freq = "A-DEC")
period

Period('2016', 'A-DEC')

In [58]:
period + 5

Period('2021', 'A-DEC')

In [59]:
periods = pd.period_range("2001-06-17", "2003-09-12", freq = "M")
periods

PeriodIndex(['2001-06', '2001-07', '2001-08', '2001-09', '2001-10', '2001-11',
             '2001-12', '2002-01', '2002-02', '2002-03', '2002-04', '2002-05',
             '2002-06', '2002-07', '2002-08', '2002-09', '2002-10', '2002-11',
             '2002-12', '2003-01', '2003-02', '2003-03', '2003-04', '2003-05',
             '2003-06', '2003-07', '2003-08', '2003-09'],
            dtype='period[M]')

In [63]:
periods.asfreq("Y", how = "start")

PeriodIndex(['2001', '2001', '2001', '2001', '2001', '2001', '2001', '2002',
             '2002', '2002', '2002', '2002', '2002', '2002', '2002', '2002',
             '2002', '2002', '2002', '2003', '2003', '2003', '2003', '2003',
             '2003', '2003', '2003', '2003'],
            dtype='period[A-DEC]')

In [87]:
dates = pd.date_range("2000-01-01", periods=100)

ts = pd.Series(np.random.standard_normal(len(dates)), index=dates)

In [88]:
ts

2000-01-01   -1.056923
2000-01-02    0.400543
2000-01-03    0.665739
2000-01-04   -0.724825
2000-01-05   -1.809964
                ...   
2000-04-05   -0.405345
2000-04-06   -1.657829
2000-04-07   -0.466999
2000-04-08    0.133835
2000-04-09   -0.867997
Freq: D, Length: 100, dtype: float64

In [89]:
resampled = ts.resample("M").mean()
resampled

2000-01-31    0.031155
2000-02-29    0.165130
2000-03-31    0.156038
2000-04-30   -0.216767
Freq: M, dtype: float64

In [90]:
seconds = pd.date_range("2013-01-01", periods = 300, freq = "S")
ts_seconds = pd.Series(np.random.standard_normal(len(seconds)), index = seconds)
ts_seconds

2013-01-01 00:00:00   -0.978123
2013-01-01 00:00:01    0.865040
2013-01-01 00:00:02   -0.417401
2013-01-01 00:00:03    0.725050
2013-01-01 00:00:04   -0.443904
                         ...   
2013-01-01 00:04:55    1.015147
2013-01-01 00:04:56   -0.125289
2013-01-01 00:04:57    0.883452
2013-01-01 00:04:58    1.193659
2013-01-01 00:04:59   -0.685545
Freq: S, Length: 300, dtype: float64

In [91]:
ts_seconds.resample("1min").sum()

2013-01-01 00:00:00    11.308651
2013-01-01 00:01:00    -1.368125
2013-01-01 00:02:00     6.987248
2013-01-01 00:03:00     4.993200
2013-01-01 00:04:00    -2.410260
Freq: T, dtype: float64

In [92]:
ts = pd.Series(np.random.permutation(np.arange(len(dates))), index = dates)
ts.resample("D").ohlc()

Unnamed: 0,open,high,low,close
2000-01-01,23,23,23,23
2000-01-02,46,46,46,46
2000-01-03,63,63,63,63
2000-01-04,78,78,78,78
2000-01-05,90,90,90,90
...,...,...,...,...
2000-04-05,47,47,47,47
2000-04-06,86,86,86,86
2000-04-07,29,29,29,29
2000-04-08,89,89,89,89
