## pandas timeseries
a series or dataframe using datetime index instead of a range index

used for storing events/data that fits on a timeline
- weather data
- temperature readings
- heart rate monetoring
- quarterly sales
- stock prices

In [5]:
import pandas as pd
import numpy as np

# creating a datetime index
- start
- end
- periods
- freq

In [10]:
datetimeindex = pd.date_range(start="2018-03-15", end="2019-12-31", freq="2D5H", periods=10)
datetimeindex

DatetimeIndex(['2018-01-01 00:00:00', '2018-01-03 05:00:00',
               '2018-01-05 10:00:00', '2018-01-07 15:00:00',
               '2018-01-09 20:00:00', '2018-01-12 01:00:00',
               '2018-01-14 06:00:00', '2018-01-16 11:00:00',
               '2018-01-18 16:00:00', '2018-01-20 21:00:00'],
              dtype='datetime64[ns]', freq='53H')

In [15]:
df = pd.DataFrame(
    dict(
        n = range(len(datetimeindex)),
        rand = np.random.random(len(datetimeindex))
    ),
    index=datetimeindex
)

df

Unnamed: 0,n,rand
2018-01-01 00:00:00,0,0.382417
2018-01-03 05:00:00,1,0.81462
2018-01-05 10:00:00,2,0.375086
2018-01-07 15:00:00,3,0.317559
2018-01-09 20:00:00,4,0.265596
2018-01-12 01:00:00,5,0.022302
2018-01-14 06:00:00,6,0.907966
2018-01-16 11:00:00,7,0.990518
2018-01-18 16:00:00,8,0.780815
2018-01-20 21:00:00,9,0.966471


In [18]:
df.loc[:, "rand"]

2018-01-01 00:00:00    0.382417
2018-01-03 05:00:00    0.814620
2018-01-05 10:00:00    0.375086
2018-01-07 15:00:00    0.317559
2018-01-09 20:00:00    0.265596
2018-01-12 01:00:00    0.022302
2018-01-14 06:00:00    0.907966
2018-01-16 11:00:00    0.990518
2018-01-18 16:00:00    0.780815
2018-01-20 21:00:00    0.966471
Freq: 53H, Name: rand, dtype: float64

### resampling
resampling is the practice of creating new samples with a lower or higher freq than the orginal data

### downsampling
when the sample freq is lower than the orginal we can aggregate values from the orginal data

In [24]:
from helpers import hdisplay

resample_method = "2D"

hdisplay([
    df.head(10),
    df.resample("2D").agg({"n": "max", "rand": "sum"}).head(10)],
    ["Orginal", f"resampled using '{resample_method}'"]
)

Unnamed: 0,n,rand
2018-01-01 00:00:00,0,0.382417
2018-01-03 05:00:00,1,0.81462
2018-01-05 10:00:00,2,0.375086
2018-01-07 15:00:00,3,0.317559
2018-01-09 20:00:00,4,0.265596
2018-01-12 01:00:00,5,0.022302
2018-01-14 06:00:00,6,0.907966
2018-01-16 11:00:00,7,0.990518
2018-01-18 16:00:00,8,0.780815
2018-01-20 21:00:00,9,0.966471

Unnamed: 0,n,rand
2018-01-01 00:00:00,0,0.382417
2018-01-03 00:00:00,1,0.81462
2018-01-05 00:00:00,2,0.375086
2018-01-07 00:00:00,3,0.317559
2018-01-09 00:00:00,4,0.265596
2018-01-11 00:00:00,5,0.022302
2018-01-13 00:00:00,6,0.907966
2018-01-15 00:00:00,7,0.990518
2018-01-17 00:00:00,8,0.780815
2018-01-19 00:00:00,9,0.966471


### upsampling
when the sample freq is higher than the orginal we can fill between values: 
- ffill
- bfill
- nearest
- interpolation
- fillna

In [28]:
from helpers import hdisplay

resample_method = "6H"

hdisplay([
    df.head(10),
    df.resample(resample_method).nearest().head(10)],
    ["Orginal", f"resampled using '{resample_method}'"]
)

Unnamed: 0,n,rand
2018-01-01 00:00:00,0,0.382417
2018-01-03 05:00:00,1,0.81462
2018-01-05 10:00:00,2,0.375086
2018-01-07 15:00:00,3,0.317559
2018-01-09 20:00:00,4,0.265596
2018-01-12 01:00:00,5,0.022302
2018-01-14 06:00:00,6,0.907966
2018-01-16 11:00:00,7,0.990518
2018-01-18 16:00:00,8,0.780815
2018-01-20 21:00:00,9,0.966471

Unnamed: 0,n,rand
2018-01-01 00:00:00,0,0.382417
2018-01-01 06:00:00,0,0.382417
2018-01-01 12:00:00,0,0.382417
2018-01-01 18:00:00,0,0.382417
2018-01-02 00:00:00,0,0.382417
2018-01-02 06:00:00,1,0.81462
2018-01-02 12:00:00,1,0.81462
2018-01-02 18:00:00,1,0.81462
2018-01-03 00:00:00,1,0.81462
2018-01-03 06:00:00,1,0.81462


## working with real data

In [43]:
apple = pd.read_csv("../Data/HistoricalData_Apple.csv", parse_dates=["Date"], index_col="Date")
apple.rename(columns={"Close/Last": "Close"}, inplace=True)

to_int_columns = ["Close", "Open", "High", "Low"]

apple[["Close", "Open", "High", "Low"]] = apple[["Close", "Open", "High", "Low"]].map(lambda value: value.replace("$", "")).astype("float64")

# apple = apple.astype({col : "float64" for col in to_int_columns})

apple.sort_index(inplace=True)

apple

Unnamed: 0_level_0,Close,Volume,Open,High,Low
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-10-10,178.39,43698020,178.1,179.72,177.95
2023-10-11,179.8,47551100,178.2,179.85,177.6
2023-10-12,180.71,56743120,180.07,182.34,179.04
2023-10-13,178.85,51456080,181.42,181.93,178.14
2023-10-16,178.72,52516980,176.75,179.075,176.51
2023-10-17,177.15,57549350,176.645,178.42,174.8
2023-10-18,175.84,54764380,175.58,177.575,175.11
2023-10-19,175.46,59302860,176.04,177.84,175.19
2023-10-20,172.88,64244030,175.31,175.42,172.64
2023-10-23,173.0,55980110,170.91,174.01,169.93


In [47]:
apple.loc["2018", "Close"].max()
apple.resample("YS").mean()

KeyError: '2018'

In [48]:
import seaborn as sns

data = apple.loc["2020-10"].resample("M").mean()
data = apple.loc["2020"].resample("QS").mean()
sns.lineplot(data=apple, x=apple_index, y="Open")