In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import tensorflow as tf
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import matplotlib.pyplot as plt
import warnings
import datetime as dt
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old names, so pick whichever name this installation knows.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Default figure geometry for every plot in the notebook.
plt.rcParams['figure.figsize'] = [16, 9]
plt.rcParams['figure.dpi'] = 300
# Silence FutureWarning messages for the whole session.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# Download IBM's daily price history from Yahoo Finance.
# auto_adjust=False is passed explicitly because a later cell selects the
# "Adj Close" column, which yfinance only returns when prices are NOT
# auto-adjusted (recent yfinance releases changed the default for this flag).
ibm_df = yf.download(
    "IBM",
    start="1962-01-03",
    end="2022-05-16",
    progress=False,
    auto_adjust=False,
)

In [None]:
# Preview the first five rows of the downloaded frame.
ibm_df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,7.374124,7.374124,7.291268,7.291268,1.673324,407940
1962-01-03,7.291268,7.355003,7.291268,7.355003,1.68795,305955
1962-01-04,7.355003,7.355003,7.278521,7.281708,1.67113,274575
1962-01-05,7.272148,7.272148,7.125558,7.138305,1.638219,384405
1962-01-08,7.131931,7.131931,6.9471,7.004461,1.607504,572685


In [None]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
ibm_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 15197 entries, 1962-01-02 to 2022-05-13
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       15197 non-null  float64
 1   High       15197 non-null  float64
 2   Low        15197 non-null  float64
 3   Close      15197 non-null  float64
 4   Adj Close  15197 non-null  float64
 5   Volume     15197 non-null  int64  
dtypes: float64(5), int64(1)
memory usage: 831.1 KB


In [None]:
# Put the rows in chronological order. The dates are the frame's index, so
# sort on the index itself instead of looking it up through its name ("Date").
ibm_df = ibm_df.sort_index()
ibm_df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1962-01-02,7.374124,7.374124,7.291268,7.291268,1.673324,407940
1962-01-03,7.291268,7.355003,7.291268,7.355003,1.68795,305955
1962-01-04,7.355003,7.355003,7.278521,7.281708,1.67113,274575
1962-01-05,7.272148,7.272148,7.125558,7.138305,1.638219,384405
1962-01-08,7.131931,7.131931,6.9471,7.004461,1.607504,572685


Asset prices are usually non-stationary.
Transforming the prices into returns is an attempt to make the time series stationary, which is a desirable
property for statistical modeling.

In [None]:
# Keep only the adjusted close price and give it a snake_case name.
# NOTE(review): the previous comment here read "excluding dividends", but Yahoo
# documents "Adj Close" as adjusted for splits and dividend distributions, so
# values derived from it would reflect dividends - confirm which is intended.
# Rename first, then select: if this cell is run again the rename is a no-op
# and the selection still succeeds, instead of failing on a missing "Adj Close".
ibm_df = ibm_df.rename(columns={'Adj Close': 'adj_close'}).loc[:, ['adj_close']]

In [None]:
# Check that the frame now holds just the adj_close column.
ibm_df.head(n=5)

Unnamed: 0_level_0,adj_close
Date,Unnamed: 1_level_1
1962-01-02,1.673324
1962-01-03,1.68795
1962-01-04,1.67113
1962-01-05,1.638219
1962-01-08,1.607504


In [None]:
# Simple return: fractional change of the adjusted close versus the previous row.
ibm_df["simple_rtn"] = ibm_df["adj_close"].pct_change()
# Log return: natural log of the ratio between each adjusted close and the previous one.
ibm_df["log_rtn"] = np.log(
    ibm_df["adj_close"] / ibm_df["adj_close"].shift(1)
)

* Simple returns: They aggregate over assets; the simple return of a portfolio is the
weighted sum of the returns of the individual assets in the portfolio.
* Log returns: They aggregate over time; it is easier to understand with the help
of an example—the log return for a given month is the sum of the log returns of
the days within that month.

*Python for Finance Cookbook, page 18: how to account for inflation in the returns series.*

In [None]:
# Inspect the new return columns; the first row has no previous price, so both are NaN.
ibm_df.head(n=5)

Unnamed: 0_level_0,adj_close,simple_rtn,log_rtn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1962-01-02,1.673324,,
1962-01-03,1.68795,0.008741,0.008703
1962-01-04,1.67113,-0.009965,-0.010014
1962-01-05,1.638219,-0.019694,-0.01989
1962-01-08,1.607504,-0.018749,-0.018927
