### Imports

In [1]:
import pandas as pd
import numpy as np
import yfinance as yf

from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands
from ta.trend import IchimokuIndicator

import sklearn.mixture as mix

import matplotlib.pyplot as plt

### Data Extraction and Feature Engineering

In [2]:
# Extract Data
start_date = '2017-01-01'
end_date = '2022-06-01'
ticker ='BTC-USD'

df = yf.download(ticker,start_date,end_date)

[*********************100%%**********************]  1 of 1 completed


In [3]:
# Add returns range and bench_cum_rets
df['returns'] = df['Close'].pct_change()
df['range'] = df['High']/df['Low'] - 1
df['exp_rets_cum']= (1 + df['returns']).cumprod() - 1
df.dropna(inplace=True)
df_fe=df.copy()

##### Add Indicators

In [4]:
# RSI
rsi = RSIIndicator(close=df_fe['Close'],window=14).rsi()
df_fe['rsi']=rsi

# Bollinger Bands Indicator
indicator_bb = BollingerBands(close=df["Close"], window=20, window_dev=2)

# Add Bollinger Bands features
df_fe['bb_bbm'] = indicator_bb.bollinger_mavg()
df_fe['bb_bbh'] = indicator_bb.bollinger_hband()
df_fe['bb_bbl'] = indicator_bb.bollinger_lband()

# Moving Averages
df_fe['ma_20']=df_fe['Close'].rolling(window=20).mean()
df_fe['ma_50']=df_fe['Close'].rolling(window=50).mean()
df_fe['ma_100']=df_fe['Close'].rolling(window=50).mean()

# Day of Week
df_fe['dow']=df_fe.index.day_of_week

# Ichimoku
ichimoku = IchimokuIndicator(high=df_fe['High'],low=df_fe['Low'],window1=7,window2=14,window3=21)

df_fe['tenkan_sen'] = ichimoku.ichimoku_conversion_line()
df_fe['kijun_sen'] = ichimoku.ichimoku_base_line()
df_fe['senkou_span_a'] = ichimoku.ichimoku_a()
df_fe['senkou_span_b'] = ichimoku.ichimoku_b()

In [5]:
# Look back in time for certain indicators

t_steps = [1,2,3,4,5,6,7] # day intervals to look back
t_features=['returns','range','rsi','ma_20','ma_50','ma_100']

for step in t_steps:
  for feature in t_features:
    df_fe[f"{feature}_T{step}"]=df_fe[feature].shift(step)


In [6]:
# Show DF
df_fe.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,returns,range,exp_rets_cum,rsi,...,rsi_T6,ma_20_T6,ma_50_T6,ma_100_T6,returns_T7,range_T7,rsi_T7,ma_20_T7,ma_50_T7,ma_100_T7
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-05-27,29251.140625,29346.943359,28326.613281,28627.574219,28627.574219,36582005748,-0.021856,0.03602,27.675606,36.240181,...,36.380962,32305.261523,37627.045391,37627.045391,-0.036735,0.064993,35.393691,32757.104883,37964.03375,37964.03375
2022-05-28,28622.625,28814.900391,28554.566406,28814.900391,28814.900391,35519577634,0.006544,0.009117,27.863246,37.238957,...,40.172711,31894.98125,37316.140859,37316.140859,0.007927,0.019343,36.380962,32305.261523,37627.045391,37627.045391
2022-05-29,29019.867188,29498.009766,28841.107422,29445.957031,29445.957031,18093886409,0.0219,0.022777,28.495361,40.613855,...,36.917221,31462.404102,36969.047734,36969.047734,0.03029,0.039306,40.172711,31894.98125,37316.140859,37316.140859
2022-05-30,29443.365234,31949.630859,29303.572266,31726.390625,31726.390625,39277993274,0.077445,0.090298,30.779621,50.890862,...,39.323912,30960.264844,36629.705938,36629.705938,-0.040391,0.055738,36.917221,31462.404102,36969.047734,36969.047734
2022-05-31,31723.865234,32249.863281,31286.154297,31792.310547,31792.310547,33538210634,0.002078,0.030803,30.845652,51.154006,...,39.055191,30609.625879,36309.83332,36309.83332,0.01913,0.034313,39.323912,30960.264844,36629.705938,36629.705938


In [7]:
# Make Volume Column more Stationary
df_fe['Volume']=df_fe['Volume'].pct_change()

In [8]:
# Check for NaN's
df_fe.dropna(inplace=True)

# Check for Inf
df_inf = df_fe.isin([np.inf,-np.inf])
count = np.isinf(df_inf).values.sum()
count

df_fe.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1920 entries, 2017-02-27 to 2022-05-31
Data columns (total 63 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Open           1920 non-null   float64
 1   High           1920 non-null   float64
 2   Low            1920 non-null   float64
 3   Close          1920 non-null   float64
 4   Adj Close      1920 non-null   float64
 5   Volume         1920 non-null   float64
 6   returns        1920 non-null   float64
 7   range          1920 non-null   float64
 8   exp_rets_cum   1920 non-null   float64
 9   rsi            1920 non-null   float64
 10  bb_bbm         1920 non-null   float64
 11  bb_bbh         1920 non-null   float64
 12  bb_bbl         1920 non-null   float64
 13  ma_20          1920 non-null   float64
 14  ma_50          1920 non-null   float64
 15  ma_100         1920 non-null   float64
 16  dow            1920 non-null   int64  
 17  tenkan_sen     1920 non-null   flo

In [9]:
# Save the DF
df_fe.to_csv('data/btc-usd.csv')