In [1]:
# Import

import pandas as pd
import numpy as np
import hvplot.pandas
from pathlib import Path
from datetime import datetime, timedelta
import yfinance as yf


# Setting these options will allow for reviewing more of the DataFrames
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)
pd.set_option('display.width', 1000)

## Fetch the data from yfinance

In [2]:
# Import the dataset from yfinance
stock_symbol_BTC = "BTC-USD"  # Change this to the desired stock symbol
stock_symbol_ETH = "ETH-USD"  # Change this to the desired stock symbol
stock_symbol_BNB = "BNB-USD"  # Change this to the desired stock symbol
stock_symbol_XRP = "XRP-USD"  # Change this to the desired stock symbol
index_symbol_us10 = "^TNX"
# Calculate the start and end dates for the past 5 years
end_date = datetime.today().date()
start_date = end_date - timedelta(days=3*365)  # 5 years * 365 days

# Fetch historical price data
stock_data_BTC = yf.download(stock_symbol_BTC, start=start_date, end=end_date)
stock_data_ETH = yf.download(stock_symbol_ETH, start=start_date, end=end_date)
stock_data_BNB = yf.download(stock_symbol_BNB, start=start_date, end=end_date)
stock_data_XRP = yf.download(stock_symbol_XRP, start=start_date, end=end_date)
index_data_us10 = yf.download(index_symbol_us10, start=start_date, end=end_date)
# Print the retrieved data
print(stock_data_BTC)
print(stock_data_ETH)
print(stock_data_BNB)
print(stock_data_XRP) 
print(index_data_us10)

[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
[*********************100%%**********************]  1 of 1 completed
                    Open          High           Low         Close     Adj Close        Volume
Date                                                                                          
2020-10-17  11322.123047  11386.261719  11285.345703  11358.101562  11358.101562   19130430174
2020-10-18  11355.982422  11483.359375  11347.578125  11483.359375  11483.359375   18283314340
2020-10-19  11495.038086  11799.092773  11408.290039  11742.037109  11742.037109   23860769928
2020-10-20  11745.974609  11999.917969  11681.480469  11916.334961  11916.334961   30915821592
2020-10-21  11913.077148  13184.566406  11900.928711  12823.689453  12823.689453   43

In [3]:
Combined_df = pd.concat([stock_data_BTC.Close,stock_data_ETH.Close,stock_data_BNB.Close,stock_data_XRP.Close],axis=1)
Combined_df.columns=['BTC','ETH','BNB','XRP']
Combined_df.tail()

Unnamed: 0_level_0,BTC,ETH,BNB,XRP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-12,26756.798828,1539.612427,205.229416,0.48305
2023-10-13,26862.375,1552.089478,206.036118,0.485699
2023-10-14,26861.707031,1555.256836,206.601898,0.486775
2023-10-15,27159.652344,1558.069824,209.742508,0.487846
2023-10-16,28519.466797,1600.534302,214.823959,0.497977


In [4]:
df_normalized = Combined_df / Combined_df.iloc[0]
df_normalized.head()

Unnamed: 0_level_0,BTC,ETH,BNB,XRP
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-10-17,1.0,1.0,1.0,1.0
2020-10-18,1.011028,1.02537,1.013597,1.005371
2020-10-19,1.033803,1.030038,0.989316,1.02089
2020-10-20,1.049148,1.000762,0.952236,1.010966
2020-10-21,1.129035,1.063261,0.984267,1.044345


In [5]:
Combined_df[['BTC_return','ETH_return','BNB_return','XRP_return']] = Combined_df.pct_change()
Combined_df.dropna().head()

Unnamed: 0_level_0,BTC,ETH,BNB,XRP,BTC_return,ETH_return,BNB_return,XRP_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-10-18,11483.359375,378.213684,30.659523,0.242225,0.011028,0.02537,0.013597,0.005371
2020-10-19,11742.037109,379.935608,29.925068,0.245964,0.022526,0.004553,-0.023955,0.015436
2020-10-20,11916.334961,369.136902,28.803444,0.243573,0.014844,-0.028422,-0.037481,-0.009721
2020-10-21,12823.689453,392.189972,29.772354,0.251615,0.076144,0.062451,0.033639,0.033017
2020-10-22,12965.891602,413.77298,30.658192,0.256807,0.011089,0.055032,0.029754,0.020635


In [6]:
df_btc = stock_data_BTC[["Open","High","Low","Close","Volume"]]
df_btc = df_btc / df_btc.iloc[0]
df_btc["BTC_return"] = Combined_df["BTC_return"]
df_btc = pd.DataFrame(df_btc)
df_btc.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,BTC_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-12,2.373521,2.364379,2.353346,2.355746,0.490993,-0.004336
2023-10-13,2.362885,2.37942,2.364688,2.365041,0.792732,0.003946
2023-10-14,2.372894,2.368556,2.376054,2.364982,0.281652,-2.5e-05
2023-10-15,2.372171,2.396675,2.376347,2.391214,0.371042,0.011092
2023-10-16,2.399076,2.586287,2.404044,2.510936,1.454953,0.050067


In [7]:
df_eth = stock_data_ETH[["Open","High","Low","Close","Volume"]]
df_eth["ETH_return"] = Combined_df["ETH_return"]
df_eth = pd.DataFrame(df_eth)
df_eth.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ETH_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-12,1566.355957,1566.878174,1523.237549,1539.612427,5003930677,-0.01701
2023-10-13,1539.432861,1571.75061,1537.921143,1552.089478,4575141511,0.008104
2023-10-14,1552.263794,1560.325073,1545.73877,1555.256836,2429214718,0.002041
2023-10-15,1555.07605,1565.760864,1550.554565,1558.069824,2923337883,0.001809
2023-10-16,1558.313477,1628.15625,1555.989624,1600.534302,8846928526,0.027255


In [8]:
df_bnb = stock_data_BNB[["Open","High","Low","Close","Volume"]]
df_bnb["BNB_return"] = Combined_df["BNB_return"]
df_bnb = pd.DataFrame(df_bnb)
df_bnb.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,BNB_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-12,206.535873,206.659103,203.658447,205.229416,302337006,-0.006314
2023-10-13,205.2258,207.942535,204.617615,206.036118,296977151,0.003931
2023-10-14,206.040283,207.025543,205.827286,206.601898,203846460,0.002746
2023-10-15,206.600281,210.231613,206.478577,209.742508,352524312,0.015201
2023-10-16,209.74881,217.121231,209.74881,214.823959,521099239,0.024227


In [9]:
df_xrp = stock_data_XRP[["Open","High","Low","Close","Volume"]]
df_xrp["XRP_return"] = Combined_df["XRP_return"]
df_xrp = pd.DataFrame(df_xrp)
df_xrp.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,XRP_return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-10-12,0.488798,0.488857,0.475637,0.48305,812745907,-0.01184
2023-10-13,0.483049,0.489789,0.479341,0.485699,696915013,0.005484
2023-10-14,0.48569,0.48798,0.483974,0.486775,357321203,0.002215
2023-10-15,0.486773,0.489992,0.485665,0.487846,385469444,0.0022
2023-10-16,0.487838,0.505458,0.486157,0.497977,1082594336,0.020767


---

## Part 1: 

## Data Preparation

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Needed for decision tree visualization
import pydotplus
from IPython.display import Image

In [11]:
df_btc = df_btc.dropna()

In [12]:
# Define features set
X_btc = df_btc.copy()
X_btc.drop(["Close","BTC_return"], axis=1, inplace=True)
X_btc.dropna().head()
X_btc = X_btc.iloc[1:]


#### Create the target set

In [13]:
# Define target vector
#y_btc = df_btc["Close"].shift()


In [14]:
#y_btc = df_btc["Close"].shift().ravel()[1:]

In [15]:
y_btc = df_btc["Close"].shift().iloc[1:]

In [16]:
X_btc.dropna().tail()


Unnamed: 0_level_0,Open,High,Low,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-10-12,2.373521,2.364379,2.353346,0.490993
2023-10-13,2.362885,2.37942,2.364688,0.792732
2023-10-14,2.372894,2.368556,2.376054,0.281652
2023-10-15,2.372171,2.396675,2.376347,0.371042
2023-10-16,2.399076,2.586287,2.404044,1.454953


In [17]:
from pandas.tseries.offsets import DateOffset
training_begin = X_btc.index.min()
training_end = X_btc.index.min() + DateOffset(months=36)
X_train_btc = X_btc.loc[training_begin:training_end]
y_train_btc = y_btc.loc[training_begin:training_end]
X_test_btc = X_btc.loc[training_end:]
y_test_btc = y_btc.loc[training_end:]

In [18]:
training_end

Timestamp('2023-10-19 00:00:00')

In [19]:
training_begin

Timestamp('2020-10-19 00:00:00')

In [20]:
#y_test_btc

In [21]:
# Splitting into Train and Test sets
X_train_btc, X_test_btc, y_train_btc, y_test_btc = train_test_split(X_btc, y_btc, random_state=78)


In [22]:
# Creating StandardScaler instance
#scaler_btc = StandardScaler()
# Fitting Standard Scaller
#X_scaler_btc = scaler_btc.fit(X_train_btc)

In [23]:
# Transform the X_train and X_test DataFrames using the X_scaler
#X_train_scaled_btc = X_scaler_btc.transform(X_train_btc)
#X_test_scaled_btc = X_scaler_btc.transform(X_test_btc)

In [24]:
#X_train_scaled_btc

## Fitting the Random Forest Model

In [52]:
# Create a random forest classifier
rfr_model = RandomForestRegressor(n_estimators=100, random_state=78)


In [53]:
# Fitting the model
rfr_model_btc = rfr_model.fit(X_train_btc, y_train_btc)

In [54]:
# Making predictions using the testing data
predictions = rfr_model.predict(X_test_btc)

In [55]:
predictions

array([2.55831751, 1.83288214, 5.03141379, 4.3344823 , 1.48327663,
       2.50701611, 2.4825865 , 1.68663299, 1.49092344, 2.69171925,
       4.46385839, 2.28662767, 3.30086672, 2.67905784, 2.37013838,
       3.77188202, 1.42893445, 1.14869938, 5.12126736, 2.57605743,
       2.05336711, 3.49846931, 4.122525  , 2.46791146, 1.71247553,
       2.85973264, 2.03903079, 3.83861616, 2.60996093, 3.44843935,
       2.58459947, 1.76718621, 2.00884356, 2.79687708, 1.50813625,
       1.61245781, 3.56469956, 4.05307016, 2.1648696 , 2.44740722,
       2.43202735, 2.06383508, 1.97336524, 1.47566566, 4.19443512,
       2.30150142, 4.9911277 , 3.45381922, 2.61247389, 1.48761422,
       1.70410631, 2.6682044 , 1.81910292, 2.49076275, 2.66946949,
       3.77016524, 2.39625984, 3.01903442, 3.02625716, 1.47029474,
       2.11421734, 2.66171886, 2.49966138, 1.88606774, 1.85665765,
       2.68400205, 3.65962023, 1.86404726, 2.17285011, 3.71732531,
       2.5204498 , 5.36408371, 4.71733647, 2.98521236, 2.03686

In [56]:
# Create a copy of the original data
df_predict = pd.DataFrame({"predictions":predictions, "y_test_btc":y_test_btc})

# Display sample data
df_predict = df_predict.reset_index()
df_predict = df_predict.set_index("Date")
df_predict.sort_values(by='Date', ascending=True).tail()


Unnamed: 0_level_0,predictions,y_test_btc
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-09-11,2.241257,2.274344
2023-09-20,2.39626,2.395745
2023-09-30,2.370138,2.369385
2023-10-03,2.42252,2.42389
2023-10-05,2.447407,2.447539


In [57]:
# Create a line plot of the predicted salary values
best_fit_line = df_predict.hvplot.scatter(
    x = "predictions",
    y = "y_test_btc",
    color = "red"
)
best_fit_line

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


## Model Evaluation

In [58]:
# Import relevant metrics - score, r2, mse, rmse, std - from Scikit-learn
from sklearn.metrics import mean_squared_error, r2_score

In [59]:
# Compute the metrics for the linear regression model
score = rfr_model.score(X_test_btc, y_test_btc, sample_weight=None)
r2 = r2_score(y_test_btc, predictions)
mse = mean_squared_error(y_test_btc, predictions)
rmse = np.sqrt(mse)
std = np.std(y_test_btc)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.999891683620886.
The r2 is 0.999891683620886.
The mean squared error is 0.0001454050337784201.
The root mean squared error is 0.012058400962748754.
The standard deviation is 1.1586243565889187.


In [60]:
# We will compare the actual close of the previous day to the predicted close value.
# If it is higher, we will buy, else sell
df_predict['previous_close'] = df_predict['y_test_btc'].shift(1)
df_predict = df_predict.dropna()


In [61]:
# Create a new column in the trading_df called signal setting its value to zero.
df_predict["signal"] = 0.0

# Create the signal to buy
df_predict.loc[(df_predict["predictions"] >= df_predict['previous_close']), "signal"] = 1

# Create the signal to sell
df_predict.loc[(df_predict["predictions"] < df_predict['previous_close']), "signal"] = -1

# Copy the new signal column to a new Series called y.
df_predict.sort_values(by='Date', ascending=True).tail()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predict["signal"] = 0.0


Unnamed: 0_level_0,predictions,y_test_btc,previous_close,signal
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-09-11,2.241257,2.274344,2.62064,-1.0
2023-09-20,2.39626,2.395745,3.760892,-1.0
2023-09-30,2.370138,2.369385,2.676421,-1.0
2023-10-03,2.42252,2.42389,3.804204,-1.0
2023-10-05,2.447407,2.447539,2.169489,1.0


In [62]:
# Calculate the points in time when the Signal value changes
# Identify trade entry (1) and exit (-1) points
df_predict['Entry/Exit'] = df_predict['signal'].diff()

# Review the DataFrame
df_predict.tail(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_predict['Entry/Exit'] = df_predict['signal'].diff()


Unnamed: 0_level_0,predictions,y_test_btc,previous_close,signal,Entry/Exit
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-28,5.14828,5.148958,4.362186,1.0,0.0
2022-03-19,3.677622,3.680294,5.148958,-1.0,-2.0
2022-05-10,2.665793,2.667431,3.680294,-1.0,0.0
2020-12-12,1.587969,1.589958,2.667431,-1.0,0.0
2020-11-17,1.469376,1.471735,1.589958,-1.0,0.0
2022-08-16,2.126394,2.125089,1.471735,1.0,2.0
2023-03-21,2.443985,2.444708,2.125089,1.0,0.0
2022-10-29,1.816264,1.813274,2.444708,-1.0,-2.0
2022-09-24,1.699032,1.69902,1.813274,-1.0,0.0
2021-02-10,4.089695,4.092331,1.69902,1.0,2.0


In [63]:
df_combined_predict = pd.concat([df_predict,df_btc.Close],axis=1)
df_combined_predict.dropna().tail()

Unnamed: 0_level_0,predictions,y_test_btc,previous_close,signal,Entry/Exit,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-09-11,2.241257,2.274344,2.62064,-1.0,0.0,2.215393
2023-09-20,2.39626,2.395745,3.760892,-1.0,-2.0,2.38878
2023-09-30,2.370138,2.369385,2.676421,-1.0,0.0,2.374333
2023-10-03,2.42252,2.42389,3.804204,-1.0,0.0,2.415014
2023-10-05,2.447407,2.447539,2.169489,1.0,2.0,2.413776


In [64]:
# Visualize exit position relative to close price
exit = df_combined_predict[df_combined_predict['Entry/Exit'] == -2.0]['Close'].hvplot.scatter(
    color='red',
    marker='v',
    size=200,
    legend=False,
    ylabel='Price in $',
    width=1000,
    height=400
)

# Visualize entry position relative to close price
entry = df_combined_predict[df_combined_predict['Entry/Exit'] == 2.0]['Close'].hvplot.scatter(
    color='green',
    marker='^',
    size=200,
    legend=False,
    ylabel='Price in $',
    width=1000,
    height=400
)

# Visualize close price for the investment
security_close = df_combined_predict[['Close']].hvplot(
    line_color='lightgray',
    ylabel='Price in $',
    width=1000,
    height=400
)

# Visualize moving averages
moving_avgs = df_combined_predict[['predictions', 'y_test_btc']].hvplot(
    ylabel='Price in $',
    width=1000,
    height=400
)

# Create the overlay plot
entry_exit_plot = security_close * moving_avgs * entry * exit

# Show the plot with a title
entry_exit_plot.opts(
    title="BTC - predictions, y_test_btc, Entry and Exit Points"
)

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [83]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
import numpy as np

n_samples, n_features = 1000, 20
rng = np.random.RandomState(0)
X_btc = df_btc.copy()
X_btc.drop(["Close","BTC_return"], axis=1, inplace=True)
X_btc.dropna()
# positive integer target correlated with X[:, 5] with many zeros:
y_btc = df_btc["Close"]
X_train_btc, X_test_btc, y_train_btc, y_test_btc = train_test_split(X_btc, y_btc, random_state=rng)
regressor = DecisionTreeRegressor(criterion="poisson", random_state=0)
regressor.fit(X_train_btc, y_train_btc)

In [86]:
# Making predictions using the testing data
predictions_rfr = regressor.predict(X_test_btc)
predictions_rfr

array([1.50806229, 1.83661408, 1.82868953, 2.66733014, 3.87052881,
       1.40477595, 4.9205523 , 4.75148486, 4.50838471, 4.3050084 ,
       2.3524434 , 2.31794572, 3.71458664, 2.67069881, 1.91662803,
       2.31794572, 1.24437231, 4.99243416, 1.86929621, 2.55379902,
       2.2760177 , 1.99057597, 3.05065458, 1.75693475, 5.57434809,
       1.63354025, 2.04371639, 2.35814744, 2.04623507, 2.31957761,
       2.41448182, 2.37905487, 5.12694098, 2.63330432, 2.06684776,
       4.11222028, 3.66489022, 4.3057805 , 2.31957761, 4.82224761,
       2.0394807 , 4.22122115, 2.40153779, 1.44956278, 1.60039812,
       2.27756361, 3.80404578, 2.58135094, 4.5565194 , 2.49587849,
       2.65020219, 2.58449641, 5.08945673, 2.98439371, 2.00386563,
       2.560622  , 2.9464517 , 2.68244549, 3.72312783, 3.06130226,
       3.30822713, 1.61734792, 2.08978139, 5.05375809, 1.99297996,
       2.3613077 , 3.49679503, 1.91829621, 3.27119822, 3.71519091,
       1.83661408, 3.0850962 , 4.93115425, 1.48966837, 2.70251

In [88]:
df_predict_rfr = pd.DataFrame({"predictions":predictions_rfr, "y_test_btc":y_test_btc})


In [89]:
# Create a line plot of the predicted salary values
best_fit_line = df_predict_rfr.hvplot.scatter(
    x = "predictions",
    y = "y_test_btc",
    color = "red"
)
best_fit_line

  return dataset.data.dtypes[idx].type
  return dataset.data.dtypes[idx].type


In [97]:
# Compute the metrics for the linear regression model
score = regressor.score(X_test_btc, y_test_btc, sample_weight=None)
r2 = r2_score(y_test_btc, predictions_rfr)
mse = mean_squared_error(y_test_btc, predictions_rfr)
rmse = np.sqrt(mse)
std = np.std(y_test_btc)

# Print relevant metrics.
print(f"The score is {score}.")
print(f"The r2 is {r2}.")
print(f"The mean squared error is {mse}.")
print(f"The root mean squared error is {rmse}.")
print(f"The standard deviation is {std}.")

The score is 0.99578349740984.
The r2 is 0.99578349740984.
The mean squared error is 0.005333822389789824.
The root mean squared error is 0.07303302259793047.
The standard deviation is 1.1247165592339243.
