In [49]:
# Imports
import pandas as pd
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt
import hvplot.pandas

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

%matplotlib inline

## Import/clean Raw data

In [2]:
# Create and view Dataframe
csv_path = Path("../Resources/Eth_USD_18_23.csv")
eth_data = pd.read_csv(csv_path)
eth_data.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2018-07-27,464.009003,473.221985,458.290985,469.665985,469.665985,1734260000.0
1,2018-07-28,469.678009,471.593994,462.989014,466.89801,466.89801,1531890000.0
2,2018-07-29,466.915009,470.355988,462.712006,466.665009,466.665009,1631910000.0
3,2018-07-30,466.826996,467.951996,448.640991,457.080994,457.080994,2141590000.0
4,2018-07-31,457.244995,457.244995,430.444,433.867004,433.867004,1820680000.0


In [3]:
# Check datatypes 
eth_data.dtypes

Date          object
Open         float64
High         float64
Low          float64
Close        float64
Adj Close    float64
Volume       float64
dtype: object

In [4]:
# Convert Date time
eth_data['Date'] = pd.to_datetime(eth_data['Date'])
eth_data.tail()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
1823,2023-07-24,1888.80957,1889.761963,1836.852417,1850.002075,1850.002075,6344374000.0
1824,2023-07-25,1850.027954,1867.323242,1845.981567,1857.741943,1857.741943,4163382000.0
1825,2023-07-26,1857.696533,1886.974121,1849.43689,1872.159912,1872.159912,5781548000.0
1826,2023-07-27,,,,,,
1827,2023-07-28,1860.816284,1863.987915,1860.660034,1861.275757,1861.275757,4093237000.0


In [5]:
#Confirm Datetime changed
eth_data.dtypes

Date         datetime64[ns]
Open                float64
High                float64
Low                 float64
Close               float64
Adj Close           float64
Volume              float64
dtype: object

In [6]:
#Copy dataframe date and close columns only
eth_close = eth_data[['Date', 'Close']].copy()
eth_close.set_index('Date', inplace=True)
eth_close.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2018-07-27,469.665985
2018-07-28,466.89801
2018-07-29,466.665009
2018-07-30,457.080994
2018-07-31,433.867004


## Calculate data features from raw closing data

In [7]:
#Calculate Rolling 30-day Average
eth_30_rollingMean = eth_close.rolling(window=30).mean()
eth_30_rollingMean.set_axis(['RollingAvg_30'], axis='columns', inplace=True)
eth_30_rollingMean.head()

Unnamed: 0_level_0,RollingAvg_30
Date,Unnamed: 1_level_1
2018-07-27,
2018-07-28,
2018-07-29,
2018-07-30,
2018-07-31,


In [29]:
#Calculate Rolling 10-day Average
eth_10_rollingMean = eth_close.rolling(window=10).mean()
eth_10_rollingMean.set_axis(['RollingAvg_10'], axis='columns', inplace=True)
eth_10_rollingMean.head(15)

Unnamed: 0_level_0,RollingAvg_10
Date,Unnamed: 1_level_1
2018-07-27,
2018-07-28,
2018-07-29,
2018-07-30,
2018-07-31,
2018-08-01,
2018-08-02,
2018-08-03,
2018-08-04,
2018-08-05,436.357501


In [14]:
# Calculate the Daily Returns 
"""
(tried to rename column in this cell using many strategies, i.e. .rename function, .set_axis function(which worked on the rolling average df), and .columns function. 
None of these worked here so I renamed after concating two cells down)
"""
eth_daily_returns = eth_close['Close'].pct_change()
eth_daily_returns.head()

Date
2018-07-27         NaN
2018-07-28   -0.005893
2018-07-29   -0.000499
2018-07-30   -0.020537
2018-07-31   -0.050787
Name: Close, dtype: float64

In [30]:
# Combine all results
eth_all = pd.concat([eth_close, eth_30_rollingMean, eth_10_rollingMean, eth_daily_returns], axis="columns", join="inner")
eth_all.head()

Unnamed: 0_level_0,Close,RollingAvg_30,RollingAvg_10,Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-07-27,469.665985,,,
2018-07-28,466.89801,,,-0.005893
2018-07-29,466.665009,,,-0.000499
2018-07-30,457.080994,,,-0.020537
2018-07-31,433.867004,,,-0.050787


In [31]:
# Rename daily returns column only
eth_all.set_axis(['Close', '30-Day_RollingAvg', '10-Day_RollingAvg', 'Daily_PctChange'], axis='columns', inplace=True)
eth_all.head()

Unnamed: 0_level_0,Close,30-Day_RollingAvg,10-Day_RollingAvg,Daily_PctChange
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-07-27,469.665985,,,
2018-07-28,466.89801,,,-0.005893
2018-07-29,466.665009,,,-0.000499
2018-07-30,457.080994,,,-0.020537
2018-07-31,433.867004,,,-0.050787


In [32]:
#Clean up nulls in dataframe
eth_all.dropna(inplace=True)
eth_all.head()

Unnamed: 0_level_0,Close,30-Day_RollingAvg,10-Day_RollingAvg,Daily_PctChange
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-25,279.645996,352.134536,286.773804,-0.011736
2018-08-26,275.196991,345.652236,285.488904,-0.015909
2018-08-27,285.602997,339.609069,282.476303,0.037813
2018-08-28,296.498993,333.936868,282.545001,0.038151
2018-08-29,289.312012,328.344569,281.392801,-0.024239


In [33]:
# Visualize the Rolling 30-Day Average compared to the Close
eth_all['Close'].hvplot.line(rot=45) * eth_all['30-Day_RollingAvg'].hvplot.line(label='Rolling 30-Day Average')

In [34]:
# Visualize the Rolling 30-Day Average compared to the Close
eth_all['Close'].hvplot.line(rot=45) * eth_all['10-Day_RollingAvg'].hvplot.line(label='Rolling 10-Day Average')

In [None]:
"""
col         = 'consumption_energy'
conditions  = [ df2[col] >= 400, (df2[col] < 400) & (df2[col]> 200), df2[col] <= 200 ]
choices     = [ "high", 'medium', 'low' ]
    
eth_all['Target2'] = np.select(conditions, choices, default=np.nan)
"""

In [43]:
# Create the Target Column
# Target '1' when trending upward
eth_all.loc[(eth_all['Daily_PctChange'] >= 0), 'Target'] = 1
# Target '0' when trending downward
eth_all.loc[(eth_all['Daily_PctChange'] < 0), 'Target'] = 0
eth_all.head()

Unnamed: 0_level_0,Close,30-Day_RollingAvg,10-Day_RollingAvg,Daily_PctChange,Target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-08-25,279.645996,352.134536,286.773804,-0.011736,0.0
2018-08-26,275.196991,345.652236,285.488904,-0.015909,0.0
2018-08-27,285.602997,339.609069,282.476303,0.037813,1.0
2018-08-28,296.498993,333.936868,282.545001,0.038151,1.0
2018-08-29,289.312012,328.344569,281.392801,-0.024239,0.0


## Define Data Features, Scale and Split Data

In [44]:
# Define features sets
y = eth_all['Target'].copy()
X = eth_all.copy()
X.drop('Target', axis=1, inplace=True)
X.head()

Unnamed: 0_level_0,Close,30-Day_RollingAvg,10-Day_RollingAvg,Daily_PctChange
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-08-25,279.645996,352.134536,286.773804,-0.011736
2018-08-26,275.196991,345.652236,285.488904,-0.015909
2018-08-27,285.602997,339.609069,282.476303,0.037813
2018-08-28,296.498993,333.936868,282.545001,0.038151
2018-08-29,289.312012,328.344569,281.392801,-0.024239


In [45]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [46]:
# Create instance for scaler
scaler = StandardScaler()

In [47]:
# Fit the scaler with X training info
X_scaler = scaler.fit(X_train)

In [48]:
# Transform the X data with scaler
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Run the Random Forest Classifier Model

In [50]:
# Create the Random Forest Classifier Instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=1)

In [51]:
# Fit the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [52]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [53]:
# Create the confusion matrix dataframe
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculate the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [54]:
# Display results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,234,0
Actual 1,0,216


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       234
         1.0       1.00      1.00      1.00       216

    accuracy                           1.00       450
   macro avg       1.00      1.00      1.00       450
weighted avg       1.00      1.00      1.00       450

