In [48]:
# Imports
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import plotly.graph_objects as go  # Using plotly.graph_objects for more control over the plots


## Import/clean Raw data

In [49]:
# Load the raw ETH/USD daily price history and index it by date.

# 1. Data Loading & Preprocessing
eth_data_corrected = (
    pd.read_csv('Eth_USD_18_23.csv')
    # Convert the textual 'Date' column to datetimes before promoting it to the index.
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .set_index('Date')
)

# Preview the leading rows as a sanity check on the parsed index and columns.
eth_data_corrected.head()


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2018-07-27,464.009003,473.221985,458.290985,469.665985,469.665985,1734260000.0
2018-07-28,469.678009,471.593994,462.989014,466.89801,466.89801,1531890000.0
2018-07-29,466.915009,470.355988,462.712006,466.665009,466.665009,1631910000.0
2018-07-30,466.826996,467.951996,448.640991,457.080994,457.080994,2141590000.0
2018-07-31,457.244995,457.244995,430.444,433.867004,433.867004,1820680000.0


In [50]:
# Keep only the closing price, as an independent copy, so the feature columns
# added later never modify 'eth_data_corrected'.
eth_close_corrected = eth_data_corrected.loc[:, ['Close']].copy()


In [51]:
# 2. Feature Engineering
# Trailing moving averages of the close over 30-row and 10-row windows
# (one row per day in this dataset). Added in that order: RollingAvg_30, RollingAvg_10.
for window_days in (30, 10):
    eth_close_corrected[f'RollingAvg_{window_days}'] = (
        eth_close_corrected['Close'].rolling(window=window_days).mean()
    )


In [52]:
# Day-over-day fractional change of the closing price (one-row lag).
eth_close_corrected['Daily_PctChange'] = eth_close_corrected.Close.pct_change(periods=1)


In [53]:
# Row-over-row fractional change of each rolling average.
# Added in that order: 10R_PctChange, 30R_PctChange.
for window_days in (10, 30):
    eth_close_corrected[f'{window_days}R_PctChange'] = (
        eth_close_corrected[f'RollingAvg_{window_days}'].pct_change()
    )


In [54]:
# Drop, in place, every row that has at least one NaN (e.g. the leading rows
# produced before the rolling windows and percentage changes are defined).
eth_close_corrected.dropna(axis=0, how='any', inplace=True)

In [55]:
# Redefining the assign_target function

def assign_target(row):
    """Label one row by whether all three change indicators agree in sign.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide 'Daily_PctChange', '10R_PctChange' and '30R_PctChange'.

    Returns
    -------
    float
        1.0 when all three values are >= 0, 0.0 when all three are < 0,
        and NaN when the indicators disagree (or a value is NaN, since every
        comparison with NaN is false).
    """
    trend_columns = ('Daily_PctChange', '10R_PctChange', '30R_PctChange')

    # all() stops at the first failing column, so values are read lazily
    # and in the same order as a chained `and` would read them.
    if all(row[column] >= 0 for column in trend_columns):
        return 1.0
    if all(row[column] < 0 for column in trend_columns):
        return 0.0
    return np.nan

# Label every row with the combined-trend target (1.0, 0.0 or NaN).
eth_close_corrected['Target_Complex'] = eth_close_corrected.apply(assign_target, axis=1)

# Peek at the first labels to confirm the column was populated.
eth_close_corrected['Target_Complex'].head()

Date
2018-08-26    0.0
2018-08-27    NaN
2018-08-28    NaN
2018-08-29    0.0
2018-08-30    NaN
Name: Target_Complex, dtype: float64

In [56]:
# 3. Target Definition
# From eth_ML.ipynb: 1.0 / 0.0 when all three change indicators agree, NaN otherwise.
eth_close_corrected['Target_Complex'] = eth_close_corrected.apply(assign_target, axis=1)

# From eth_ML2.ipynb: 1.0 when the daily change is non-negative, else 0.0.
daily_change_is_up = eth_close_corrected['Daily_PctChange'] >= 0
eth_close_corrected['Target_Simple'] = daily_change_is_up.astype('float64')


In [57]:
# 4. Data Splitting and rest of the steps (as provided in the original code)
# NOTE(review): the feature matrices below keep Daily_PctChange, 10R_PctChange
# and 30R_PctChange, the same-row columns both targets are computed from, and
# train_test_split shuffles rows by default. Expect leakage-inflated scores;
# confirm this setup is intended.

# Features / labels for Target_Complex (labels may still contain NaN here).
X_complex_corrected = eth_close_corrected.drop(['Target_Complex', 'Target_Simple'], axis=1)
y_complex_corrected = eth_close_corrected['Target_Complex']
(
    X_train_complex_corrected,
    X_test_complex_corrected,
    y_train_complex_corrected,
    y_test_complex_corrected,
) = train_test_split(X_complex_corrected, y_complex_corrected, random_state=1)

# Features / labels for Target_Simple.
X_simple_corrected = eth_close_corrected.drop(['Target_Complex', 'Target_Simple'], axis=1)
y_simple_corrected = eth_close_corrected['Target_Simple']
(
    X_train_simple_corrected,
    X_test_simple_corrected,
    y_train_simple_corrected,
    y_test_simple_corrected,
) = train_test_split(X_simple_corrected, y_simple_corrected, random_state=1)


In [58]:
# 5. Data Scaling
# Each scaler learns its mean/variance from the training split only and is then
# reused, unchanged, on the matching test split.
scaler_complex_corrected = StandardScaler()
X_train_complex_scaled_corrected = scaler_complex_corrected.fit_transform(X_train_complex_corrected)
X_test_complex_scaled_corrected = scaler_complex_corrected.transform(X_test_complex_corrected)

scaler_simple_corrected = StandardScaler()
X_train_simple_scaled_corrected = scaler_simple_corrected.fit_transform(X_train_simple_corrected)
X_test_simple_scaled_corrected = scaler_simple_corrected.transform(X_test_simple_corrected)


In [43]:
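# 6. Model Creation & Training
# (Left commented out: at this point the Target_Complex fit fails because y_train_complex_corrected still contains NaN.)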
# rf_model_complex_corrected = RandomForestClassifier(n_estimators=500, random_state=1).fit(X_train_complex_scaled_corrected, y_train_complex_corrected)
# rf_model_simple_corrected = RandomForestClassifier(n_estimators=500, random_state=1).fit(X_train_simple_scaled_corrected, y_train_simple_corrected)


The error indicates that there are NaN values in the target column y_train_complex_corrected, which is the 'Target_Complex' column. The RandomForestClassifier cannot handle NaN values in the target variable.

The NaN values arise from the assign_target function, where we assigned a value of np.nan when conditions for neither upward nor downward trend were met.

In [59]:
# Drop, in place, the rows whose 'Target_Complex' label is NaN (mixed-signal days).
# The Target_Simple split created earlier is not rebuilt here, so it keeps all rows.
eth_close_corrected.dropna(subset=['Target_Complex'], inplace=True)

# Rebuild the Target_Complex features / labels from the filtered frame.
X_complex_corrected = eth_close_corrected.drop(['Target_Complex', 'Target_Simple'], axis=1)
y_complex_corrected = eth_close_corrected['Target_Complex']

(
    X_train_complex_corrected,
    X_test_complex_corrected,
    y_train_complex_corrected,
    y_test_complex_corrected,
) = train_test_split(X_complex_corrected, y_complex_corrected, random_state=1)

# Refit the scaler on the new training split and apply it to both splits.
scaler_complex_corrected = StandardScaler()
X_train_complex_scaled_corrected = scaler_complex_corrected.fit_transform(X_train_complex_corrected)
X_test_complex_scaled_corrected = scaler_complex_corrected.transform(X_test_complex_corrected)

# Train the forest now that the labels are NaN-free.
rf_model_complex_corrected = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model_complex_corrected.fit(X_train_complex_scaled_corrected, y_train_complex_corrected)

# Accuracy on the training split itself: only confirms the fit ran, not generalisation.
training_accuracy = rf_model_complex_corrected.score(
    X_train_complex_scaled_corrected, y_train_complex_corrected
)
training_accuracy


1.0

In [60]:
# 6. Model Creation & Training
# One 500-tree forest per target, both seeded with random_state=1.
rf_model_complex_corrected = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model_complex_corrected.fit(X_train_complex_scaled_corrected, y_train_complex_corrected)

rf_model_simple_corrected = RandomForestClassifier(n_estimators=500, random_state=1)
rf_model_simple_corrected.fit(X_train_simple_scaled_corrected, y_train_simple_corrected)


In [61]:
# 7. Model Evaluation
# --- Target_Complex model: predictions and metrics on its held-out split ---
predictions_complex_corrected = rf_model_complex_corrected.predict(X_test_complex_scaled_corrected)
cm_complex_corrected = confusion_matrix(y_test_complex_corrected, predictions_complex_corrected)
acc_score_complex_corrected = accuracy_score(y_test_complex_corrected, predictions_complex_corrected)
class_report_complex_corrected = classification_report(y_test_complex_corrected, predictions_complex_corrected)
# Wrap the raw matrix in a labelled frame (rows = actual class, columns = predicted class).
cm_complex_df_corrected = pd.DataFrame(
    cm_complex_corrected,
    index=["Actual 0 (Complex)", "Actual 1 (Complex)"],
    columns=["Predicted 0 (Complex)", "Predicted 1 (Complex)"],
)

# --- Target_Simple model: same metrics on its own held-out split ---
predictions_simple_corrected = rf_model_simple_corrected.predict(X_test_simple_scaled_corrected)
cm_simple_corrected = confusion_matrix(y_test_simple_corrected, predictions_simple_corrected)
acc_score_simple_corrected = accuracy_score(y_test_simple_corrected, predictions_simple_corrected)
class_report_simple_corrected = classification_report(y_test_simple_corrected, predictions_simple_corrected)
cm_simple_df_corrected = pd.DataFrame(
    cm_simple_corrected,
    index=["Actual 0 (Simple)", "Actual 1 (Simple)"],
    columns=["Predicted 0 (Simple)", "Predicted 1 (Simple)"],
)

# Display everything as one tuple: complex results first, then simple.
(
    cm_complex_df_corrected,
    acc_score_complex_corrected,
    class_report_complex_corrected,
    cm_simple_df_corrected,
    acc_score_simple_corrected,
    class_report_simple_corrected,
)

(                    Predicted 0 (Complex)  Predicted 1 (Complex)
 Actual 0 (Complex)                     83                      0
 Actual 1 (Complex)                      0                    107,
 1.0,
 '              precision    recall  f1-score   support\n\n         0.0       1.00      1.00      1.00        83\n         1.0       1.00      1.00      1.00       107\n\n    accuracy                           1.00       190\n   macro avg       1.00      1.00      1.00       190\nweighted avg       1.00      1.00      1.00       190\n',
                    Predicted 0 (Simple)  Predicted 1 (Simple)
 Actual 0 (Simple)                   214                     0
 Actual 1 (Simple)                     0                   235,
 1.0,
 '              precision    recall  f1-score   support\n\n         0.0       1.00      1.00      1.00       214\n         1.0       1.00      1.00      1.00       235\n\n    accuracy                           1.00       449\n   macro avg       1.00      1.00 

Model with 'Target_Complex':
Confusion Matrix:

Predicted 0 (Downward Trend) and Actual 0: 83
Predicted 1 (Upward Trend) and Actual 1: 107
No misclassifications.
Accuracy Score: 100% (1.0)

Classification Report:

Precision, Recall, and F1-score are all 100% for both classes.
Model with 'Target_Simple':
Confusion Matrix:

Predicted 0 (Downward Trend) and Actual 0: 214
Predicted 1 (Upward Trend) and Actual 1: 235
No misclassifications.
Accuracy Score: 100% (1.0)

Classification Report:

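Precision, Recall, and F1-score are all 100% for both classes.

Caveat: both targets are computed directly from Daily_PctChange, 10R_PctChange and 30R_PctChange, and those same columns are included in the feature matrices. The perfect scores therefore reflect target leakage (the model can read the answer from its inputs) rather than genuine predictive skill.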