In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import (
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score, 
    roc_auc_score, 
    confusion_matrix
)
import matplotlib.pyplot as plt
import ETFs as etfs

In [2]:
data = pd.read_csv('../../dataProcessing/assets_filled.csv', parse_dates=['Date'])
data.set_index('Date', inplace=True)
# display(data)

In [3]:
binary_targets = pd.read_csv('../../dataProcessing/target_binary.csv', parse_dates=['Date'])
binary_targets.set_index('Date', inplace=True)

In [4]:
test_df = pd.DataFrame({'Close_yesterday': data['SPY'].shift(1),'Close Tody':data['SPY'],'Target': binary_targets['SPY']})
# display(test_df)

In [5]:
dataset_size = len(data)
train_size = int(dataset_size * 0.8)
test_size = int(dataset_size * 0.1)
val_size = dataset_size - train_size - test_size

train_data = data.iloc[:train_size]
val_data = data.iloc[train_size:train_size + val_size]
test_data = data.iloc[train_size + val_size:]

binary_targets_train = binary_targets.iloc[:train_size]
binary_targets_val = binary_targets.iloc[train_size:train_size + val_size]
binary_targets_test = binary_targets.iloc[train_size + val_size:]

In [6]:
def create_X_y(data,target_label, window_size):
    X, y, labels, first_prices = [], [], [], []
    print(len(data), window_size)
    for i in range(len(data) - window_size):
        window = data.iloc[i : i + window_size]
        first_value = window.iloc[0]
        normalized_window = window / first_value
        X.append(normalized_window.values.flatten()) 


        y_target = data.iloc[i + window_size]/ first_value
        y.append(y_target.values) 
        
        l = target_label.iloc[i+window_size]
        labels.append(l)
        
        first_prices.append(first_value)
        
    X = np.array(X)
    y = np.array(y)
    first_prices = np.array(first_prices)
    labels = np.array(labels)
        
    return X, y, first_prices, labels

window_size = 30

X_train, y_train, first_values_train, labels_train = create_X_y(train_data, binary_targets_train, window_size)
X_val, y_val, first_values_val, labels_val = create_X_y(val_data, binary_targets_val, window_size)
X_test, y_test, first_values_test, labels_test = create_X_y(test_data, binary_targets_test, window_size)

X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape, labels_train.shape, labels_val.shape, labels_test.shape


2726 30
342 30
340 30


((2696, 990),
 (2696, 33),
 (312, 990),
 (312, 33),
 (310, 990),
 (310, 33),
 (2696, 33),
 (312, 33),
 (310, 33))

In [None]:
rf = MultiOutputClassifier(
    RandomForestClassifier(
        n_estimators=100,      
        max_depth=10,         
        min_samples_split=2,  
        random_state=42,
        criterion='gini',
        class_weight='balanced',  
        n_jobs=-1
    )
)
rf.fit(X_train, labels_train)


0,1,2
,estimator,RandomForestC...ndom_state=42)
,n_jobs,

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [8]:
Y_pred = rf.predict(X_test)
y_pred_df = pd.DataFrame(Y_pred, columns=data.columns)
display(y_pred_df)

Unnamed: 0,AIEQ,ASHR.L,BCHN.L,CORN,DBO,EDEN,EMXC,EWA,EWC,EWG,...,SLV,SPY,TLT,USO,VDE,VDNR.L,VGK,VNQ,VPL,XMAF.L
0,1,0,1,0,1,0,1,1,0,1,...,0,1,0,1,0,1,1,1,0,1
1,1,1,1,0,1,0,0,1,0,1,...,0,1,0,1,0,1,0,1,1,0
2,1,0,1,0,1,0,1,1,0,1,...,0,1,1,1,0,1,1,0,0,0
3,1,0,0,0,1,0,1,0,0,1,...,0,1,1,1,0,1,0,0,0,0
4,1,0,0,0,1,0,1,1,0,0,...,0,1,0,1,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
305,0,0,0,0,1,1,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
306,0,0,0,1,1,1,0,0,1,0,...,1,1,0,1,0,0,0,1,1,1
307,0,0,0,0,1,1,0,0,1,0,...,1,1,0,1,1,1,0,0,1,1
308,0,0,0,0,1,0,0,0,1,0,...,1,0,0,1,0,1,1,0,0,1


In [9]:
from sklearn.metrics import accuracy_score

actual_flat = labels_test.flatten()
predicted_flat = Y_pred.flatten()

accuracy = accuracy_score(actual_flat, predicted_flat)

print(f"Accuracy: {accuracy:.4f}")



Accuracy: 0.4987


In [10]:

overall_accuracy = accuracy_score(actual_flat, predicted_flat)
overall_precision = precision_score(actual_flat, predicted_flat, average='macro', zero_division=0)
overall_recall = recall_score(actual_flat, predicted_flat, average='macro', zero_division=0)
overall_f1 = f1_score(actual_flat, predicted_flat, average='macro', zero_division=0)

print(f"Overall Accuracy: {overall_accuracy:.4f}")
print(f"Overall Precision (macro): {overall_precision:.4f}")
print(f"Overall Recall (macro): {overall_recall:.4f}")
print(f"Overall F1 Score (macro): {overall_f1:.4f}")





Overall Accuracy: 0.4987
Overall Precision (macro): 0.4986
Overall Recall (macro): 0.4986
Overall F1 Score (macro): 0.4985


In [None]:


# For binary classification
accuracy = accuracy_score(actual_flat, predicted_flat)
precision = precision_score(actual_flat, predicted_flat)
recall = recall_score(actual_flat, predicted_flat)
f1 = f1_score(actual_flat, predicted_flat)
roc_auc = roc_auc_score(actual_flat, predicted_flat)  # y_prob is the predicted probability for the positive class
cm = confusion_matrix(actual_flat, predicted_flat)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4}")
print(f"Confusion Matrix:\n{cm}")


Accuracy: 0.4987
Precision: 0.5108
Recall: 0.5053
F1 Score: 0.5081
AUC-ROC: 0.4986
Confusion Matrix:
[[2454 2536]
 [2592 2648]]


In [12]:
labels_test_df = pd.DataFrame(labels_test, columns=data.columns)
