In [2]:
import os

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from optStrat import OptStrat



# Location of the CSV with daily OHLC prices, volume and the risk-free rate.
# Set the ALGO_TRADING_DATA environment variable to run the notebook on another
# machine; when it is unset the original local path is used unchanged.
DATA_PATH = os.environ.get(
    'ALGO_TRADING_DATA',
    '/Users/hatim/Desktop/Applied Forecasting/Final Project/Algo Trading/Data.csv',
)

# The first CSV column holds the dates: use it as the index and parse it.
df = pd.read_csv(DATA_PATH, index_col=0, parse_dates=True)
df.head()

Unnamed: 0,Open,High,Low,Close,Volume,Risk Free Rate
2015-07-20,277.98,280.0,277.37,280.0,782.88342,0.02372
2015-07-21,279.96,281.27,276.85,277.32,4943.559434,0.0234
2015-07-22,277.33,278.54,275.01,277.89,4687.909383,0.02322
2015-07-23,277.96,279.75,276.28,277.39,5306.919575,0.02277
2015-07-24,277.23,291.52,276.43,289.12,7362.469083,0.02271


# Finding the optimal trading strategy in Hindsight

- This is a dynamic programming problem, and we can use a top-down approach to solve it.

1. Indexing:
    - Time Index: $t\in \{ 0,1,\dots ,T-1\}$ (we cannot have an optimal strategy for the last period because we do not know the future)
    - State Space: $s\in \{0,1\}$ where:
        - 0 is going short (we buy all the risk free asset that we can)
        - 1 is going long (we buy all the BTC that we can)

2. Value Function:
    - Let $V(t,s)$ be the maximum expected return from time $t$ to $T$ given that we are in state $s$ at time $t$.
    - For each time $t$ and state $s$ we have the following:
    $$
       V(t, s) = \max\{\, V_{BTC}(t, s),\ V_{RF}(t, s) \,\}
    $$
    - where:
    $$
    V_{BTC}(t, s) = r_{BTC}(t) + V(t+1, 1) - c \cdot (1-s)
    $$
    $$
    V_{RF}(t, s) = r_{RF}(t) + V(t+1, 0) - c \cdot s
    $$
    - Here:
        - $r_{BTC}(t)$ is the return (daily) of BTC at time $t$
        - $r_{RF}(t)$ is the return (daily) of the risk-free asset at time $t$
        - $c$ is the transaction cost
        - $s$ is the state of the world at time $t$


3. Recursive Relation:
    - Base Case: 
    $$
    V(T, s) = 0 \quad \forall s
    $$
    - Algorithm:
        - Initialize a memoization table of size $(T+1) \times 2$ to store the value function for each time and state. ($T+1$ rows because $t$ runs from $0$ to $T$; row $T$ holds the base case $V(T, s) = 0$.)
        - Initialize decision table of size $T \times 2$ to store the optimal decision for each time and state.
        - We then fill in these tables backwards from $t=T-1$ to $t=0$ using the recursive relation (notice that we need $V(t+1, s)$ to fill $V(t, s)$)
        - For each time step we compute both possible actions (go long or go short)




In [3]:
# Compute the hindsight-optimal strategy from the price frame.
# NOTE(review): the later display of `df` shows new 'Signals' and
# 'Strategy Returns' columns, so OptStrat appears to add them to the frame it
# is given — confirm in optStrat.py.
strat=OptStrat(df=df)

In [4]:
# Class balance of the signal labels held on the strategy's frame.
strat.df['Signals'].value_counts()

Signals
 0.0    2345
 1.0     609
-1.0     608
Name: count, dtype: int64

In [5]:
# Inspect the frame after OptStrat has run (pandas elides the middle rows).
df

Unnamed: 0,Open,High,Low,Close,Volume,Risk Free Rate,Signals,Strategy Returns
2015-07-20,277.98,280.00,277.37,280.00,782.883420,0.02372,0.0,0.000000
2015-07-21,279.96,281.27,276.85,277.32,4943.559434,0.02340,1.0,-0.004936
2015-07-22,277.33,278.54,275.01,277.89,4687.909383,0.02322,0.0,0.002055
2015-07-23,277.96,279.75,276.28,277.39,5306.919575,0.02277,0.0,-0.001799
2015-07-24,277.23,291.52,276.43,289.12,7362.469083,0.02271,0.0,0.042287
...,...,...,...,...,...,...,...,...
2025-04-15,84590.36,86491.40,83592.77,83629.78,6460.941442,0.04323,1.0,-0.004883
2025-04-16,83622.52,85526.40,83088.02,84028.72,8243.059013,0.04279,0.0,0.004770
2025-04-17,84028.71,85494.94,83711.69,84961.97,5862.464055,0.04333,0.0,0.011106
2025-04-18,84961.97,85150.94,84287.06,84466.47,1884.039112,0.04333,0.0,-0.005832


In [None]:
# Remove the hindsight 'Strategy Returns' column before the feature-engineering
# cells run. NOTE(review): presumably so it cannot leak into the features — confirm.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
df.drop(columns='Strategy Returns', inplace=True, errors='ignore')

# Learning signals and predicting for the future

## Feature Engineering

## Getting Features

In [None]:
from features import FeatureEngineer


# Build the model inputs with the project's FeatureEngineer helper (defined in
# features.py, which is not shown here).
engineer = FeatureEngineer(df)
engineer.add_all_features()
# Split into a feature matrix X and a target y.
# NOTE(review): the target is presumably the 'Signals' column — confirm.
X, y = engineer.get_feature_target_split()
# Request the k=25 best features. Only `selected_features` is used by the cells
# shown below; `X_best` and `scores` are not referenced again in them.
X_best, scores, selected_features = engineer.select_best_features(k=25)   

## Cleaning and Preprocessing

### Check Missing Values

In [None]:
# Restrict the feature matrix to the columns chosen by feature selection.
X_selected = X[selected_features]

# Show the columns with the most missing values so gaps are visible before
# scaling and model fitting (display only; nothing is modified).
X_selected.isna().sum().sort_values(ascending=False).head(10)



### Standardize Features

In [None]:
def test_train_split(X, y, test_size=0.2):
    assert len(X) == len(y)
    n = len(X)
    test_size = int(n * test_size)
    X_train = X[:-test_size]
    X_test = X[-test_size:]
    y_train = y[:-test_size]
    y_test = y[-test_size:]
    return X_train, X_test, y_train, y_test

# Hold out the final 20% of rows as the test set (no shuffling).
X_train, X_test, y_train, y_test = test_train_split(X_selected, y, test_size=0.2)

In [None]:
from sklearn.preprocessing import StandardScaler


# Learn the scaling statistics from the training rows only, then apply that
# same transformation to both partitions so no test-set information enters the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)



# Models

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt


# Display names for the signal classes, in the order scikit-learn sorts the
# class labels (rf.classes_).
# NOTE(review): assumes the labels sort as sell < hold < buy (e.g. -1, 0, 1) — confirm.
class_names = ['Sell', 'Hold', 'Buy']

rf = RandomForestClassifier(
    n_estimators=500,
    random_state=42  # fixed seed so the fitted forest is reproducible
)

rf.fit(X_train_scaled, y_train)

# predict_proba returns one column per class seen during fit; fail with a clear
# message instead of an opaque shape error if a class is missing from y_train.
if len(rf.classes_) != len(class_names):
    raise ValueError(
        f"Expected {len(class_names)} classes in y_train, found {list(rf.classes_)}"
    )

y_pred = rf.predict(X_test_scaled)
y_pred_proba = rf.predict_proba(X_test_scaled)

# Class probabilities for every test row, indexed like X_test so they can be
# joined back to prices for the trading simulation.
decisions = pd.DataFrame(y_pred_proba, columns=class_names, index=X_test.index)

# Pin the label order so the matrix always has one row/column per class and
# lines up with the tick labels, even if a class never appears in y_test or y_pred.
conf_matrix = confusion_matrix(y_test, y_pred, labels=rf.classes_)

# plot the confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# One importance score per selected feature, in the column order of X_train.
importances = rf.feature_importances_

# Horizontal bar chart of the scores, drawn through the explicit Axes interface.
importance_fig, importance_ax = plt.subplots(figsize=(10, 7))
importance_ax.barh(X_train.columns, importances)
importance_ax.set_xlabel('Feature Importance')
importance_ax.set_title('Random Forest Feature Importances')
plt.show()

# Simulate Trade

In [None]:
# Attach the closing price and risk-free rate to each test row's predicted
# class probabilities; the left join keeps exactly the rows of `decisions`.
btc_data = df.loc[:, ['Close', 'Risk Free Rate']]
trade_data = decisions.join(btc_data, how='left')
trade_data.head()



In [None]:
# Grouped histogram of the predicted probability for each class.
proba_hist = px.histogram(
    trade_data[['Sell', 'Hold', 'Buy']],
    barmode='group',
    title='Probabilities of Predictions',
)
proba_hist.show()

In [None]:
from trader import TradingSimulator

# Simulate trading on the test rows using the model's class probabilities,
# starting from a capital of 100.
# NOTE(review): transaction_cost=0.005 is presumably a proportional cost per
# trade, and 'highest_prob' presumably acts on the most probable class —
# confirm both in trader.py.
simulator = TradingSimulator(trade_data, initial_capital=100, transaction_cost=0.005)
simulator.simulate(decision_method='highest_prob')
# Plots produced by the simulator from the completed run.
simulator.plot_portfolio_performance()
simulator.plot_performance_metrics()
