In [71]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input
import tensorflow as tf

In [63]:
# Load data
arb2_features = pd.read_parquet('../data/on_chain_data/processed/arb2_chain_processed.parquet')
arb1_prices   = pd.read_parquet('../data/price_data/processed/arb_processed_data.parquet')

# Convert price timestamp to datetime and set as index
arb1_prices['time_period_start'] = pd.to_datetime(arb1_prices['time_period_start'])
arb1_prices.set_index('time_period_start', inplace=True)

# Convert indexes to tz-naive (if necessary)
arb2_features.index = arb2_features.index.tz_convert(None)
arb1_prices.index   = arb1_prices.index.tz_convert(None)

# Join features and prices (inner join on the index)
data = arb2_features.join(arb1_prices[['price_close']], how='inner')


In [64]:
data['price_diff'] = data['price_close'].diff()
threshold = data['price_diff'].std()

# Define signal generation function
def get_signal(diff, threshold):
    if diff > threshold:
        return 'Buy'
    elif diff < -threshold:
        return 'Sell'
    else:
        return 'Hold'

data['Signal'] = data['price_diff'].apply(lambda x: get_signal(x, threshold))
data = data.dropna()

In [65]:
features = data[['average_gas_limit', 'average_gas_used', 'average_size']].values
labels   = data['Signal'].values

# Encode labels and convert to one-hot
label_encoder   = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels)
y               = to_categorical(integer_encoded)

# Scale features
scaler = StandardScaler()
X      = scaler.fit_transform(features)

In [66]:
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Further split train into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

# Build the standard MLP model
model = Sequential()
model.add(Dense(64, input_dim=X.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

# Train the model using the explicit validation set
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32
)

# Evaluate on test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Standard MLP - Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

# Display class distribution
unique_labels, label_counts = np.unique(labels, return_counts=True)
for lbl, cnt in zip(unique_labels, label_counts):
    print(f"Label: {lbl}, Count: {cnt}, Percentage: {cnt/len(labels)*100:.2f}%")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.7729 - loss: 0.7123 - val_accuracy: 0.8417 - val_loss: 0.5197
Epoch 2/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8415 - loss: 0.5163 - val_accuracy: 0.8417 - val_loss: 0.5143
Epoch 3/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8400 - loss: 0.5124 - val_accuracy: 0.8417 - val_loss: 0.5145
Epoch 4/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8346 - loss: 0.5194 - val_accuracy: 0.8417 - val_loss: 0.5141
Epoch 5/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8328 - loss: 0.5275 - val_accuracy: 0.8417 - val_loss: 0.5134
Epoch 6/50
[1m338/338[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8383 - loss: 0.5183 - val_accuracy: 0.8417 - val_loss: 0.5159
Epoch 7/50
[1m338/338[0m 

In [67]:
signal_to_position = {'Buy': 1, 'Hold': 0, 'Sell': -1}
data['position'] = data['Signal'].map(signal_to_position)

# Compute asset returns from price_close (assumed daily returns)
data['asset_return'] = data['price_close'].pct_change()

# Compute strategy returns:
# The position from the previous period is applied to the current period's asset return
data['strategy_return'] = data['position'].shift(1) * data['asset_return']
data = data.dropna()

# Calculate Sharpe ratio (annualized, assuming 252 trading days)
sharpe_ratio = (data['strategy_return'].mean() / data['strategy_return'].std()) * np.sqrt(252)
print("Sharpe Ratio:", sharpe_ratio)

# Compute cumulative returns and total PnL
data['cumulative_return'] = (1 + data['strategy_return']).cumprod() - 1
total_pnl = data['cumulative_return'].iloc[-1]
print("Total PnL (as %):", total_pnl)
initial_capital = 100000
absolute_pnl = initial_capital * total_pnl
print("Absolute PnL:", absolute_pnl)

Sharpe Ratio: -0.014515260178131425
Total PnL (as %): -0.3327097690158085
Absolute PnL: -33270.97690158085


In [72]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


model_custom = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

optimizer_custom = tf.keras.optimizers.Adam()

# Define custom training step:
# We assume the mapping: [Buy, Hold, Sell] -> [1, 0, -1]
@tf.function
def train_step(x_batch, asset_batch):
    with tf.GradientTape() as tape:
        # Forward pass: get predicted probabilities for each class
        y_pred = model_custom(x_batch, training=True)  # shape: (batch_size, 3)
        # Compute expected (soft) positions: positions = P(Buy) - P(Sell)
        positions = y_pred[:, 0] - y_pred[:, 2]
        # Compute strategy returns for each sample in the batch
        strat_returns = positions * asset_batch
        # Compute mean and std of strategy returns in the batch
        mean_ret = tf.reduce_mean(strat_returns)
        std_ret  = tf.math.reduce_std(strat_returns)
        # Compute Sharpe ratio (avoid division by zero)
        sharpe = mean_ret / (std_ret + 1e-6)
        # Our loss is negative Sharpe ratio (we want to maximize Sharpe)
        loss_value = -sharpe
    grads = tape.gradient(loss_value, model_custom.trainable_variables)
    optimizer_custom.apply_gradients(zip(grads, model_custom.trainable_variables))
    return loss_value, sharpe

# For the custom loop, we need asset returns aligned with X_train.
# Here, we assume that the first len(X_train) entries of data['asset_return'] correspond to X_train.
# In practice, ensure that asset_returns_train is correctly aligned with X_train.
asset_returns_train = data['asset_return'].iloc[:len(X_train)].values.astype(np.float32)

# Prepare a tf.data.Dataset for custom training
batch_size_custom = 32
dataset = tf.data.Dataset.from_tensor_slices((X_train, asset_returns_train))
dataset = dataset.shuffle(buffer_size=1024, seed=42).batch(batch_size_custom)

# Custom training loop
epochs_custom = 50
for epoch in range(epochs_custom):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_sharpe_avg = tf.keras.metrics.Mean()
    
    for x_batch, asset_batch in dataset:
        loss_value, batch_sharpe = train_step(x_batch, asset_batch)
        epoch_loss_avg.update_state(loss_value)
        epoch_sharpe_avg.update_state(batch_sharpe)
    
    print(f"Custom Epoch {epoch+1:02d}: Loss = {epoch_loss_avg.result().numpy():.4f}, "
          f"Sharpe = {epoch_sharpe_avg.result().numpy():.4f}")

Custom Epoch 01: Loss = 0.0026, Sharpe = -0.0026
Custom Epoch 02: Loss = -0.0128, Sharpe = 0.0128
Custom Epoch 03: Loss = -0.0008, Sharpe = 0.0008
Custom Epoch 04: Loss = -0.0045, Sharpe = 0.0045
Custom Epoch 05: Loss = -0.0027, Sharpe = 0.0027
Custom Epoch 06: Loss = -0.0052, Sharpe = 0.0052
Custom Epoch 07: Loss = -0.0129, Sharpe = 0.0129
Custom Epoch 08: Loss = -0.0066, Sharpe = 0.0066
Custom Epoch 09: Loss = -0.0093, Sharpe = 0.0093
Custom Epoch 10: Loss = -0.0141, Sharpe = 0.0141
Custom Epoch 11: Loss = -0.0093, Sharpe = 0.0093
Custom Epoch 12: Loss = -0.0071, Sharpe = 0.0071
Custom Epoch 13: Loss = -0.0131, Sharpe = 0.0131
Custom Epoch 14: Loss = -0.0098, Sharpe = 0.0098
Custom Epoch 15: Loss = -0.0103, Sharpe = 0.0103
Custom Epoch 16: Loss = -0.0115, Sharpe = 0.0115
Custom Epoch 17: Loss = -0.0100, Sharpe = 0.0100
Custom Epoch 18: Loss = -0.0062, Sharpe = 0.0062
Custom Epoch 19: Loss = -0.0088, Sharpe = 0.0088
Custom Epoch 20: Loss = -0.0102, Sharpe = 0.0102
Custom Epoch 21: Los

In [74]:
import numpy as np

# --- Step 1: Predict on the Test Set ---
# The custom model outputs probabilities for each class: [Buy, Hold, Sell]
y_pred_test = model_custom.predict(X_test)  # Shape: (num_samples, 3)

# --- Step 2: Compute "Soft" Positions ---
# Here, we compute positions as: positions = P(Buy) - P(Sell)
positions_test = y_pred_test[:, 0] - y_pred_test[:, 2]

# --- Step 3: Align Asset Returns for Test Set ---
# In this example, we assume the last len(X_test) rows of data['asset_return']
# correspond to the test set. Adjust this alignment as needed.
asset_returns_test = data['asset_return'].iloc[-len(X_test):].values.astype(np.float32)

# --- Step 4: Compute Strategy Returns ---
# We assume that the signal (position) generated in one period is applied to
# the next period's asset return.
strategy_returns_test = np.roll(positions_test, shift=1) * asset_returns_test
# For the first sample, where there is no previous signal, set strategy return to zero.
strategy_returns_test[0] = 0

# --- Step 5: Compute Cumulative Returns (PnL) ---
# Assuming an initial capital of 1 (or 100%), we compute the cumulative product
# to simulate the capital growth over time.
cumulative_returns_test = (1 + strategy_returns_test).cumprod() - 1

# The final value represents the total return (PnL) over the test period
total_pnl_percentage = cumulative_returns_test[-1]
print("Total PnL (percentage):", total_pnl_percentage)

# --- Step 6: (Optional) Compute Absolute PnL ---
# For example, if you started with $100,000:
initial_capital = 100000
absolute_pnl = total_pnl_percentage * initial_capital
print("Absolute PnL:", absolute_pnl)


[1m106/106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Total PnL (percentage): 0.6175524
Absolute PnL: 61755.239963531494


In [82]:
aval_data = pd.read_parquet('../data/price_data/processed/aval_processed_data.parquet')
feat_avax2 = pd.read_parquet('../data/on_chain_data/processed/avax2_chain_processed.parquet')

In [85]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential

# -------------------------------
# 1. Load and Process Data
# -------------------------------
# Load price data and on-chain features
aval_data = pd.read_parquet('../data/price_data/processed/aval_processed_data.parquet')
feat_avax2 = pd.read_parquet('../data/on_chain_data/processed/avax2_chain_processed.parquet')

# Convert the price data time column to datetime and set as index
aval_data['time_open'] = pd.to_datetime(aval_data['time_open'])
aval_data.set_index('time_open', inplace=True)
aval_data.index = aval_data.index.tz_localize(None)



if feat_avax2.index.tz is not None:
    feat_avax2.index = feat_avax2.index.tz_localize(None)
# If needed, ensure that feat_avax2's index is a datetime index.
# For example, if feat_avax2 has a time column, convert and set it as index.
# Here, we assume feat_avax2 is already indexed by time.

# -------------------------------
# 2. Join DataFrames on Timestamp
# -------------------------------
# Perform an inner join so that only overlapping timestamps are kept.
data_aval = feat_avax2.join(aval_data[['price_close']], how='inner')

# -------------------------------
# 3. Compute Asset Returns
# -------------------------------
# Calculate asset returns based on the closing price.
data_aval['asset_return'] = data_aval['price_close'].pct_change()
data_aval = data_aval.dropna()  # Drop initial NaN from pct_change

# -------------------------------
# 4. Extract & Scale Features
# -------------------------------
# Extract the same feature columns used during training.
features_aval = data_aval[['average_gas_limit', 'average_gas_used', 'average_size']].values

# Transform features using the previously fitted scaler (assumed to be available as 'scaler')
X_aval = scaler.transform(features_aval)

# Extract asset returns and ensure they are float32
asset_returns_aval = data_aval['asset_return'].values.astype(np.float32)

# -------------------------------
# 5. Generate Predictions & Compute Positions
# -------------------------------
# Use your custom model (model_custom) to predict probabilities.
# The model outputs probabilities for [Buy, Hold, Sell]
y_pred_aval = model_custom.predict(X_aval)

# Compute the "soft" positions: position = P(Buy) - P(Sell)
positions_aval = y_pred_aval[:, 0] - y_pred_aval[:, 2]

# -------------------------------
# 6. Compute Strategy Returns & PnL
# -------------------------------
# Assume that the signal (position) produced in one period is used for the next period's return.
strategy_returns_aval = np.roll(positions_aval, shift=1) * asset_returns_aval
strategy_returns_aval[0] = 0  # For the first period, set return to zero

# Compute cumulative returns (compound growth of capital)
cumulative_returns_aval = (1 + strategy_returns_aval).cumprod() - 1
total_pnl_percentage = cumulative_returns_aval[-1]

# Compute the annualized Sharpe ratio (assuming daily data with 252 trading days)
sharpe_ratio_aval = (np.mean(strategy_returns_aval) / np.std(strategy_returns_aval)) * np.sqrt(252)

# Compute absolute PnL given an initial capital (e.g., $100,000)
initial_capital = 100000
absolute_pnl_aval = total_pnl_percentage * initial_capital

# -------------------------------
# 7. Print Final Results
# -------------------------------
print("Final Sharpe Ratio on aval_processed_data:", sharpe_ratio_aval)
print("Final Absolute PnL on aval_processed_data:", absolute_pnl_aval)


[1m952/952[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 873us/step
Final Sharpe Ratio on aval_processed_data: -0.06371625056558915
Final Absolute PnL on aval_processed_data: -97129.08267974854


In [86]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.utils import to_categorical

# -------------------------------
# 1. Load and Process Data
# -------------------------------
# Load on-chain features and price data for ARB coin
arb2_features = pd.read_parquet('../data/on_chain_data/processed/arb2_chain_processed.parquet')
arb1_prices   = pd.read_parquet('../data/price_data/processed/arb_processed_data.parquet')

# Process price data: convert time, set index, and ensure tz-naive
arb1_prices['time_period_start'] = pd.to_datetime(arb1_prices['time_period_start'])
arb1_prices.set_index('time_period_start', inplace=True)
arb1_prices.index = arb1_prices.index.tz_localize(None)

# Ensure arb2_features' index is tz-naive (if it is a DatetimeIndex)
if arb2_features.index.tz is not None:
    arb2_features.index = arb2_features.index.tz_localize(None)

# Join on-chain features with price data on their index (inner join)
data = arb2_features.join(arb1_prices[['price_close']], how='inner')

# -------------------------------
# 2. Generate Trading Signals
# -------------------------------
# Compute price differences and set a threshold based on volatility
data['price_diff'] = data['price_close'].diff()
threshold = data['price_diff'].std()

def get_signal(diff, threshold):
    if diff > threshold:
        return 'Buy'
    elif diff < -threshold:
        return 'Sell'
    else:
        return 'Hold'

# Generate a signal column and drop NaNs (first row from diff)
data['Signal'] = data['price_diff'].apply(lambda x: get_signal(x, threshold))
data = data.dropna()

# Ensure the DataFrame is sorted by date
data = data.sort_index()

# -------------------------------
# 3. Split Data by Time (First Half for Training)
# -------------------------------
# Use only the first half of dates for training.
midpoint = data.index[int(len(data) / 2)]
train_data = data[data.index <= midpoint]
# Optionally, you could set aside the remaining data as test_data:
# test_data = data[data.index > midpoint]

# -------------------------------
# 4. Prepare Features and Labels for Training
# -------------------------------
features_train = train_data[['average_gas_limit', 'average_gas_used', 'average_size']].values
labels_train = train_data['Signal'].values

# Encode string labels into integers then one-hot encode them
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(labels_train)
y_train = to_categorical(integer_encoded)

# Fit a scaler on training features and transform them
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)

# -------------------------------
# 5. Build and Train the MLP Model
# -------------------------------
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.7789 - loss: 0.7365 - val_accuracy: 0.5469 - val_loss: 1.0997
Epoch 2/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8567 - loss: 0.4868 - val_accuracy: 0.5469 - val_loss: 1.1567
Epoch 3/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8561 - loss: 0.4860 - val_accuracy: 0.5469 - val_loss: 1.1423
Epoch 4/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8533 - loss: 0.4884 - val_accuracy: 0.5469 - val_loss: 1.1582
Epoch 5/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8525 - loss: 0.4904 - val_accuracy: 0.5469 - val_loss: 1.1491
Epoch 6/50
[1m211/211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8521 - loss: 0.4904 - val_accuracy: 0.5469 - val_loss: 1.1872
Epoch 7/50
[1m211/211[0m 

In [87]:
import numpy as np
import pandas as pd

# -------------------------------
# 1. Split Data: Latter Half as Test Set
# -------------------------------
# Assuming 'data' is your full DataFrame (joined, with signals computed and sorted by time)
# and that you already split it into training using the first half of dates.
midpoint = data.index[int(len(data) / 2)]
train_data = data[data.index <= midpoint]
test_data  = data[data.index > midpoint]

# -------------------------------
# 2. Prepare Test Features & Labels for Classification Evaluation
# -------------------------------
features_test = test_data[['average_gas_limit', 'average_gas_used', 'average_size']].values
labels_test   = test_data['Signal'].values

# Transform features using the scaler fitted on training data
X_test = scaler.transform(features_test)

# Encode labels using the same label encoder used during training
integer_encoded_test = label_encoder.transform(labels_test)
y_test = to_categorical(integer_encoded_test)

# Evaluate the model on test data (classification performance)
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Loss: {:.4f}, Test Accuracy: {:.4f}".format(loss, accuracy))

# -------------------------------
# 3. Compute Trading Performance (PnL & Sharpe Ratio)
# -------------------------------
# For trading evaluation, we use the model's predictions to get soft positions.
# The custom strategy maps the output probabilities to a "soft" position as:
#    position = P(Buy) - P(Sell)
y_pred_test = model.predict(X_test)
positions_test = y_pred_test[:, 0] - y_pred_test[:, 2]

# Make sure asset returns are computed. We'll compute them here from price_close.
# (If already computed in test_data as 'asset_return', you can use that instead.)
test_data['asset_return'] = test_data['price_close'].pct_change()
test_data = test_data.dropna()  # Drop first row where pct_change is NaN

# Align predictions and asset returns.
# Note: After dropna, the length of test_data may be one less than X_test.
# We'll recompute predictions on the aligned set:
features_test_aligned = test_data[['average_gas_limit', 'average_gas_used', 'average_size']].values
X_test_aligned = scaler.transform(features_test_aligned)
y_pred_test_aligned = model.predict(X_test_aligned)
positions_test_aligned = y_pred_test_aligned[:, 0] - y_pred_test_aligned[:, 2]

# Get the asset returns as a NumPy array.
asset_returns_test = test_data['asset_return'].values.astype(np.float32)

# Assume the position signal generated in one period is applied to the next period's return.
# Shift the positions by one period.
strategy_returns_test = np.roll(positions_test_aligned, shift=1) * asset_returns_test
strategy_returns_test[0] = 0  # For the first period, there's no prior signal

# Compute cumulative returns (i.e., compound growth of capital)
cumulative_returns_test = (1 + strategy_returns_test).cumprod() - 1
total_pnl_percentage = cumulative_returns_test[-1]

# Compute the annualized Sharpe ratio (assuming daily data with 252 trading days)
sharpe_ratio_test = (np.mean(strategy_returns_test) / np.std(strategy_returns_test)) * np.sqrt(252)

# Compute absolute PnL given an initial capital (e.g., $100,000)
initial_capital = 100000
absolute_pnl_test = total_pnl_percentage * initial_capital

print("\nTrading Performance on Test Data (Latter Half):")
print("Total PnL (percentage): {:.2%}".format(total_pnl_percentage))
print("Absolute PnL: ${:,.2f}".format(absolute_pnl_test))
print("Annualized Sharpe Ratio: {:.4f}".format(sharpe_ratio_test))


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8128 - loss: 0.7064
Test Loss: 0.5532, Test Accuracy: 0.8810
[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 994us/step
[1m140/264[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 766us/step

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['asset_return'] = test_data['price_close'].pct_change()


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 818us/step

Trading Performance on Test Data (Latter Half):
Total PnL (percentage): -1.85%
Absolute PnL: $-1,849.72
Annualized Sharpe Ratio: -0.0656


In [92]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler

#############################################
# 1. Load and Process Data
#############################################

# Load on-chain features and price data for ARB coin
arb2_features = pd.read_parquet('../data/on_chain_data/processed/arb2_chain_processed.parquet')
arb1_prices   = pd.read_parquet('../data/price_data/processed/arb_processed_data.parquet')

# Process price data: convert time, set index, and ensure tz-naive
arb1_prices['time_period_start'] = pd.to_datetime(arb1_prices['time_period_start'])
arb1_prices.set_index('time_period_start', inplace=True)
arb1_prices.index = arb1_prices.index.tz_localize(None)

# Ensure arb2_features' index is tz-naive (if it is a DatetimeIndex)
if arb2_features.index.tz is not None:
    arb2_features.index = arb2_features.index.tz_localize(None)

# Join the on-chain features with price data (inner join on timestamp)
data = arb2_features.join(arb1_prices[['price_close']], how='inner')

# Compute asset returns (percentage change in closing price)
data['asset_return'] = data['price_close'].pct_change()
data = data.dropna()
data = data.sort_index()

#############################################
# 2. Split Data by Time (First Half for Training)
#############################################

# Use the first half of the dates for training
midpoint = data.index[int(len(data) / 2)]
train_data = data[data.index <= midpoint]

#############################################
# 3. Prepare Features and PnL Objective Data
#############################################

# Extract training features and asset returns
features_train = train_data[['average_gas_limit', 'average_gas_used', 'average_size','average_difficulty','average_total_difficulty']].values
asset_returns_train = train_data['asset_return'].values.astype(np.float32)

# Scale features using StandardScaler (fit only on training data)
scaler = StandardScaler()
X_train = scaler.fit_transform(features_train)

#############################################
# 4. Build the MLP Model with PnL Maximization Objective
#############################################

# Our model outputs probabilities for 3 classes: [Buy, Hold, Sell]
# We will interpret the output as a "soft" signal: position = P(Buy) - P(Sell)
model_pnl = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')
])

# Use a relatively low learning rate for stability
optimizer_pnl = Adam(learning_rate=1e-4)

# Define a custom training step that maximizes average strategy return (PnL)
# For each batch, we compute:
#   positions = P(Buy) - P(Sell)
#   strategy_return = positions * asset_return
# Our loss is negative average strategy return.
@tf.function
def train_step(x_batch, asset_batch):
    with tf.GradientTape() as tape:
        y_pred = model_pnl(x_batch, training=True)  # shape: (batch_size, 3)
        positions = y_pred[:, 0] - y_pred[:, 2]
        strat_returns = positions * asset_batch
        # Use the mean strategy return as a proxy for PnL
        pnl = tf.reduce_mean(strat_returns)
        loss_value = -pnl  # We want to maximize pnl, so minimize its negative.
    grads = tape.gradient(loss_value, model_pnl.trainable_variables)
    optimizer_pnl.apply_gradients(zip(grads, model_pnl.trainable_variables))
    return loss_value, pnl

#############################################
# 5. Create tf.data.Dataset and Train the Model
#############################################

batch_size = 32
dataset = tf.data.Dataset.from_tensor_slices((X_train, asset_returns_train))
dataset = dataset.shuffle(buffer_size=1024, seed=42).batch(batch_size)

epochs = 50
for epoch in range(epochs):
    epoch_loss_avg = tf.keras.metrics.Mean()
    epoch_pnl_avg = tf.keras.metrics.Mean()
    
    for x_batch, asset_batch in dataset:
        loss_val, pnl_val = train_step(x_batch, asset_batch)
        epoch_loss_avg.update_state(loss_val)
        epoch_pnl_avg.update_state(pnl_val)
    
    print(f"Epoch {epoch+1:02d}: Loss = {epoch_loss_avg.result().numpy():.6f}, Average PnL = {epoch_pnl_avg.result().numpy():.6f}")

#############################################
# 6. Evaluate Training Performance (PnL)
#############################################

# Use the trained model to get predictions on the training set
y_pred_train = model_pnl.predict(X_train)
# Compute soft positions: position = P(Buy) - P(Sell)
positions_train = y_pred_train[:, 0] - y_pred_train[:, 2]

# Compute strategy returns on the training set by assuming that
# the position from the previous period is applied to the current period's asset return.
strategy_returns_train = np.roll(positions_train, shift=1) * asset_returns_train
strategy_returns_train[0] = 0  # No previous signal for the first sample

# Compute cumulative returns (compound growth of capital)
cumulative_returns_train = (1 + strategy_returns_train).cumprod() - 1
total_pnl_percentage_train = cumulative_returns_train[-1]

# Print final training performance (PnL)
print("\nTraining Performance (First Half):")
print("Total PnL (percentage): {:.2%}".format(total_pnl_percentage_train))

initial_capital = 100000  # Example initial capital
absolute_pnl_train = total_pnl_percentage_train * initial_capital
print("Absolute PnL: ${:,.2f}".format(absolute_pnl_train))


Epoch 01: Loss = 0.000002, Average PnL = -0.000002
Epoch 02: Loss = -0.000003, Average PnL = 0.000003
Epoch 03: Loss = -0.000009, Average PnL = 0.000009
Epoch 04: Loss = -0.000013, Average PnL = 0.000013
Epoch 05: Loss = -0.000018, Average PnL = 0.000018
Epoch 06: Loss = -0.000023, Average PnL = 0.000023
Epoch 07: Loss = -0.000025, Average PnL = 0.000025
Epoch 08: Loss = -0.000029, Average PnL = 0.000029
Epoch 09: Loss = -0.000039, Average PnL = 0.000039
Epoch 10: Loss = -0.000039, Average PnL = 0.000039
Epoch 11: Loss = -0.000044, Average PnL = 0.000044
Epoch 12: Loss = -0.000053, Average PnL = 0.000053
Epoch 13: Loss = -0.000052, Average PnL = 0.000052
Epoch 14: Loss = -0.000056, Average PnL = 0.000056
Epoch 15: Loss = -0.000068, Average PnL = 0.000068
Epoch 16: Loss = -0.000064, Average PnL = 0.000064
Epoch 17: Loss = -0.000068, Average PnL = 0.000068
Epoch 18: Loss = -0.000072, Average PnL = 0.000072
Epoch 19: Loss = -0.000093, Average PnL = 0.000093
Epoch 20: Loss = -0.000093, Ave

In [93]:
import numpy as np

# -------------------------------
# 1. Split Data: Use the Second Half as Test Set
# -------------------------------
# Assuming 'data' is your full DataFrame (joined and sorted by date) and 'midpoint' was computed earlier.
test_data = data[data.index > midpoint]

# -------------------------------
# 2. Prepare Test Features & Asset Returns
# -------------------------------
# Extract test features using the same columns as training.
features_test = test_data[['average_gas_limit', 'average_gas_used', 'average_size','average_difficulty','average_total_difficulty']].values
# Scale features using the previously fitted scaler.
X_test = scaler.transform(features_test)

# Extract asset returns (ensure these were computed on the full data earlier)
asset_returns_test = test_data['asset_return'].values.astype(np.float32)

# -------------------------------
# 3. Generate Model Predictions & Compute Soft Positions
# -------------------------------
# Get the model's predictions (probabilities for [Buy, Hold, Sell])
y_pred_test = model_pnl.predict(X_test)
# Compute the soft positions as: position = P(Buy) - P(Sell)
positions_test = y_pred_test[:, 0] - y_pred_test[:, 2]

# -------------------------------
# 4. Compute Strategy Returns and PnL
# -------------------------------
# We assume that the signal from the previous period is applied to the current period's asset return.
strategy_returns_test = np.roll(positions_test, shift=1) * asset_returns_test
strategy_returns_test[0] = 0  # For the first period, set return to zero

# Compute cumulative returns: simulate compound growth
cumulative_returns_test = (1 + strategy_returns_test).cumprod() - 1
total_pnl_percentage_test = cumulative_returns_test[-1]

# Compute the annualized Sharpe ratio (assuming daily data with 252 trading days)
annualized_sharpe = (np.mean(strategy_returns_test) / np.std(strategy_returns_test)) * np.sqrt(252)

# Compute absolute PnL given an initial capital (e.g., $100,000)
initial_capital = 100000
absolute_pnl_test = total_pnl_percentage_test * initial_capital

# -------------------------------
# 5. Print Final Test Results
# -------------------------------
print("Test Performance (Second Half):")
print("Total PnL (percentage): {:.2%}".format(total_pnl_percentage_test))
print("Absolute PnL: ${:,.2f}".format(absolute_pnl_test))
print("Annualized Sharpe Ratio: {:.4f}".format(annualized_sharpe))


[1m264/264[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 933us/step
Test Performance (Second Half):
Total PnL (percentage): -76.97%
Absolute PnL: $-76,970.09
Annualized Sharpe Ratio: -0.1699
