# Ethereum Price Exploration
This starter notebook loads historical ETH prices from the local Parquet file so you can experiment with custom analysis in new cells.

In [None]:
import pandas as pd
from pathlib import Path

# Location of the local Parquet file holding the ETH price history.
# The path is relative, so it resolves against the notebook's working directory.
ETH_PRICE_FILE = Path("./data/ethereum_price.parquet")
# Load the whole file into a DataFrame; later cells read this `eth_prices` variable.
eth_prices = pd.read_parquet(ETH_PRICE_FILE)
# Bare expression: rendered as cell output only when it is the last statement of its cell.
eth_prices

# Basic shape and summary to confirm the data loaded as expected
# (include='all' asks describe() to summarise non-numeric columns as well).
eth_prices.shape, eth_prices.describe(include='all')

## Quick LSTM baseline
This cell trains a small univariate LSTM on daily ETH returns to classify the next day's return into one of three classes: `>= +2%`, `<= -2%`, or in between. \
Requires `tensorflow` and `scikit-learn` (e.g., `pip install tensorflow scikit-learn`).


In [None]:
import numpy as np
import pandas as pd

try:
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
except ImportError as exc:
    raise ImportError("Install tensorflow first: pip install tensorflow") from exc

try:
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import classification_report, confusion_matrix
except ImportError as exc:
    raise ImportError("Install scikit-learn first: pip install scikit-learn") from exc

# Derive daily returns from the price history.
# Rows are ordered by date first so each return compares a price with the previous row's price.
prices = eth_prices.sort_values(by='date').copy()
prices['return'] = prices['price'].pct_change()
# The first row has no previous price, so its return is NaN and is dropped here.
return_series = prices['return'].dropna()
returns = return_series.to_numpy(dtype='float32')

def build_sequences(arr: np.ndarray, window: int, threshold: float = 0.02):
    """Turn a 1-D return series into LSTM inputs and 3-class move labels.

    Sample ``k`` consists of the ``window`` consecutive values
    ``arr[k:k + window]``; its label classifies the value that follows them,
    ``arr[k + window]``:

    * ``1`` -- the value is ``>= threshold``
    * ``2`` -- the value is ``<= -threshold``
    * ``0`` -- anything else (NaN targets also land here, because both
      comparisons are false for NaN)

    Args:
        arr: 1-D array of returns; position order is used as time order.
        window: Number of consecutive values per input sequence (>= 1).
        threshold: Absolute move size separating classes 1/2 from class 0.
            Defaults to 0.02, i.e. a 2% move.

    Returns:
        Tuple ``(X, y)`` where ``X`` is float32 with shape
        ``(n, window, 1)`` and ``y`` is int64 with shape ``(n,)``, with
        ``n = max(len(arr) - window, 0)``.

    Raises:
        ValueError: If ``arr`` is not one-dimensional or ``window < 1``.
    """
    arr = np.asarray(arr)
    if arr.ndim != 1:
        raise ValueError(f'arr must be 1-D, got {arr.ndim} dimensions.')
    if window < 1:
        raise ValueError(f'window must be >= 1, got {window}.')

    n_samples = len(arr) - window
    if n_samples <= 0:
        # Keep the (samples, timesteps, features) rank even with no samples so
        # downstream shape checks and model calls see a consistent layout.
        return (
            np.empty((0, window, 1), dtype='float32'),
            np.empty((0,), dtype='int64'),
        )

    # Row k of the view is arr[k:k + window]. The final row has no following
    # value to use as a target, so only the first n_samples rows are kept.
    windows = np.lib.stride_tricks.sliding_window_view(arr, window)[:n_samples]
    # astype() copies, so X is a writable array independent of arr.
    X = windows.astype('float32')[..., np.newaxis]

    targets = arr[window:]
    # The first matching condition wins, mirroring an if/elif chain.
    y = np.select(
        [targets >= threshold, targets <= -threshold],
        [1, 2],
        default=0,
    ).astype('int64')
    return X, y

def apply_confidence_threshold(prob_array: np.ndarray, threshold: float) -> np.ndarray:
    """Collapse low-confidence predictions to class 0.

    For every row of ``prob_array`` the arg-max class is kept only when its
    probability is at least ``threshold``; otherwise the row is assigned
    class 0. Rows whose arg-max is already class 0 stay 0 either way.

    Args:
        prob_array: 2-D array of per-class probabilities, one row per sample.
        threshold: Minimum probability required to keep a non-zero class.

    Returns:
        1-D integer array of class ids, one per row (empty for empty input).
    """
    if prob_array.size == 0:
        return np.array([], dtype=int)

    top_class = np.argmax(prob_array, axis=1)
    top_prob = prob_array[np.arange(len(prob_array)), top_class]

    # Start from all zeros and copy over only the confident picks. A NaN
    # probability fails the comparison, so that row stays at class 0.
    gated = np.zeros_like(top_class)
    confident = top_prob >= threshold
    gated[confident] = top_class[confident]
    return gated

SEQ_LEN = 30  # number of past daily returns fed to the model per sample
CONF_THRESHOLD = 0.7  # require high confidence before predicting +/- 2%
X, y = build_sequences(returns, SEQ_LEN)
if len(X) < 10:
    raise ValueError('Not enough data to build sequences; collect more daily prices.')

RANDOM_STATE = 42  # kept for cells that need a seed; the split below is deterministic

# Chronological ~80/10/10 split (oldest -> train, then val, newest -> test).
# Consecutive samples are sliding windows that share SEQ_LEN - 1 values, so a
# shuffled split would place near-duplicates of test windows (and windows from
# later dates) in the training set and inflate the reported scores.
# shuffle=False keeps the samples in their original order.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.10, shuffle=False
)
# 0.1111 of the remaining 90% is ~10% of the full sample count.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1111, shuffle=False
)

# Minimal sequence classifier: one LSTM layer over SEQ_LEN single-feature
# timesteps, followed by a 3-way softmax (one output per move class).
network_layers = [
    layers.Input(shape=(SEQ_LEN, 1)),
    layers.LSTM(32),
    layers.Dense(3, activation='softmax'),
]
model = keras.Sequential(network_layers)
# Sparse categorical cross-entropy takes the integer class ids in y directly.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train quietly (verbose=0); per-epoch metrics remain available via `history`.
training_options = {'epochs': 20, 'batch_size': 16, 'verbose': 0}
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    **training_options,
)

# Score the held-out test set; fall back to NaN / empty arrays when it is empty
# so the summary prints further down still work.
if len(X_test):
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=0)
    test_probs = model.predict(X_test, verbose=0)
    raw_test_classes = np.argmax(test_probs, axis=1)
else:
    test_loss, test_acc = float('nan'), float('nan')
    test_probs = np.empty((0, 3))
    raw_test_classes = np.array([], dtype=int)
# Thresholded view: +/- 2% calls below CONF_THRESHOLD are collapsed to class 0.
threshold_test_classes = apply_confidence_threshold(test_probs, CONF_THRESHOLD)

if len(raw_test_classes):
    # Pin the label set so both reports always list all three classes and line
    # up with the confusion matrix, even if a class is absent from this split.
    # zero_division=0 reports 0.0 (without a warning) for classes that were
    # never predicted, which is common once the threshold is applied.
    report_labels = [0, 1, 2]
    print('--- Raw test classification report ---')
    print(classification_report(
        y_test, raw_test_classes, labels=report_labels, digits=3, zero_division=0
    ))
    print('--- Thresholded test classification report ---')
    print(classification_report(
        y_test, threshold_test_classes, labels=report_labels, digits=3, zero_division=0
    ))
    print('Confusion matrix (thresholded):')
    print(confusion_matrix(y_test, threshold_test_classes, labels=report_labels))
    confident_preds = np.sum(threshold_test_classes != 0)
    print(f'Confident +/- predictions on test set: {confident_preds} / {len(threshold_test_classes)}')

# Predict the class of the *next, still unseen* daily return.
# The last row of X covers returns[-SEQ_LEN-1:-1] and its target is returns[-1],
# which is already known. The genuine next-day input is the most recent
# SEQ_LEN returns, reshaped to (1 sample, SEQ_LEN timesteps, 1 feature).
if len(returns) >= SEQ_LEN:
    next_window = returns[-SEQ_LEN:].reshape(1, SEQ_LEN, 1)
    probs_next = model.predict(next_window, verbose=0)[0]
else:
    probs_next = np.array([np.nan] * 3)

# -1 marks "no usable prediction" (non-finite model output or too little data).
if np.all(np.isfinite(probs_next)):
    next_class_raw = int(np.argmax(probs_next))
    next_class_thresholded = int(
        apply_confidence_threshold(probs_next[np.newaxis, :], CONF_THRESHOLD)[0]
    )
else:
    next_class_raw = -1
    next_class_thresholded = -1

class_names = {0: 'between -2% and +2%', 1: '>= +2%', 2: '<= -2%'}
last_price = float(prices['price'].iloc[-1]) if len(prices) else float('nan')

print(f'Train samples: {len(X_train)}, Val samples: {len(X_val)}, Test samples: {len(X_test)}')
print(f'Test loss: {test_loss:.4f}, Test accuracy: {test_acc:.4f}')
print('Next-day raw class:', next_class_raw, '-', class_names.get(next_class_raw, 'n/a'))
print('Next-day thresholded class:', next_class_thresholded, '-', class_names.get(next_class_thresholded, 'n/a'))
print('Next-day class probabilities:', probs_next)
print(f'Last close: {last_price:.2f}')
