In [None]:
import pandas as pd
import numpy as np

def add_volatility_and_volume_std(df, window=30):
    """
    Compute rolling log-return volatility and rolling volume dispersion.

    Lower-case OHLCV column names are renamed to their capitalised form,
    rows are ordered chronologically, and three columns are appended:
    'log_return', 'volatility' and 'volume_std'.

    Parameters:
        df (pd.DataFrame): Frame holding a timestamp, a close price and a
            volume column (either 'timestamp'/'close'/'volume' or the
            capitalised equivalents).
        window (int): Number of rows in each rolling window.

    Returns:
        pd.DataFrame: A renamed, time-sorted frame with a fresh 0..n-1 index
            and the added 'log_return', 'volatility' and 'volume_std' columns.
            The work is done on the frame returned by `rename`, not on `df`.
    """
    capitalised = {
        'timestamp': 'Timestamp',
        'open': 'Open',
        'high': 'High',
        'low': 'Low',
        'close': 'Close',
        'volume': 'Volume'
    }
    frame = df.rename(columns=capitalised)

    # Make sure the time column is datetime-typed, then put rows in time order.
    frame['Timestamp'] = pd.to_datetime(frame['Timestamp'])
    frame = frame.sort_values('Timestamp').reset_index(drop=True)

    # Log return of each close relative to the previous row's close;
    # the first row has no predecessor and is therefore NaN.
    close = frame['Close']
    frame['log_return'] = np.log(close / close.shift(1))

    # Rolling standard deviations over `window` rows.
    returns = frame['log_return']
    frame['volatility'] = returns.rolling(window=window).std()
    frame['volume_std'] = frame['Volume'].rolling(window=window).std()

    return frame


In [None]:
def add_time_to_pump_label(df):
    """
    Attach a 'time_to_pump' column: seconds from each row's 'Timestamp'
    until that row's 'pump_time' (positive while the pump is still ahead).

    The frame is modified in place: 'Timestamp' and 'pump_time' are
    converted to datetimes and 'time_to_pump' is written onto `df` itself.
    The same object is returned for convenience.

    Assumes one pump_time per symbol, or uniform pump_time across the dataset.
    """
    # Both time columns must be datetime-typed before they can be subtracted.
    for time_column in ('Timestamp', 'pump_time'):
        df[time_column] = pd.to_datetime(df[time_column])

    # Timedelta -> float seconds.
    remaining = df['pump_time'] - df['Timestamp']
    df['time_to_pump'] = remaining.dt.total_seconds()

    return df


In [None]:
import pandas as pd
import numpy as np

def engineer_lstm_features(df, window=30, momentum_lag=5):
    """
    Adds engineered features for LSTM modeling, including volatility,
    volume_std, momentum, return acceleration, and normalized metrics,
    plus the 'time_to_pump' regression label.

    Parameters:
        df (pd.DataFrame): Frame with timestamp, close, volume and
            'pump_time' columns. Lower-case OHLCV names are renamed to
            their capitalised form.
        window (int): Number of rows in every rolling statistic.
        momentum_lag (int): Number of rows to look back when computing
            'momentum' (Close minus Close `momentum_lag` rows earlier).
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        pd.DataFrame: Time-sorted frame with the added columns
            'log_return', 'volatility', 'volume_std', 'momentum',
            'volume_ratio', 'return_diff', 'norm_close', 'norm_volume'
            and 'time_to_pump'. Rows containing a NaN in ANY column
            (engineered or pre-existing) are dropped, so the warm-up rows
            of the rolling/shifted features are removed; the index is not
            renumbered after that drop.
    """
    # Standardize column names (returns a new frame to work on)
    df = df.rename(columns={
        'timestamp': 'Timestamp',
        'open': 'Open',
        'high': 'High',
        'low': 'Low',
        'close': 'Close',
        'volume': 'Volume'
    })

    # Parse time columns and order rows chronologically
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['pump_time'] = pd.to_datetime(df['pump_time'])
    df = df.sort_values('Timestamp').reset_index(drop=True)

    close = df['Close']
    volume = df['Volume']

    # Log return relative to the previous row
    df['log_return'] = np.log(close / close.shift(1))

    # Rolling stats
    df['volatility'] = df['log_return'].rolling(window).std()
    df['volume_std'] = volume.rolling(window).std()

    # Momentum (price change over `momentum_lag` steps)
    df['momentum'] = close - close.shift(momentum_lag)

    # Volume relative to its rolling mean. The small constant avoids a
    # division by zero when the whole window has zero volume. The same
    # series feeds both 'volume_ratio' and 'norm_volume', so it is
    # computed only once.
    volume_vs_mean = volume / (volume.rolling(window).mean() + 1e-6)
    df['volume_ratio'] = volume_vs_mean

    # Return acceleration (first difference of the log returns)
    df['return_diff'] = df['log_return'] - df['log_return'].shift(1)

    # Normalized price and volume
    df['norm_close'] = close / (close.rolling(window).mean() + 1e-6)
    df['norm_volume'] = volume_vs_mean

    # Label: time difference to pump (in seconds)
    df['time_to_pump'] = (df['pump_time'] - df['Timestamp']).dt.total_seconds()

    return df.dropna()


In [None]:
import numpy as np

def create_lstm_sequences(df, feature_columns, label_column='time_to_pump', sequence_length=30):
    """
    Converts DataFrame into sequences of features for LSTM input and time-to-pump targets.

    Sample i is built from rows i .. i+sequence_length-1 (by position), and
    its target is the label of the row immediately after that window
    (position i+sequence_length).

    Parameters:
        df (pd.DataFrame): DataFrame with features and label
        feature_columns (list): Columns to use as LSTM input
        label_column (str): Column to use as target (e.g., 'time_to_pump')
        sequence_length (int): Number of time steps in each input sequence

    Returns:
        X (np.ndarray): shape (num_samples, sequence_length, num_features)
        y (np.ndarray): shape (num_samples,)

        num_samples is max(len(df) - sequence_length, 0). When the frame is
        too short to yield any sample, X is still 3-D with shape
        (0, sequence_length, num_features), so callers can safely read
        X.shape[2].

    Raises:
        ValueError: if sequence_length is negative.
    """
    if sequence_length < 0:
        raise ValueError("sequence_length must be non-negative")

    # Convert to plain arrays once instead of slicing the DataFrame on
    # every iteration.
    features = df[feature_columns].values
    labels = df[label_column].values

    num_samples = max(len(df) - sequence_length, 0)

    # window_index[i, t] == i + t: row position of time step t in sample i.
    # Indexing with this grid gathers every window in one shot and keeps
    # the trailing dimensions even when num_samples is 0.
    window_index = np.arange(num_samples)[:, None] + np.arange(sequence_length)[None, :]
    X = features[window_index]

    # Target of sample i is the label at position i + sequence_length.
    y = labels[sequence_length:].copy()

    return X, y


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

class LSTMRegressor(nn.Module):
    """
    LSTM followed by a single linear unit, producing one scalar per sequence.

    Parameters:
        input_size (int): Number of features per time step.
        hidden_size (int): Width of the LSTM hidden state.
        num_layers (int): Number of stacked LSTM layers.
        dropout (float): Dropout between stacked LSTM layers; only applied
            when num_layers > 1 (it is forced to 0 for a single layer).
    """

    def __init__(self, input_size, hidden_size=32, num_layers=1, dropout=0.2):
        super().__init__()
        self.lstm = nn.LSTM(input_size=input_size,
                            hidden_size=hidden_size,
                            num_layers=num_layers,
                            batch_first=True,
                            dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """
        Parameters:
            x (torch.Tensor): Batch-first input, (batch, seq_len, input_size).

        Returns:
            torch.Tensor: Predictions of shape (batch,).
        """
        # hn stacks the final hidden state of every layer; hn[-1] is the
        # last layer's state.
        _, (hn, _) = self.lstm(x)
        out = self.fc(hn[-1])
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also remove the batch dimension when batch == 1 (e.g. the
        # last partial batch), yielding a 0-d tensor that no longer lines
        # up with a (1,)-shaped target.
        return out.squeeze(-1)


In [None]:
def train_lstm(model, train_loader, test_loader, epochs=10, lr=1e-3):
    """
    Train `model` with Adam on an MSE objective, evaluating on `test_loader`
    after every epoch and printing both losses.

    Parameters:
        model (nn.Module): Model mapping a feature batch to predictions
            comparable with the target batch.
        train_loader (iterable): Yields (features, targets) tensor batches
            used for optimisation.
        test_loader (iterable): Yields (features, targets) tensor batches
            used for evaluation only.
        epochs (int): Number of passes over `train_loader`.
        lr (float): Adam learning rate.

    Returns:
        nn.Module: The trained model, moved to CUDA when available,
            otherwise left on the CPU.

    The printed losses are averaged per target element rather than per
    batch, so a short final batch is not over-weighted. If a loader yields
    no batches its loss is reported as nan instead of raising
    ZeroDivisionError.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    for epoch in range(epochs):
        model.train()
        train_loss_sum, train_count = 0.0, 0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; scale it back to a sum so
            # the epoch figure is a true mean over all target elements.
            train_loss_sum += loss.item() * yb.numel()
            train_count += yb.numel()
        avg_loss = train_loss_sum / train_count if train_count else float('nan')

        # Evaluate on the test set each epoch (no gradient tracking)
        model.eval()
        test_loss_sum, test_count = 0.0, 0
        with torch.no_grad():
            for xb, yb in test_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = criterion(preds, yb)
                test_loss_sum += loss.item() * yb.numel()
                test_count += yb.numel()
        avg_test_loss = test_loss_sum / test_count if test_count else float('nan')

        print(f"Epoch {epoch+1}/{epochs} - Train Loss: {avg_loss:.4f} - Test Loss: {avg_test_loss:.4f}")

    return model


In [None]:
# Mini-batch size shared by the training and evaluation loaders.
batch_size = 64

# Window each split into fixed-length sequences with their targets.
# NOTE(review): train_df, test_df and feature_cols are not defined in the
# cells shown here; presumably an earlier cell provides them. Confirm
# before relying on Restart & Run All.
X_train, y_train = create_lstm_sequences(train_df, feature_cols)
X_test, y_test = create_lstm_sequences(test_df, feature_cols)

# Wrap the arrays as float32 tensors paired sample-by-sample.
train_ds = TensorDataset(
    torch.tensor(X_train, dtype=torch.float32),
    torch.tensor(y_train, dtype=torch.float32),
)
test_ds = TensorDataset(
    torch.tensor(X_test, dtype=torch.float32),
    torch.tensor(y_test, dtype=torch.float32),
)

# Only the training batches are shuffled; evaluation keeps dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size)

# The last axis of X_train holds the per-timestep features.
input_size = X_train.shape[2]
model = LSTMRegressor(input_size=input_size)

trained_model = train_lstm(
    model,
    train_loader,
    test_loader,
    epochs=10,
    lr=0.001,
)