# Original code

In [None]:
import ccxt
from ccxt.base.errors import RequestTimeout
import pandas as pd
from datetime import datetime
from datetime import timedelta
import time


binance = ccxt.binance()


def to_timestamp(dt):
    return binance.parse8601(dt.isoformat())


def download(symbol, start, end):
    '''
    Download all the transaction for a given symbol from the start date to the end date
    @param symbol: the symbol of the coin for which download the transactions
    @param start: the start date from which download the transaction
    @param end: the end date from which download the transaction
    '''

    records = []
    since = start
    ten_minutes = 60000 * 10

    print('Downloading {} from {} to {}'.format(symbol, binance.iso8601(start), binance.iso8601(end)))

    while since < end:
        #print('since: ' + binance.iso8601(since)) #uncomment this line of code for verbose download
        try:
            orders = binance.fetch_trades(symbol + '/BTC', since)
        except RequestTimeout:
            time.sleep(5)
            orders = binance.fetch_trades(symbol + '/BTC', since)

        if len(orders) > 0:

            latest_ts = orders[-1]['timestamp']
            if since != latest_ts:
                since = latest_ts
            else:
                since += ten_minutes

            for l in orders:
                records.append({
                    'symbol': l['symbol'],
                    'timestamp': l['timestamp'],
                    'datetime': l['datetime'],
                    'side': l['side'],
                    'price': l['price'],
                    'amount': l['amount'],
                    'btc_volume': float(l['price']) * float(l['amount']),
                })
        else:
            since += ten_minutes

    return pd.DataFrame.from_records(records)


def download_binance(days_before=7, days_after=7):
    '''
    Download all the transactions for all the pumps in binance in a given interval
    @param days_before: the number of days before the pump
    @param days_after: the number of days after the pump
    '''

    df = pd.read_csv('pump_telegram.csv')
    binance_only = df[df['exchange'] == 'binance']

    for i, pump in binance_only.iterrows():
        symbol = pump['symbol']
        date = pump['date'] + ' ' + pump['hour']
        pump_time = datetime.strptime(date, "%Y-%m-%d %H:%M")
        before = to_timestamp(pump_time - timedelta(days=days_before))
        after = to_timestamp(pump_time + timedelta(days=days_after))
        # to comment out
        import os
        if os.path.exists('data/{}_{}'.format(symbol, str(date).replace(':', '.') + '.csv')):
            print(symbol)
            continue
        #
        df = download(symbol, before, after)
        df.to_csv('data/{}_{}'.format(symbol, str(date).replace(':', '.') + '.csv'), index=False)


if __name__ == '__main__':
    download_binance(days_before=12, days_after=7)


# using different API

In [15]:
import os
import time
import ccxt
import pandas as pd
from datetime import datetime, timedelta

In [16]:
binance = ccxt.binance({
    "enableRateLimit": True,           # pace requests automatically
})
TIMEFRAME   = '1m'                     # 1-minute candles
TF_MS       = 60_000                   # ms in one TF bucket
MAX_LIMIT   = 1000                     # Binance max rows / fetch_ohlcv call


In [17]:

def to_ts(dt):
    """Datetime → milliseconds epoch."""
    return binance.parse8601(dt.isoformat())

def fetch_ohlcv(symbol, start_ts, end_ts):
    """
    Pull 1-minute candles from start_ts → end_ts (exclusive).
    Returns a DataFrame with timestamp, o/h/l/c/volume.
    """
    rows, since = [], start_ts
    print(f"\n→ {symbol}: downloading OHLCV {binance.iso8601(start_ts)} → {binance.iso8601(end_ts)}")
    while since < end_ts:
        batch = binance.fetch_ohlcv(symbol, timeframe=TIMEFRAME,
                                    since=since, limit=MAX_LIMIT)
        if not batch:
            break
        rows.extend(batch)
        since = batch[-1][0] + TF_MS         # next minute
        pct   = 100 * (since - start_ts) / (end_ts - start_ts)
        print(f"  …{pct:5.1f}% ({binance.iso8601(since)})")
        time.sleep(binance.rateLimit / 1000) # polite; enableRateLimit mostly handles this
    cols = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
    return pd.DataFrame(rows, columns=cols)



In [18]:
def download_binance(days_before=7, days_after=7, single_file=False, window_ms=60_000):
    """
    Pull OHLCV around pump-times from pump_telegram.csv.
    * window_ms: pump flag half-width (default ±1 minute)
    """
    os.makedirs('data', exist_ok=True)
    pumps      = pd.read_csv("pump_telegram.csv")
    pumps      = pumps[pumps['exchange'] == 'binance']
    all_frames = []

    for _, pump in pumps.iterrows():
        coin     = pump['symbol']
        pair     = f"{coin}/BTC"             # change to /USDT if needed
        dt_str   = f"{pump['date']} {pump['hour']}"
        pump_dt  = datetime.strptime(dt_str, "%Y-%m-%d %H:%M")
        pump_ts  = to_ts(pump_dt)
        start_ts = to_ts(pump_dt - timedelta(days=days_before))
        end_ts   = to_ts(pump_dt + timedelta(days=days_after))

        df = fetch_ohlcv(pair, start_ts, end_ts)

        # add labels & metadata
        df['symbol']          = coin
        df['pump_time']       = pump_dt
        df['is_pump']         = df['timestamp'] == pump_ts
        df['is_pump_window']  = df['timestamp'].between(pump_ts - window_ms,
                                                        pump_ts + window_ms)

        if single_file:
            all_frames.append(df)
        else:
            out = f"data/{coin}_{dt_str.replace(':','.')}.csv"
            df.to_csv(out, index=False)
            print(f"✓ saved {out}")

    if single_file:
        master = pd.concat(all_frames, ignore_index=True)
        master.to_csv("data/all_pumps_ohlcv.csv", index=False)
        print("✓ saved data/all_pumps_ohlcv.csv")

In [19]:
download_binance(days_before=12, days_after=7, single_file=False)



→ BRD/BTC: downloading OHLCV 2018-12-10T17:00:00.000Z → 2018-12-29T17:00:00.000Z
  …  3.7% (2018-12-11T09:40:00.000Z)
  …  7.3% (2018-12-12T02:20:00.000Z)
  … 11.0% (2018-12-12T19:00:00.000Z)
  … 14.6% (2018-12-13T11:40:00.000Z)
  … 18.3% (2018-12-14T04:20:00.000Z)
  … 21.9% (2018-12-14T21:00:00.000Z)
  … 25.6% (2018-12-15T13:40:00.000Z)
  … 29.2% (2018-12-16T06:20:00.000Z)
  … 32.9% (2018-12-16T23:00:00.000Z)
  … 36.5% (2018-12-17T15:40:00.000Z)
  … 40.2% (2018-12-18T08:20:00.000Z)
  … 43.9% (2018-12-19T01:00:00.000Z)
  … 47.5% (2018-12-19T17:40:00.000Z)
  … 51.2% (2018-12-20T10:20:00.000Z)
  … 54.8% (2018-12-21T03:00:00.000Z)
  … 58.5% (2018-12-21T19:40:00.000Z)
  … 62.1% (2018-12-22T12:20:00.000Z)
  … 65.8% (2018-12-23T05:00:00.000Z)
  … 69.4% (2018-12-23T21:40:00.000Z)
  … 73.1% (2018-12-24T14:20:00.000Z)
  … 76.8% (2018-12-25T07:00:00.000Z)
  … 80.4% (2018-12-25T23:40:00.000Z)
  … 84.1% (2018-12-26T16:20:00.000Z)
  … 87.7% (2018-12-27T09:00:00.000Z)
  … 91.4% (2018-12-28T01:40:00

In [2]:
import os, glob, warnings, random, pathlib
from datetime import timedelta
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils import clip_grad_norm_

In [8]:
# ---------------------------------------------------------------
# pnd_final_lstm.py   •   with checkpoints & early-stopping
# ---------------------------------------------------------------
import os, glob, warnings, random, pathlib
from datetime import timedelta
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from torch.nn.utils import clip_grad_norm_

# ----------------------------- CONFIG --------------------------
DATA_DIR       = r"C:\Users\michael\Desktop\memedenis\data"
CHECK_DIR      = "./checkpoints"     # where to save .pt files
SEQ_LEN        = 60
HORIZON_MIN    = 90
POS_WEIGHT     = 15.0
FOCAL_GAMMA    = 2.0
BATCH_SIZE     = 256
N_EPOCHS       = 15
HIDDEN_SIZE    = 64
DROPOUT        = 0.4
LR             = 3e-4
N_SPLITS       = 5
PATIENCE       = 3                   # early-stop patience (epochs)
SEED           = 42
DEVICE         = "cuda" if torch.cuda.is_available() else "cpu"
# ---------------------------------------------------------------
torch.manual_seed(SEED); np.random.seed(SEED); random.seed(SEED)
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
pathlib.Path(CHECK_DIR).mkdir(exist_ok=True)

# -------------------- 1. LOAD ----------------------------------
def load_folder(folder):
    dfs = [pd.read_csv(f) for f in glob.glob(os.path.join(folder,"*.csv"))]
    if not dfs: raise FileNotFoundError("No CSVs in data dir")
    df = pd.concat(dfs, ignore_index=True)
    df["dt"]        = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
    df["pump_time"] = pd.to_datetime(df["pump_time"], utc=True)
    return df.sort_values("dt").reset_index(drop=True)

df = load_folder(DATA_DIR)
pump_times = sorted(df.loc[df["is_pump"], "pump_time"].unique())
print(f"Rows: {len(df):,} | Pumps: {len(pump_times)}")

# -------------------- 2. FEATURES ------------------------------
def rsi(s, p=14):
    d=s.diff(); up=d.clip(lower=0).ewm(alpha=1/p).mean()
    dn=(-d.clip(upper=0)).ewm(alpha=1/p).mean()
    return 100-100/(1+up/(dn+1e-9))
df["log_close"] = np.log(df["close"].replace(0,np.nan)).ffill()
df["log_return"]= df["log_close"].diff().fillna(0)
df["volatility_30"] = df["log_return"].rolling(30).std().bfill()
df["volume_ratio_30"]= df["volume"]/df["volume"].rolling(30).mean().bfill()
df["price_diff"]    = df["close"].diff().fillna(0)
df["vol_diff"]      = df["volume"].diff().fillna(0)
df["roll_mean_10"]  = df["close"].rolling(10).mean().bfill()
df["roll_std_10"]   = df["close"].rolling(10).std().bfill()
df["rsi_14"]        = rsi(df["close"])
df["log_vol"]       = np.log1p(df["volume"])
df["has_volume"]    = (df["volume"]>0).astype(int)

feature_cols = [
    "close","log_vol","has_volume",
    "price_diff","vol_diff",
    "roll_mean_10","roll_std_10","rsi_14",
    "volatility_30","volume_ratio_30"
]
df[feature_cols] = df[feature_cols].fillna(0)

# -------------------- 3. LABELS --------------------------------
ahead = (df["pump_time"]-df["dt"]).dt.total_seconds()/60.0
df["y_cls"] = ((ahead>0)&(ahead<=HORIZON_MIN)).astype(float)
df["y_reg"] = np.clip(ahead/60.0, 0, 12.0)      # hours

# -------------------- 4. WINDOW DATASET ------------------------
def make_windows(frame):
    Xs,yc,yr=[],[],[]
    X_arr = frame[feature_cols].values.astype("float32")
    c_arr = frame["y_cls"].values.astype("float32")
    r_arr = frame["y_reg"].values.astype("float32")
    for i in range(SEQ_LEN-1,len(frame)):
        Xs.append(X_arr[i-SEQ_LEN+1:i+1]); yc.append(c_arr[i]); yr.append(r_arr[i])
    return np.stack(Xs),np.array(yc),np.array(yr)

class SeqDS(Dataset):
    def __init__(self,X,yc,yr):
        self.X=torch.from_numpy(X); self.yc=torch.from_numpy(yc); self.yr=torch.from_numpy(yr)
    def __len__(self): return len(self.X)
    def __getitem__(self,i): return self.X[i], self.yc[i], self.yr[i]

# -------------------- 5. MODEL ---------------------------------
class AttnLSTM(nn.Module):
    def __init__(self,d_feat,hid=64,drop=0.4):
        super().__init__()
        self.rnn = nn.LSTM(d_feat,hid,2,bidirectional=True,dropout=drop,batch_first=True)
        self.attn= nn.Linear(2*hid,1,bias=False)
        self.cls = nn.Linear(2*hid,1)
        self.reg = nn.Linear(2*hid,1)
        self.drop= nn.Dropout(drop)
    def forward(self,x):
        h,_=self.rnn(x); a=torch.softmax(self.attn(h),1); ctx=(a*h).sum(1)
        ctx=self.drop(ctx)
        return self.cls(ctx).squeeze(-1), self.reg(ctx).squeeze(-1)

class FocalBCELoss(nn.Module):
    def __init__(self, alpha, gamma):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma
        self.bce = nn.BCEWithLogitsLoss(reduction='none')
    def forward(self, logit, target):
        b = self.bce(logit,target)
        p = torch.sigmoid(logit).detach()
        loss = (self.alpha*target + (1-target)) * (1-p)**self.gamma * b
        return loss.mean()

bce = FocalBCELoss(alpha=POS_WEIGHT, gamma=FOCAL_GAMMA).to(DEVICE)
l1  = nn.SmoothL1Loss()

# -------------------- 6. PUMP-AUC w/ NEGATIVES -----------------
def pump_auc(val_df, preds):
    val_df = val_df.copy()
    val_df["prob"] = np.nan
    val_df.loc[val_df.index[SEQ_LEN-1:], "prob"] = preds

    pos_times = val_df["pump_time"].unique()
    # choose equal-sized negative events (3h away from any pump)
    mask = ~val_df["is_pump"]
    neg_times = (val_df.loc[mask,"dt"]
                 .drop_duplicates()
                 .sample(len(pos_times), random_state=SEED))
    def max_prob(center):
        w=val_df["dt"].between(center-timedelta(minutes=HORIZON_MIN),
                               center+timedelta(minutes=HORIZON_MIN))
        return val_df.loc[w,"prob"].max(skipna=True)
    s_pos=[max_prob(t) for t in pos_times]
    s_neg=[max_prob(t) for t in neg_times]
    y = np.array([1]*len(s_pos)+[0]*len(s_neg))
    s = np.array(s_pos+s_neg)
    return roc_auc_score(y,s)

# -------------------- 7. TRAIN ONE FOLD -------------------------
def train_fold(tr_df,va_df,fold_tag):
    scaler=StandardScaler().fit(tr_df[feature_cols])
    tr_df[feature_cols]=scaler.transform(tr_df[feature_cols])
    va_df[feature_cols]=scaler.transform(va_df[feature_cols])

    Xtr,yc_tr,yr_tr = make_windows(tr_df)
    Xva,yc_va,yr_va = make_windows(va_df)
    tr_ds,va_ds=SeqDS(Xtr,yc_tr,yr_tr),SeqDS(Xva,yc_va,yr_va)
    sampler=WeightedRandomSampler((tr_ds.yc*POS_WEIGHT+1).numpy(),
                                  len(tr_ds),replacement=True)
    tr_loader=DataLoader(tr_ds,BATCH_SIZE,sampler=sampler,drop_last=True)
    va_loader=DataLoader(va_ds,BATCH_SIZE,shuffle=False)

    model=AttnLSTM(len(feature_cols),HIDDEN_SIZE,DROPOUT).to(DEVICE)
    opt=torch.optim.AdamW(model.parameters(),lr=LR,weight_decay=1e-3)

    best_auc,best_state,pat=PATIENCE,{},PATIENCE
    for ep in range(1,N_EPOCHS+1):
        # ---- train ----
        model.train()
        for xb,yc,yr in tr_loader:
            xb,yc,yr=xb.to(DEVICE),yc.to(DEVICE),yr.to(DEVICE)
            logit,reg=model(xb)
            loss=bce(logit,yc)+l1(reg,yr)
            opt.zero_grad(); loss.backward()
            clip_grad_norm_(model.parameters(),1.0); opt.step()
        # ---- val ----
        model.eval(); preds,yt=[],[]
        with torch.no_grad():
            for xb,yc,_ in va_loader:
                logit,_=model(xb.to(DEVICE))
                preds.append(torch.sigmoid(logit).cpu()); yt.append(yc)
        preds=np.concatenate(preds); yt=np.concatenate(yt)
        auc_m=roc_auc_score(yt,preds)
        auc_p=pump_auc(va_df,preds)
        print(f"{fold_tag} Ep{ep:02d}  minute-AUC {auc_m:.3f} | pump-AUC {auc_p:.3f}")

        # ---- early stop & checkpoint ----
        if auc_m > (best_auc if isinstance(best_auc,float) else 0) + 0.003:
            best_auc=auc_m; pat=PATIENCE
            best_state=model.state_dict()
        else:
            pat-=1
            if pat==0:
                break

    # save checkpoint for this fold
    ckpt_path=f"{CHECK_DIR}/{fold_tag.replace(' ','_')}_best.pt"
    torch.save(best_state, ckpt_path)
    print(f"{fold_tag} • saved best checkpoint to {ckpt_path} (minute-AUC {best_auc:.3f})")
    return best_auc, auc_p

# -------------------- 8. K-FOLD CV ------------------------------
kf=KFold(n_splits=N_SPLITS,shuffle=True,random_state=SEED)
fold_results=[]
for i,(tr_idx,va_idx) in enumerate(kf.split(pump_times),1):
    tr_p = np.array(pump_times)[tr_idx]
    va_p = np.array(pump_times)[va_idx]
    best_m,best_p = train_fold(df[df["pump_time"].isin(tr_p)].copy(),
                               df[df["pump_time"].isin(va_p)].copy(),
                               fold_tag=f"[fold {i}/{N_SPLITS}]")
    fold_results.append((best_m,best_p))

print("\n=== Summary ===")
for i,(m,p) in enumerate(fold_results,1):
    print(f"Fold {i}: best minute-AUC {m:.3f} | pump-AUC {p:.3f}")
print("Mean minute-AUC:", np.mean([m for m,_ in fold_results]).round(3))
print("Mean pump-AUC  :", np.mean([p for _,p in fold_results]).round(3))


Rows: 9,392,000 | Pumps: 336
[fold 1/5] Ep01  minute-AUC 0.584 | pump-AUC 0.810
[fold 1/5] Ep02  minute-AUC 0.589 | pump-AUC 0.751
[fold 1/5] Ep03  minute-AUC 0.554 | pump-AUC 0.707
[fold 1/5] Ep04  minute-AUC 0.559 | pump-AUC 0.735
[fold 1/5] Ep05  minute-AUC 0.548 | pump-AUC 0.688


RuntimeError: Parent directory ./checkpoints/[fold_1 does not exist.

In [None]:
df['Close_diff'] = df['Close'].diff()
df['Close_pct_change'] = df['Close'].pct_change() * 100
bins = [-np.inf, -2, -0.2, 0.2,2, np.inf] #find defenitions for bins
df['Close_pct_category'] = pd.cut(df['Close_pct_change'], bins=bins, labels=['sd', 'd', 'nc','i', 'si'])
df = df[['Date','Close','Close_diff','Close_pct_change','Close_pct_category']]

In [None]:
MAX_HORIZON = 12 * 60   # 12 hours
ahead_sec   = np.clip(ahead.dt.total_seconds(), 0, MAX_HORIZON*60)
df["label"] = ahead_sec / 60.0      # minutes-to-pump regression target

# 2. Loss: regression
crit = nn.SmoothL1Loss()

In [4]:
# 1.  How many positives in this val fold?
pos_rate = va_df["label"].mean()
print(f"{pos_rate:.3%} of validation minutes are positive")

# 2.  Do we at least rank the pumps correctly?
by_pump = (pd.DataFrame({
    "pump": va_df["pump_time"],
    "prob": torch.sigmoid(all_logits_val).cpu().numpy()
})
 .groupby("pump").max())          # max prob in −60…0 window
event_auc = roc_auc_score(
    (by_pump.index == actual_pump_times_in_val).astype(int),
    by_pump["prob"]
)
print("Event-level AUC:", event_auc)


NameError: name 'va_df' is not defined

In [None]:
# ---------------------------------------------------------------------------
# ORIGINAL PnD LSTM — local-file version
#
#  • Reads every  *.csv  in  DATA_DIR
#  • Re-creates the “old” handcrafted features
#  • Predicts  minutes-to-pump  as a regression target
#  • Uses the same single-layer LSTM → Linear head you started with
# ---------------------------------------------------------------------------
import os, glob, warnings, math, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)
torch.manual_seed(0);  np.random.seed(0)

# --------- 1. local data ---------------------------------------------------
DATA_DIR   = r"C:\Users\michael\Desktop\memedenis\data"         # <-- put all your ±7-day files here
SEQ_LEN    = 60                   # minutes per sample
VAL_SPLIT  = 0.2                  # last 20 % of timeline → validation
BATCH_SIZE = 256
N_EPOCHS   = 10
LR         = 3e-4
DEVICE     = "cuda" if torch.cuda.is_available() else "cpu"

# --------- 2. load & concatenate ------------------------------------------
df = pd.concat([pd.read_csv(fp) for fp in glob.glob(os.path.join(DATA_DIR,"*.csv"))],
               ignore_index=True)
df["dt"]        = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
df["pump_time"] = pd.to_datetime(df["pump_time"], utc=True)
df = df.sort_values("dt").reset_index(drop=True)
print("rows:", len(df))

# --------- 3. original handcrafted features -------------------------------
df["log_close"] = np.log(df["close"].replace(0,np.nan)).ffill()
df["log_ret"]   = df["log_close"].diff().fillna(0)
df["return_diff"]   = df["log_ret"].diff().fillna(0)
df["volatility"]    = df["log_ret"].rolling(10).std().bfill()
df["momentum"]      = df["close"].diff(5).fillna(0)
df["volume_std"]    = df["volume"].rolling(10).std().bfill()
df["volume_ratio"]  = df["volume"] / df["volume"].rolling(10).mean().bfill()
df["norm_close"]    = df["close"] / df["close"].rolling(30).mean().bfill()
df["norm_volume"]   = df["volume"]/ df["volume"].rolling(30).mean().bfill()

feature_cols = [
    "close","volume",
    "log_ret","return_diff","volatility","momentum",
    "volume_std","volume_ratio","norm_close","norm_volume"
]
df[feature_cols] = df[feature_cols].fillna(0)

# --------- 4. regression label: minutes-to-pump ---------------------------
ahead_min = (df["pump_time"] - df["dt"]).dt.total_seconds() / 60.0
df["y"]   = ahead_min.clip(lower=-12*60, upper=12*60)   # predict up to 12 h

# --------- 5. time-aware train/val split ----------------------------------
cut_dt = df["dt"].quantile(1 - VAL_SPLIT)
train_df = df[df["dt"] <  cut_dt].copy()
val_df   = df[df["dt"] >= cut_dt].copy()

scaler = StandardScaler().fit(train_df[feature_cols])
train_df[feature_cols] = scaler.transform(train_df[feature_cols])
val_df  [feature_cols] = scaler.transform(val_df  [feature_cols])

def make_windows(frame: pd.DataFrame):
    X,y = [],[]
    arr = frame[feature_cols].values.astype("float32")
    lab = frame["y"].values.astype("float32")
    for i in range(SEQ_LEN-1,len(frame)):
        X.append(arr[i-SEQ_LEN+1:i+1])
        y.append(lab[i])
    return np.stack(X), np.array(y)

X_tr,y_tr = make_windows(train_df)
X_va,y_va = make_windows(val_df)

class SeqDS(Dataset):
    def __init__(self,X,y): self.X=torch.from_numpy(X); self.y=torch.from_numpy(y)
    def __len__(self): return len(self.X)
    def __getitem__(self,i): return self.X[i], self.y[i]

tr_loader = DataLoader(SeqDS(X_tr,y_tr), BATCH_SIZE, shuffle=True, drop_last=True)
va_loader = DataLoader(SeqDS(X_va,y_va), BATCH_SIZE, shuffle=False)

# --------- 6. original single-layer LSTM ----------------------------------
class SimpleLSTM(nn.Module):
    def __init__(self,d_feat,hidden=64):
        super().__init__()
        self.rnn = nn.LSTM(d_feat, hidden, batch_first=True)
        self.head= nn.Linear(hidden,1)
    def forward(self,x):
        _, (h_last, _) = self.rnn(x)     # h_last shape (1,B,H)
        return self.head(h_last.squeeze(0)).squeeze(-1)

model = SimpleLSTM(len(feature_cols)).to(DEVICE)
opt   = torch.optim.Adam(model.parameters(), lr=LR)
crit  = nn.SmoothL1Loss()

# --------- 7. training loop ----------------------------------------------
for ep in range(1,N_EPOCHS+1):
    model.train()
    for xb,yb in tr_loader:
        xb,yb = xb.to(DEVICE), yb.to(DEVICE)
        pred  = model(xb)
        loss  = crit(pred, yb)
        opt.zero_grad(); loss.backward(); opt.step()

    model.eval(); preds,ys=[],[]
    with torch.no_grad():
        for xb,yb in va_loader:
            preds.append(model(xb.to(DEVICE)).cpu()); ys.append(yb)
    preds = torch.cat(preds).numpy(); ys = torch.cat(ys).numpy()
    mae   = mean_absolute_error(ys, preds)
    print(f"Epoch {ep:02d}  |  val MAE (minutes-to-pump) = {mae:.1f}")

print("Done.")


rows: 9392000
Epoch 01  |  val MAE (minutes-to-pump) = 592.8


In [11]:
import torch
import os
from datetime import datetime

# -------------------------------------------------------------
# 1)  Where to save
# -------------------------------------------------------------
SAVE_DIR = "./checkpoints"
os.makedirs(SAVE_DIR, exist_ok=True)

timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
ckpt_path = os.path.join(SAVE_DIR, f"model_{timestamp}.pt")

# -------------------------------------------------------------
# 2)  What to save
#     – model weights
#     – fitted scaler (for inference)
#     – feature column order
# -------------------------------------------------------------
torch.save(
    {
        "model_state": model.state_dict(),   # trained LSTM weights
        "scaler": scaler,                    # StandardScaler instance
        "feature_cols": feature_cols         # list of feature names
    },
    ckpt_path
)

print(f"✓ Model checkpoint saved to {ckpt_path}")


✓ Model checkpoint saved to ./checkpoints\model_20250628_161708.pt


  timestamp = datetime.utcnow().strftime("%Y%m%d_%H%M%S")


In [12]:
# ---------------------------------------------------------------------------
# Enrich every PnD CSV with the ORIGINAL feature columns
# and save to pnd_enriched / <same-name>.csv
# ---------------------------------------------------------------------------
import os, glob, numpy as np, pandas as pd
from pathlib import Path

IN_DIR  = r"C:\Users\michael\Desktop\memedenis\data"       # raw ±7-day files
OUT_DIR = r"C:\Users\michael\Desktop\memedenis\features"   # where enriched files go
Path(OUT_DIR).mkdir(exist_ok=True)

def add_original_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values("timestamp").reset_index(drop=True)

    # -------- original 10 engineered columns -----------------------------
    df["log_return"]    = np.log(df["close"].replace(0, np.nan)).diff().fillna(0)
    df["return_diff"]   = df["log_return"].diff().fillna(0)
    df["volatility"]    = df["log_return"].rolling(10).std().bfill()
    df["momentum"]      = df["close"].diff(5).fillna(0)
    df["volume_std"]    = df["volume"].rolling(10).std().bfill()
    df["volume_ratio"]  = df["volume"] / df["volume"].rolling(10).mean().bfill()
    df["norm_close"]    = df["close"]  / df["close"].rolling(30).mean().bfill()
    df["norm_volume"]   = df["volume"] / df["volume"].rolling(30).mean().bfill()
    # keep the two raw columns that were also part of the feature set
    # (close, volume) already exist in the file

    return df

# -------- process each CSV ----------------------------------------------
for fp in glob.glob(os.path.join(IN_DIR, "*.csv")):
    df_in  = pd.read_csv(fp)
    df_out = add_original_features(df_in)

    out_path = os.path.join(OUT_DIR, os.path.basename(fp))
    df_out.to_csv(out_path, index=False)
    print(f"✓ saved {out_path}")


✓ saved C:\Users\michael\Desktop\memedenis\features\ADX_2019-07-26 17.59.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ADX_2020-07-07 16.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\AMB_2019-01-01 14.30.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\APPC_2020-11-04 17.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\APPC_2020-11-18 18.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\APPC_2020-11-27 18.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\APPC_2020-12-07 16.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\APPC_2021-01-13 21.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ARDR_2019-04-17 17.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ARDR_2019-05-31 17.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ARDR_2020-11-02 16.00.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ARN_2018-04-10 16.30.csv
✓ saved C:\Users\michael\Desktop\memedenis\features\ARN_2020-04-07 1

In [6]:
# ---------------------------------------------------------------------------
# LOAD a saved “original-LSTM” checkpoint
# and get it ready for inference
# ---------------------------------------------------------------------------
import torch
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# ---- path to the checkpoint you saved after training ---------------------
CKPT_PATH = "./checkpoints/model_20250628_161708.pt"   # change if you renamed the file
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

# ---- 1. define the SAME model architecture ------------------------------
class SimpleLSTM(torch.nn.Module):
    def __init__(self, d_feat, hidden=64):
        super().__init__()
        self.rnn  = torch.nn.LSTM(d_feat, hidden, batch_first=True)
        self.head = torch.nn.Linear(hidden, 1)
    def forward(self, x):
        _, (h_last, _) = self.rnn(x)     # h_last shape (1,B,H)
        return self.head(h_last.squeeze(0)).squeeze(-1)

# ---- 2. load checkpoint --------------------------------------------------
ckpt   = torch.load(CKPT_PATH, map_location=DEVICE)
scaler = ckpt["scaler"]                # fitted StandardScaler
feat_cols = ckpt["feature_cols"]       # column order

model  = SimpleLSTM(len(feat_cols)).to(DEVICE)
model.load_state_dict(ckpt["model_state"])
model.eval()

print("✓ Model, scaler, and feature list loaded from checkpoint")

# -------------------------------------------------------------------------
#  EXAMPLE INFERENCE  –  predict minutes-to-pump for each window
# -------------------------------------------------------------------------
def engineer_original_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.sort_values("timestamp").reset_index(drop=True)
    df["log_return"]   = np.log(df["close"].replace(0, np.nan)).diff().fillna(0)
    df["return_diff"]  = df["log_return"].diff().fillna(0)
    df["volatility"]   = df["log_return"].rolling(10).std().bfill()
    df["momentum"]     = df["close"].diff(5).fillna(0)
    df["volume_std"]   = df["volume"].rolling(10).std().bfill()
    df["volume_ratio"] = df["volume"] / df["volume"].rolling(10).mean().bfill()
    df["norm_close"]   = df["close"]  / df["close"].rolling(30).mean().bfill()
    df["norm_volume"]  = df["volume"] / df["volume"].rolling(30).mean().bfill()
    return df

def predict_minutes_to_pump(csv_path: str):
    df = pd.read_csv(csv_path)
    df = engineer_original_features(df)
    X  = scaler.transform(df[feat_cols].fillna(0).values.astype("float32"))

    # create sliding windows of length 60
    windows = np.stack([
        X[i-59:i+1] for i in range(59, len(X))
    ])
    with torch.no_grad():
        preds = model(torch.from_numpy(windows).to(DEVICE)).cpu().numpy()
    return preds  # one prediction per minute starting at the 60th row

# ---- usage ---------------------------------------------------------------
# preds = predict_minutes_to_pump("./pnd_enriched/BTC_2019-07-26_17.59.csv")
# print(preds[:10])


UnpicklingError: Weights only load failed. This file can still be loaded, to do so you have two options, [1mdo those steps only if you trust the source of the checkpoint[0m. 
	(1) In PyTorch 2.6, we changed the default value of the `weights_only` argument in `torch.load` from `False` to `True`. Re-running `torch.load` with `weights_only` set to `False` will likely succeed, but it can result in arbitrary code execution. Do it only if you got the file from a trusted source.
	(2) Alternatively, to load with `weights_only=True` please check the recommended steps in the following error message.
	WeightsUnpickler error: Unsupported global: GLOBAL sklearn.preprocessing._data.StandardScaler was not an allowed global by default. Please use `torch.serialization.add_safe_globals([StandardScaler])` or the `torch.serialization.safe_globals([StandardScaler])` context manager to allowlist this global if you trust this class/function.

Check the documentation of torch.load to learn more about types accepted by default with weights_only https://pytorch.org/docs/stable/generated/torch.load.html.

In [8]:
import torch
from sklearn.preprocessing import StandardScaler   # needed for allow-list

# tell torch.load it’s OK to un-pickle this class
torch.serialization.add_safe_globals([StandardScaler])

CKPT_PATH = "./checkpoints/model_20250628_161708.pt"
DEVICE    = "cuda" if torch.cuda.is_available() else "cpu"

ckpt = torch.load(CKPT_PATH, map_location=DEVICE, weights_only=False)
scaler       = ckpt["scaler"]
feature_cols = ckpt["feature_cols"]

from torch import nn
class SimpleLSTM(nn.Module):
    def __init__(self, d_feat, hidden=64):
        super().__init__()
        self.rnn  = nn.LSTM(d_feat, hidden, batch_first=True)
        self.head = nn.Linear(hidden, 1)
    def forward(self, x):
        _, (h_last, _) = self.rnn(x)
        return self.head(h_last.squeeze(0)).squeeze(-1)

model = SimpleLSTM(len(feature_cols)).to(DEVICE)
model.load_state_dict(ckpt["model_state"])
model.eval()

print("✓ checkpoint loaded, model ready")


✓ checkpoint loaded, model ready


In [10]:
# ---------------------------------------------------------------------------
# Re-create the validation set (features + scaling + windows)
# ---------------------------------------------------------------------------
import os, glob, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler

# ---------------- settings (match the training cell) -----------------------
DATA_DIR   = r"C:\Users\michael\Desktop\memedenis\data"
SEQ_LEN    = 60
VAL_SPLIT  = 0.20           # last 20 % of timeline → validation

# ---------------- 1. load all raw CSVs ------------------------------------
df = pd.concat([pd.read_csv(fp) for fp in glob.glob(os.path.join(DATA_DIR, "*.csv"))],
               ignore_index=True)

# unify datetime columns exactly as before
if df["timestamp"].max() < 1e12:         # seconds → ms
    df["timestamp"] *= 1000
df["dt"] = pd.to_datetime(df["timestamp"], unit="ms", utc=True)
df["pump_time"] = pd.to_datetime(df["pump_time"], utc=True)
df = df.sort_values("dt").reset_index(drop=True)

# ---------------- 2. original handcrafted features ------------------------
df["log_close"]   = np.log(df["close"].replace(0, np.nan)).ffill()
df["log_ret"]     = df["log_close"].diff().fillna(0)
df["return_diff"] = df["log_ret"].diff().fillna(0)
df["volatility"]  = df["log_ret"].rolling(10).std().bfill()
df["momentum"]    = df["close"].diff(5).fillna(0)
df["volume_std"]  = df["volume"].rolling(10).std().bfill()
df["volume_ratio"]= df["volume"] / df["volume"].rolling(10).mean().bfill()
df["norm_close"]  = df["close"]  / df["close"].rolling(30).mean().bfill()
df["norm_volume"] = df["volume"] / df["volume"].rolling(30).mean().bfill()

feature_cols = [
    "close","volume",
    "log_ret","return_diff","volatility","momentum",
    "volume_std","volume_ratio","norm_close","norm_volume"
]
df[feature_cols] = df[feature_cols].fillna(0)

# ---------------- 3. label: minutes-to-pump -------------------------------
ahead_min = (df["pump_time"] - df["dt"]).dt.total_seconds() / 60.0
df["y"]   = ahead_min.clip(lower=0, upper=12*60)

# ---------------- 4. identical time-based split ---------------------------
cut_dt = df["dt"].quantile(1 - VAL_SPLIT)
train_df = df[df["dt"] <  cut_dt].copy()
val_df   = df[df["dt"] >= cut_dt].copy()

# ---------------- 5. fit scaler on train, apply to val --------------------
scaler = StandardScaler().fit(train_df[feature_cols])
val_df[feature_cols] = scaler.transform(val_df[feature_cols])

# ---------------- 6. build 60-minute windows for the validation slice -----
def make_windows(frame):
    X, y = [], []
    arr  = frame[feature_cols].values.astype("float32")
    lab  = frame["y"].values.astype("float32")
    for i in range(SEQ_LEN-1, len(frame)):
        X.append(arr[i-SEQ_LEN+1:i+1])
        y.append(lab[i])
    return np.stack(X), np.array(y)

X_val, y_val = make_windows(val_df)

print("Validation set rebuilt:")
print(f"  val_df rows        : {len(val_df):,}")
print(f"  validation windows : {len(X_val):,}")


Validation set rebuilt:
  val_df rows        : 1,878,402
  validation windows : 1,878,343


In [14]:
# ---------------------------------------------------------------------------
#  Show model countdown at 1 day / 1 h / 30 m / 10 m / 5 m before each pump
# ---------------------------------------------------------------------------
import numpy as np
import pandas as pd
from datetime import timedelta

# ── 0. make sure val_df has predictions (y_pred) for every minute ─────────
if "y_pred" not in val_df.columns:
    BATCH_INF = 2048
    pred_minutes = np.empty(len(X_val), dtype=np.float32)
    model.eval()
    with torch.no_grad():
        for s in range(0, len(X_val), BATCH_INF):
            e = s + BATCH_INF
            xb = torch.from_numpy(X_val[s:e]).to(DEVICE)
            pred_minutes[s:e] = model(xb).cpu().numpy()
    val_df = val_df.reset_index(drop=True)            # align indices
    val_df.loc[val_df.index[SEQ_LEN-1:], "y_pred"] = pred_minutes

# ── 1. define the five checkpoints (in minutes) ───────────────────────────
OFFSETS = {
    "1d_before" : 24*60,
    "1h_before" : 60,
    "30m_before": 30,
    "10m_before": 10,
    "5m_before" : 5
}

# ── 2. gather forecast at each offset for every pump in validation ────────
records = []
for p_time in val_df["pump_time"].unique():
    pump_block = val_df[val_df["pump_time"] == p_time]
    for label, minutes in OFFSETS.items():
        target_dt = p_time - timedelta(minutes=minutes)

        # nearest minute *at or before* the target (safety for missing bars)
        row = pump_block[pump_block["dt"] <= target_dt].tail(1)
        if row.empty:
            pred = np.nan
        else:
            pred = row["y_pred"].values[0]

        records.append({
            "pump_time": p_time,
            "checkpoint": label,
            "y_pred"   : pred
        })

result_df = (pd.DataFrame(records)
               .pivot(index="pump_time", columns="checkpoint", values="y_pred")
               .sort_index())

print("\n=== Model countdown at key offsets (minutes remaining) ===")
print(result_df.round(2).to_string())



=== Model countdown at key offsets (minutes remaining) ===
checkpoint                 10m_before  1d_before  1h_before  30m_before  5m_before
pump_time                                                                         
2020-06-28 16:00:00+00:00         NaN        NaN        NaN         NaN        NaN
2020-07-04 16:00:00+00:00      718.54     719.39     718.96      719.23     719.18
2020-07-07 16:00:00+00:00      720.14     719.27     710.53      719.29     719.34
2020-07-10 17:00:00+00:00      719.32     719.25     719.25      719.16     718.54
2020-07-16 18:00:00+00:00      719.19     719.19     719.17      719.18     719.03
2020-07-21 16:00:00+00:00      719.23     719.31     718.71      719.16     719.25
2020-07-24 17:00:00+00:00      684.25     719.27     718.25      718.95     718.29
2020-07-30 18:00:00+00:00      719.04     719.21     719.23      692.65     719.18
2020-08-02 16:00:00+00:00      719.27     719.36     719.30      719.31     719.30
2020-08-04 18:00:00+00:00  