In [1]:
import os
import sys
sys.path.append(os.path.abspath('../'))
from os import makedirs
from os.path import join, basename
import logging
import numpy as np
import torch
import random
from arguments import define_new_main_parser
import json

from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

from dataset.dataset import Dataset
from models.modules import TabFormerBertLM, TabFormerBertForClassification, TabFormerBertModel, TabStaticFormerBert, \
    TabStaticFormerBertLM, TabStaticFormerBertClassification
from misc.utils import ordered_split_dataset, compute_cls_metrics
from dataset.datacollator import *
from main import main

# Logger for this notebook; `log` is a second name bound to the very same object.
logger = log = logging.getLogger(__name__)

# Root logging configuration: INFO and above, with month/day/year timestamps.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%m/%d/%Y %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
)

# Set WANDB_DISABLED in this process's environment.
# NOTE(review): presumably this switches off Weights & Biases reporting in the
# transformers Trainer -- confirm against the installed transformers version.
os.environ.update(WANDB_DISABLED="true")

# Directory that holds the transaction files used by this notebook.
data_path = "/data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet"

# Switches deciding which feature-group suffixes appear in the train/test file names.
include_user_features = True
include_time_features = True
include_market_features = True
include_exo_features = False

# Suffixes of the enabled groups are concatenated in this fixed order:
# user, market, time, exoLagged.
feature_extension = "".join(
    suffix
    for enabled, suffix in (
        (include_user_features, "_user"),
        (include_market_features, "_market"),
        (include_time_features, "_time"),
        (include_exo_features, "_exoLagged"),
    )
    if enabled
)

# The .rds name is fixed; only the train/test CSV names follow `feature_extension`.
file_path = f"{data_path}/transactions_user_market_time_exoLagged.rds"
train_path, test_path = (
    f"{data_path}/transactions{feature_extension}_train.csv",
    f"{data_path}/transactions{feature_extension}_test.csv",
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Run configuration for the export job. Every value below is interpolated into
# the command-line string assembled in the next cell.

# Dataset root: reuse `data_path` from the first cell instead of repeating the
# literal, so the two locations cannot drift apart.
data = data_path
dt = "Aave"  # forwarded as --data_type
exp_name = "debug"  # only used to build `output_dir`
time_pos_type = "regular_position"  # forwarded as --time_pos_type

# Base names of the train / validation / test files; they follow the
# feature-group suffix computed in the first cell.
fname = f"transactions{feature_extension}_train"
val_fname = f"transactions{feature_extension}_val"
test_fname = f"transactions{feature_extension}_test"
fextension = False  # falsy -> no --fextension flag and the plain vocab_ob path is used

bs = 32  # used for both --train_batch_size and --eval_batch_size
field_hs = 64  # hidden state dimension of the transformer (default: 768)
seq_len = 10  # length for transaction sliding window
stride = 1  # stride for transaction sliding window
num_train_epochs = 10
save_steps = 100
eval_steps = 100
external_val = False  # True adds the --external_val flag
output_dir = f"{data}/output/{exp_name}"
checkpoint = None  # None -> no --checkpoint flag is passed
nrows = 1000  # forwarded as --nrows; NOTE(review): looks like a row limit for quick debug runs -- confirm
vocab_dir = f"{data}/vocab"
resample_method = None  # None -> no --resample_method flag is passed

# export-specific arguments:
nbatches = 1

In [3]:
# Assemble the command line for the export run.
#
# Flags are collected in a list and joined with single spaces. Unlike a
# backslash-continued string, the separation between flags no longer depends on
# whitespace that happens to sit before a closing quote or after a line
# continuation. `arg_str.split()` yields the same token sequence as before.
arg_parts = [
    "--export_task",
    "--mlm",
    "--pad_seq_first",
    "--get_rids",
    "--field_ce",
    "--lm_type bert",
    f"--field_hs {field_hs}",
    f"--data_type {dt}",
    f"--seq_len {seq_len}",
    f"--stride {stride}",
    f"--num_train_epochs {num_train_epochs}",
    f"--data_root {data}/",
    f"--train_batch_size {bs}",
    f"--eval_batch_size {bs}",
    f"--save_steps {save_steps}",
    f"--eval_steps {eval_steps}",
    f"--data_fname {fname}",
    f"--data_val_fname {val_fname}",
    f"--data_test_fname {test_fname}",
    f"--output_dir {output_dir}",
    f"--time_pos_type {time_pos_type}",
    f"--vocab_dir {vocab_dir}",
    f"--nrows {nrows}",
    "--vocab_cached",
    "--encoder_cached",
    "--cached",
    f"--nbatches {nbatches}",
    "--export_cls_embeddings",
]

# The external vocab file name carries the extension suffix only when one is set.
if fextension:
    arg_parts.append(f"--fextension {fextension}")
    arg_parts.append(f"--external_vocab_path {data}/vocab/vocab_ob_{fextension}")
else:
    arg_parts.append(f"--external_vocab_path {data}/vocab/vocab_ob")

# Optional flags are emitted only when the corresponding setting is active.
if resample_method is not None:
    arg_parts.append(f"--resample_method {resample_method}")
if external_val:
    arg_parts.append("--external_val")
if checkpoint is not None:
    arg_parts.append(f"--checkpoint {checkpoint}")

# Kept as a whitespace-separated string because the next cell calls `arg_str.split()`.
# Values containing whitespace would therefore still be split into several tokens.
arg_str = " ".join(arg_parts)

In [4]:
# Build the project's argument parser, restricting the data type to the two listed choices.
parser = define_new_main_parser(
    data_type_choices=["Aave", "Cosmetics"],
)
# Tokenise the assembled command line on whitespace and parse it into `opts`.
opts = parser.parse_args(args=arg_str.split())

In [5]:
# Logs are written to a "logs" folder inside the run's output directory.
opts.log_dir = join(opts.output_dir, "logs")
makedirs(opts.output_dir, exist_ok=True)
makedirs(opts.log_dir, exist_ok=True)

log_file = join(opts.log_dir, 'output.log')

# Re-running this cell must not stack handlers on the notebook logger. Remove and
# close any FileHandler a previous execution attached for the same file before
# it is reopened in 'w' mode; otherwise the old handle leaks and every message
# is written more than once.
for stale_handler in list(logger.handlers):
    if (isinstance(stale_handler, logging.FileHandler)
            and stale_handler.baseFilename == os.path.abspath(log_file)):
        logger.removeHandler(stale_handler)
        stale_handler.close()

file_handler = logging.FileHandler(log_file, 'w', 'utf-8')
logger.addHandler(file_handler)

# True when either the classification task or the export task was requested.
opts.cls_exp_task = opts.cls_task or opts.export_task

# For the Aave data type only these two positional-encoding settings are accepted.
if opts.data_type in ["Aave"]:
    assert opts.time_pos_type in ['sin_cos_position', 'regular_position']

# A BERT run needs at least one of the MLM, classification or export modes.
if (not opts.mlm) and (not opts.cls_exp_task) and opts.lm_type == "bert":
    raise Exception(
        "Error: Bert needs either '--mlm', '--cls_task' or '--export_task' option. Please re-run with this flag "
        "included.")

# Hand the parsed options to the project's entry point.
main(opts)

2025-02-12 12:59:28 - INFO - dataset.basic - cached encoded data is read from transactions_user_market_time_train.encoded.csv
2025-02-12 12:59:28 - INFO - dataset.basic - read data : (1000, 128)
2025-02-12 12:59:28 - INFO - dataset.basic - using cached vocab from /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/vocab/vocab_ob
2025-02-12 12:59:28 - INFO - dataset.dataset - preparing user level data...
100%|██████████| 18/18 [00:00<00:00, 437.86it/s]
2025-02-12 12:59:28 - INFO - dataset.dataset - creating transaction samples with vocab
100%|██████████| 18/18 [00:00<00:00, 270.02it/s]
2025-02-12 12:59:28 - INFO - dataset.dataset - ncols: 125
2025-02-12 12:59:28 - INFO - dataset.dataset - no of samples 1000
2025-02-12 12:59:28 - INFO - main - vocab size: 1732
2025-02-12 12:59:28 - INFO - main - dataset size: 1000
2025-02-12 12:59:28 - INFO - dataset.basic - cached encoded data is read from transactions_user_market_time_train.encoded.csv
2025-02-12 12:59:29 - INFO - datas

Predictions Shape After Processing: (1000, 1250, 1732)
CLS Embeddings Shape: (1000, 1732)


In [6]:
import numpy

# Build the archive path from `output_dir` rather than a hard-coded
# ".../output/debug/..." literal, so this cell follows `exp_name` when it changes.
# With exp_name="debug" the resulting path is the same as the previous literal.
# `b` stays open because later cells index into it by array name.
b = numpy.load(join(output_dir, 'all_embeddings.npz'))
print(b.files)

['seq_last_rids', 'seq_last_labels', 'cls_embeddings']


In [7]:
# Number of values in the first row of the stored 'cls_embeddings' array.
len(b["cls_embeddings"][0])

1732

In [10]:
# Preview the first ten entries of the stored 'seq_last_rids' array.
b["seq_last_rids"][:10]

array(['0x3fec3516c8085e089d408562dd3f9ca1bbbc5b0eb4eaf20dc1acca3d5467ff57',
       '0x452f5f9b8d503f395a1db08a2bd8f3af937d401d53584ec5fea321fe8bea29de',
       '0x59a33c72e045d762a069ddfe10540ddc1b521cbd27e11e819a8b0e28b45732f1',
       '0x3c07de24894604a8e9fd36be5eda98a8f9fdad794536cce07393c86a4718b9a6',
       '0x090c9fb05186afb08a0e311160409f24b5aa33de8ea1b65998b218d8d91a9cbb',
       '0x7e34687060651f2b90bbe729c1ede45d06c9316c164f4c436ade8835d42c1238',
       '0x6b4ce8ce64a226ecdb08cad04fd4f15a8cc38c361cc672611ab0c3306bed91ef',
       '0x3f33d1fd27aee4e4ff56ef10ab6280729d187cd159a31cfec0c533174d92415d',
       '0x8d4a61bd9db350421e340d6b892f91f59935f1b9a64da4747737e5f859522503',
       '0x14c066a9f5cd381b95c82ee3ca4c47e3936a731a1367990e8944fcf528579026'],
      dtype='<U66')

In [11]:
# Preview the first ten rows of the stored 'cls_embeddings' array.
b["cls_embeddings"][:10]

array([[ 0.        ,  0.3931944 , -2.541841  , ...,  0.17998986,
         2.1453826 ,  0.37416127],
       [ 0.        ,  0.49045253, -1.628636  , ..., -0.22098492,
         1.8706552 , -1.213297  ],
       [ 0.        ,  1.1743053 , -3.493845  , ..., -1.3110975 ,
         3.1819081 , -0.83176506],
       ...,
       [ 0.        , -0.02754102, -1.0463388 , ...,  2.7947426 ,
         2.0592046 , -2.362729  ],
       [ 0.        ,  0.45193866, -0.9020668 , ...,  0.82694864,
         4.745487  , -2.0084782 ],
       [ 0.        ,  1.4116417 ,  0.09941611, ...,  1.1192842 ,
         2.6529408 , -2.003558  ]], dtype=float32)