In [1]:
import os
import sys
sys.path.append(os.path.abspath('../'))
from dataset.aave_preload import AavePreloadDataset
import os.path as path
import logging
from dataset.vocab import *

# Set up root logging at INFO level; every record is prefixed with its
# timestamp and severity so the preprocessing stages can be followed below.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
log = logging.getLogger(__name__)

In [2]:
# Feature-group switches: each enabled group contributes one suffix to the
# data file name.
include_user_features = True
include_time_features = True
include_market_features = True
include_exo_features = False

# Suffixes are concatenated in this fixed order: user, market, time, exo.
fextension = "".join(
    suffix
    for enabled, suffix in (
        (include_user_features, "_user"),
        (include_market_features, "_market"),
        (include_time_features, "_time"),
        (include_exo_features, "_exoLagged"),
    )
    if enabled
)

In [3]:
# Root and file configurations.
# NOTE: `fname` and `test_fname` must be built BEFORE `fextension` is cleared
# below. Re-running this cell without first re-running the feature-flag cell
# therefore produces file names without the feature suffix.
root = "/data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet"
fname = f"transactions{fextension}_train"
val_fname = ""
test_fname = f"transactions{fextension}_test"

# The feature suffix is already part of the file names above, so the dataset
# is given an empty extension (presumably so it is not appended a second time
# -- confirm against AavePreloadDataset).
fextension = ""
vocab_dir = "vocab/"

# Cache flags: everything is False for the first run, so the raw CSV is read
# and the encoder / vocab are fitted from scratch.
cached = False
val_cached = False
test_cached = False
vocab_cached = False
encoder_cached = False

# No previously fitted encoder exists yet; kept as a string so later cells can
# overwrite it with a real path.
external_encoder_path = ""

# Class name of the dataset to be instantiated
dataset_class = "AavePreloadDataset"

# Resolve the class by name with a plain namespace lookup instead of eval(),
# which would execute any expression stored in `dataset_class`.
dataset = globals()[dataset_class](
    num_bins=10,
    cached=cached,
    encoder_cached=encoder_cached,
    external_encoder_path=external_encoder_path,
    vocab_cached=vocab_cached,
    root=root,
    fname=fname,
    vocab_dir=vocab_dir,
    fextension=fextension,
    nrows=None,
    adap_thres=10 ** 8,
    get_rids=True,
    columns_to_select=None
)

# Print out vocab sizes for verification
print(len(dataset.dynamic_vocab), len(dataset.time_feature_vocab), len(dataset.static_vocab))



  data = pd.read_csv(fname, nrows=self.nrows)
2025-01-22 14:06:52,681 - INFO - read data : (1921610, 128)
2025-01-22 14:06:52,682 - INFO - /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/transactions_user_market_time_train.csv is read.
2025-01-22 14:06:52,682 - INFO - nan resolution.
2025-01-22 14:06:57,664 - INFO - timestamp fit transform
2025-01-22 14:06:57,665 - INFO - label-fit-transform.
 41%|████      | 7/17 [00:02<00:02,  3.42it/s]2025-01-22 14:06:59,746 - INFO - skipping col user
2025-01-22 14:06:59,746 - INFO - skipping col id
100%|██████████| 17/17 [00:03<00:00,  4.35it/s]
2025-01-22 14:07:01,576 - INFO - amount quant transform
  0%|          | 0/111 [00:00<?, ?it/s]2025-01-22 14:07:01,578 - INFO - skipping col rowNumber
2025-01-22 14:07:01,578 - INFO - skipping col timestamp
100%|██████████| 111/111 [00:05<00:00, 18.58it/s]
2025-01-22 14:07:08,285 - INFO - writing cached csv to /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/preprocess

1208 18 520


In [4]:
# If no encoder was cached before the training run, reuse the one the training
# dataset reports in `dataset.encoder_path` for the test split.
if not encoder_cached:
    print(f"dataset.encoder_path: {dataset.encoder_path}")
    external_encoder_path = dataset.encoder_path

# The training run above has produced the vocab and encoder, so mark both as
# cached from here on.
vocab_cached = True
encoder_cached = True

# Resolve the class by name with a plain namespace lookup instead of eval(),
# which would execute any expression stored in `dataset_class`.
test_dataset = globals()[dataset_class](
    num_bins=10,
    cached=test_cached,  # False: encode the test CSV afresh rather than reading a cached encoding
    encoder_cached=encoder_cached,  # True: load the already-fitted encoder instead of fitting a new one
    external_encoder_path=external_encoder_path,  # path of the encoder fitted on the training split
    vocab_cached=vocab_cached,
    root=root,
    fname=test_fname,
    vocab_dir=vocab_dir,
    fextension=fextension,
    nrows=None,
    adap_thres=10 ** 8,
    get_rids=True,
    columns_to_select=None
)

# Print out vocab sizes for the test dataset; they should match the training
# dataset's sizes printed earlier.
print(len(test_dataset.dynamic_vocab), len(test_dataset.time_feature_vocab), len(test_dataset.static_vocab))


2025-01-22 14:07:50,631 - INFO - cached encoder is read from /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/preprocessed/transactions_user_market_time_train.encoder_fit.pkl


dataset.encoder_path: /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/preprocessed/transactions_user_market_time_train.encoder_fit.pkl


2025-01-22 14:07:52,934 - INFO - read data : (87566, 128)
2025-01-22 14:07:52,935 - INFO - /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/transactions_user_market_time_test.csv is read.
2025-01-22 14:07:52,936 - INFO - nan resolution.
2025-01-22 14:07:53,165 - INFO - timestamp fit transform
2025-01-22 14:07:53,166 - INFO - label-fit-transform.
 29%|██▉       | 5/17 [00:00<00:00, 39.15it/s]2025-01-22 14:07:53,333 - INFO - skipping col user
2025-01-22 14:07:53,334 - INFO - skipping col id
100%|██████████| 17/17 [00:00<00:00, 58.93it/s]
2025-01-22 14:07:53,457 - INFO - amount quant transform
  0%|          | 0/111 [00:00<?, ?it/s]2025-01-22 14:07:53,459 - INFO - skipping col rowNumber
2025-01-22 14:07:53,459 - INFO - skipping col timestamp
100%|██████████| 111/111 [00:00<00:00, 616.28it/s]
2025-01-22 14:07:53,681 - INFO - writing cached csv to /data/IDEA_DeFi_Research/LTM/Data/Lending_Protocols/Aave/V2/Mainnet/preprocessed/transactions_user_market_time_test.encoded.cs

1208 18 520
