In [1]:
import os
import sys
sys.path.append(os.path.abspath('../'))
from dataset.preload import PreloadDataset
import os.path as path
import logging
from dataset.vocab import *
from arguments import parse_args

# Read the run configuration through the project's argument parser.
args = parse_args()

# Set up logging at INFO level; every record carries its timestamp and
# severity ahead of the message text.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
log = logging.getLogger(__name__)

In [2]:
# Feature-group switches, copied off the parsed arguments.
include_user_features = args.include_user_features
include_time_features = args.include_time_features
include_market_features = args.include_market_features
include_exo_features = args.include_exo_features

# File-name suffix: every enabled group appends its tag, always in the
# order user -> market -> time -> exogenous (lagged).
fextension = "".join(
    tag
    for enabled, tag in (
        (include_user_features, "_user"),
        (include_market_features, "_market"),
        (include_time_features, "_time"),
        (include_exo_features, "_exoLagged"),
    )
    if enabled
)

In [3]:
# Root and file configurations
# Alternative dataset root (eCommerce / Cosmetics), kept for quick switching:
#root = "/data/IDEA_DeFi_Research/Data/eCommerce/Cosmetics/preprocessed/"
root = "/data/IDEA_DeFi_Research/Data/AML/LI_Small/preprocessed/"
fname = f"transactions{fextension}_train"
val_fname = ""
test_fname = f"transactions{fextension}_test"

# The feature suffix is already baked into fname/test_fname above; the dataset
# constructors receive an empty fextension.
# NOTE: this overwrites the suffix assembled in the previous cell. Re-run that
# cell before re-running this one, otherwise fname/test_fname lose their suffix.
fextension = ""
vocab_dir = "vocab/"

# Cache flags. Everything starts uncached so the first run rebuilds the
# encoded data, the encoder and the vocabulary from the raw file.
cached = False
val_cached = False
test_cached = False
vocab_cached = False
encoder_cached = False
# No externally supplied encoder yet; kept as a string so it is a valid path value.
external_encoder_path = ""

# Class name of the dataset to be instantiated
dataset_class = "PreloadDataset"

# Instantiate the dataset for preloading.
# The class is resolved with a plain namespace lookup instead of eval(), so the
# string can only name an object already in scope and is never executed as code.
dataset = globals()[dataset_class](
    num_bins=10,
    cached=cached,  # False: force reloading for the first run
    encoder_cached=encoder_cached,
    external_encoder_path=external_encoder_path,
    vocab_cached=vocab_cached,
    root=root,
    fname=fname,
    vocab_dir=vocab_dir,
    fextension=fextension,
    nrows=None,
    adap_thres=10 ** 8,
    get_rids=True,
    columns_to_select=None
)

# Print out vocab sizes for verification
print(len(dataset.dynamic_vocab), len(dataset.time_feature_vocab), len(dataset.static_vocab))



2025-02-10 10:13:44,319 - INFO - read data : (6049093, 70)
2025-02-10 10:13:44,320 - INFO - /data/IDEA_DeFi_Research/Data/AML/LI_Small/preprocessed/transactions_user_time_train.csv is read.
2025-02-10 10:13:44,320 - INFO - nan resolution.
2025-02-10 10:13:48,112 - INFO - timestamp fit transform
2025-02-10 10:13:48,113 - INFO - label-fit-transform.
  0%|          | 0/6 [00:00<?, ?it/s]2025-02-10 10:13:48,115 - INFO - skipping col user
100%|██████████| 6/6 [00:03<00:00,  1.78it/s]
2025-02-10 10:13:51,494 - INFO - amount quant transform
  0%|          | 0/64 [00:00<?, ?it/s]2025-02-10 10:13:51,496 - INFO - skipping col rowNumber
2025-02-10 10:13:51,496 - INFO - skipping col timestamp
2025-02-10 10:13:51,497 - INFO - skipping col id
2025-02-10 10:13:51,499 - INFO - encoding userBank
  6%|▋         | 4/64 [00:00<00:01, 36.80it/s]2025-02-10 10:13:51,618 - INFO - encoding recipientBank
2025-02-10 10:13:51,831 - INFO - encoding amountReceived
2025-02-10 10:13:52,131 - INFO - encoding amount
20

218 18 416


In [4]:
# If the encoder wasn't cached before, point the external path at the encoder
# file reported by the training dataset so the test split can reuse it.
if not encoder_cached:
    print(f"dataset.encoder_path: {dataset.encoder_path}")
    external_encoder_path = dataset.encoder_path

# Update flags indicating caching is now done
vocab_cached = True
encoder_cached = True

# Build the test split with the same settings as the training split.
# The class is resolved with a plain namespace lookup instead of eval(), so the
# string can only name an object already in scope and is never executed as code.
test_dataset = globals()[dataset_class](
    num_bins=10,
    cached=False,  # Disable cached to generate fresh encoded data
    encoder_cached=encoder_cached,  # True here: use the cached encoder rather than a new one
    external_encoder_path=external_encoder_path,  # Encoder file taken from the training dataset
    vocab_cached=vocab_cached,
    root=root,
    fname=test_fname,
    vocab_dir=vocab_dir,
    fextension=fextension,
    nrows=None,
    adap_thres=10 ** 8,
    get_rids=True,
    columns_to_select=None
)

# Print out vocab sizes for the test dataset and compare them with the
# training dataset, so a mismatch is flagged instead of relying on eyeballing.
train_vocab_sizes = (
    len(dataset.dynamic_vocab),
    len(dataset.time_feature_vocab),
    len(dataset.static_vocab),
)
test_vocab_sizes = (
    len(test_dataset.dynamic_vocab),
    len(test_dataset.time_feature_vocab),
    len(test_dataset.static_vocab),
)
print(*test_vocab_sizes)
if test_vocab_sizes != train_vocab_sizes:
    # Non-fatal: report the discrepancy but keep the notebook running.
    log.warning(
        "vocab size mismatch between train %s and test %s",
        train_vocab_sizes,
        test_vocab_sizes,
    )


2025-02-10 10:15:05,353 - INFO - cached encoder is read from /data/IDEA_DeFi_Research/Data/AML/LI_Small/preprocessed/preprocessed/transactions_user_time_train.encoder_fit.pkl


dataset.encoder_path: /data/IDEA_DeFi_Research/Data/AML/LI_Small/preprocessed/preprocessed/transactions_user_time_train.encoder_fit.pkl


2025-02-10 10:15:34,179 - INFO - read data : (2269478, 70)
2025-02-10 10:15:34,180 - INFO - /data/IDEA_DeFi_Research/Data/AML/LI_Small/preprocessed/transactions_user_time_test.csv is read.
2025-02-10 10:15:34,180 - INFO - nan resolution.
2025-02-10 10:15:35,613 - INFO - timestamp fit transform
2025-02-10 10:15:35,614 - INFO - label-fit-transform.
  0%|          | 0/6 [00:00<?, ?it/s]2025-02-10 10:15:35,615 - INFO - skipping col user
100%|██████████| 6/6 [00:01<00:00,  5.15it/s]
2025-02-10 10:15:36,780 - INFO - amount quant transform
  0%|          | 0/64 [00:00<?, ?it/s]2025-02-10 10:15:36,782 - INFO - skipping col rowNumber
2025-02-10 10:15:36,782 - INFO - skipping col timestamp
2025-02-10 10:15:36,783 - INFO - skipping col id
2025-02-10 10:15:36,785 - INFO - encoding userBank
2025-02-10 10:15:36,809 - INFO - encoding recipientBank
2025-02-10 10:15:36,844 - INFO - encoding amountReceived
  9%|▉         | 6/64 [00:00<00:01, 56.14it/s]2025-02-10 10:15:36,891 - INFO - encoding amount
202

218 18 416
