<https://github.com/PolymathicAI/xVal>


In [1]:
import hashlib
import os
import random as py_random
from datetime import datetime as dt
from typing import Union

import icecream
import jax.numpy as jnp
import nltk
import numpy as np
import pandas as pd
from flax import nnx
from icecream import ic
from nltk.corpus import words
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast, FlaxBertModel

import hephaestus as hp
import hephaestus.training.training as ht

# --- Notebook-wide setup ---------------------------------------------------
# Install icecream's `ic` helper globally (presumably so library modules can
# call `ic(...)` without importing it — confirm).
icecream.install()
ic_disable = False  # Global variable to disable ic
if ic_disable:
    ic.disable()
# Include call-site context in ic output; with contextAbsPath=True the recorded
# ic output further down shows absolute local file paths.
ic.configureOutput(includeContext=True, contextAbsPath=True)
# Opt in to pandas copy-on-write for the whole notebook.
pd.options.mode.copy_on_write = True
# NOTE(review): presumably silences the HF tokenizers parallelism warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Fetch the NLTK "words" corpus (the recorded output shows it was already up to date).
nltk.download("words")
# Global vocabulary read by `hash_to_words` when it samples words.
word_list = words.words()

[nltk_data] Downloading package words to
[nltk_data]     /Users/kailukowiak/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Load pre-trained BERT model and tokenizer
# NOTE(review): `model_name` is rebound to a TensorBoard run name in the
# training cell, so re-running this cell after training needs this line too.
model_name = "bert-base-uncased"
model = FlaxBertModel.from_pretrained(model_name)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Get the embeddings matrix
# (one row per token id; the recorded output shows 768 features per row).
embeddings = model.params["embeddings"]["word_embeddings"]["embedding"]

# Now you can access specific embeddings like this:
# For example, to get embeddings for tokens 23, 293, and 993:
selected_embeddings = jnp.take(embeddings, jnp.array([23, 293, 993]), axis=0)

# If you want to get embeddings for specific words:
# NOTE(review): each string is passed to convert_tokens_to_ids as a single
# token; presumably words outside the vocabulary map to the unknown-token id —
# confirm before relying on this for arbitrary text.
words_example = ["hello", "world", "example"]
tokens = tokenizer.convert_tokens_to_ids(words_example)
word_embeddings = jnp.take(embeddings, jnp.array(tokens), axis=0)
# Displayed as the cell output: (number of words, embedding width).
word_embeddings.shape

Some weights of FlaxBertModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: {('pooler', 'dense', 'kernel'), ('pooler', 'dense', 'bias')}
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


(3, 768)

In [3]:
# Load the raw Koinly transaction export.
df = pd.read_csv("data/Koinly/koinly_example.csv")
# Normalise column names: lower-case, spaces -> underscores, parentheses removed.
df.rename(
    lambda x: x.lower().replace(" ", "_").replace("(", "").replace(")", ""),
    axis=1,
    inplace=True,
)

# Missing values in text (object) columns become the "[missing]" placeholder.
df[df.select_dtypes(include=["object"]).columns] = df.select_dtypes(
    include=["object"]
).fillna("[missing]")

# Missing values in numeric columns become 0.0.
df[df.select_dtypes(include=["number"]).columns] = df.select_dtypes(
    include=["number"]
).fillna(0.0)
# NOTE(review): these two dropna(how="all") calls run AFTER the fills above, so
# rows/columns that were entirely NaN in object/number columns have already
# been filled and will not be dropped here — confirm whether the drops were
# meant to run first.
df = df.dropna(axis=0, how="all")
df = df.dropna(axis=1, how="all")
# The raw "date" column is discarded rather than expanded into features.
df = df.drop(columns=["date"])

df.head()

Unnamed: 0,type,tag,sending_wallet,sent_amount,sent_currency,sent_cost_basis,receiving_wallet,received_amount,received_currency,received_cost_basis,fee_amount,fee_currency,gain_usd,net_value_usd,fee_value_usd,txsrc,txdest,txhash,description
0,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.36925,NOS,0.26125,0.0,[missing],0.0,0.26125,0.0,[missing],[missing],269CiLPaFK55QqiVbsJupN6BSPUHQ3x7kN6iHPhSMV2NDw...,[missing]
1,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.001721,SOL,0.175936,0.0,[missing],0.0,0.175936,0.0,[missing],[missing],269CiLPaFK55QqiVbsJupN6BSPUHQ3x7kN6iHPhSMV2NDw...,[missing]
2,crypto_withdrawal,[missing],Solana (SOL) 10,0.001736,SOL,0.180755,[missing],0.0,[missing],0.0,0.0,[missing],-0.003286,0.17747,0.0,[missing],[missing],FaDbep4DQ7pJgHdvAoxZ7wvkfjHxLn99aZ8oT1EyaNE6L3...,[missing]
3,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.3735,NOS,0.264257,0.0,[missing],0.0,0.264257,0.0,[missing],[missing],4jfNX5Ja3CnxMLHZCevfLw4k1A2SQxrMh6r2K3mZKz5QfQ...,[missing]
4,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.001721,SOL,0.175936,0.0,[missing],0.0,0.175936,0.0,[missing],[missing],4jfNX5Ja3CnxMLHZCevfLw4k1A2SQxrMh6r2K3mZKz5QfQ...,[missing]


In [4]:
# Columns intended for cyclical expansion with `enrich_datetimes`.
# NOTE(review): the loop that consumes this list is commented out at the end of
# this cell, and "date" was already dropped from `df` when the frame was loaded.
datetime_cols = ["created_at", "updated_at", "date"]


def enrich_datetimes(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Replace a datetime column with year and cyclical sin/cos features.

    The column is parsed with ``pd.to_datetime(..., utc=True)`` and expanded
    into ``{col}_year`` plus sin/cos pairs for month (period 12), day (31),
    hour (24), minute (60) and second (60).

    Unlike the previous version, the input frame is left untouched: the
    features are computed on the side and attached to a new frame.

    Args:
        df: Frame containing ``col``.
        col: Name of the column to parse and expand.

    Returns:
        A new DataFrame without ``col`` and with the derived feature columns
        appended in the order year, month, day, hour, minute, second
        (sin before cos).
    """
    parts = pd.to_datetime(df[col], utc=True).dt

    # (feature name, component values, period used to wrap the component)
    cyclical = (
        ("month", parts.month, 12),
        ("day", parts.day, 31),
        ("hour", parts.hour, 24),
        ("minute", parts.minute, 60),
        ("second", parts.second, 60),
    )

    features = {f"{col}_year": parts.year}
    for unit, values, period in cyclical:
        angle = 2 * np.pi * values / period
        features[f"{col}_{unit}_sin"] = np.sin(angle)
        features[f"{col}_{unit}_cos"] = np.cos(angle)

    # drop() returns a new frame, so the caller's `df` is never modified.
    return df.drop(columns=[col]).assign(**features)


# for col in datetime_cols:
#     df = enrich_datetimes(df, col)

In [5]:
# Use the words corpus from nltk
# NOTE(review): `hash_to_words` below reads the global `word_list` (defined in
# the setup cell), not this `words_list`; no visible cell references this name.
words_list = words.words()


def hash_to_words(
    crypto_hash: Union[str, float, None],
    vocabulary: Union[list, None] = None,
) -> Union[str, float]:
    """Map a hash string to a deterministic, readable three-word phrase.

    The input is digested with SHA-256 and the hex digest seeds a private
    random generator that samples three words, so the same input always yields
    the same phrase and the module-level ``random`` state is left untouched.

    Args:
        crypto_hash: Hash string to convert. Missing values (``None`` or any
            NaN) are passed through as ``np.nan``.
        vocabulary: Optional sequence of words to sample from. Defaults to the
            notebook-level ``word_list``.

    Returns:
        ``"hash "`` followed by three space-separated words, or ``np.nan``
        when the input is missing.
    """
    # pd.isna catches None and every NaN object, not only the np.nan singleton.
    if pd.isna(crypto_hash):
        return np.nan
    if vocabulary is None:
        vocabulary = word_list

    digest = hashlib.sha256(crypto_hash.encode()).hexdigest()

    # A dedicated Random instance draws the same words as seeding the global
    # generator would, without reseeding `random` for the rest of the notebook.
    rng = py_random.Random(digest)
    chosen = rng.sample(vocabulary, 3)

    return "hash " + " ".join(chosen)


# Example usage
# The mapping is deterministic: the sampler is seeded from the SHA-256 digest
# of the input, so this hash always prints the same three words.
crypto_hash = "269CiLPaFK55QqiVbsJupN6BSPUHQ3x7kN6iHPhSMV2NDwHM2EhnwQ6hE6FEvbix6AVN2PLUMQyyhrKr2y514dRB"
print(hash_to_words(crypto_hash))

hash rabies placophoran isagogically


In [6]:
# Re-inspect the frame. No cell since the load has modified `df` (the datetime
# enrichment loop is commented out), so this matches the earlier preview.
df.head()

Unnamed: 0,type,tag,sending_wallet,sent_amount,sent_currency,sent_cost_basis,receiving_wallet,received_amount,received_currency,received_cost_basis,fee_amount,fee_currency,gain_usd,net_value_usd,fee_value_usd,txsrc,txdest,txhash,description
0,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.36925,NOS,0.26125,0.0,[missing],0.0,0.26125,0.0,[missing],[missing],269CiLPaFK55QqiVbsJupN6BSPUHQ3x7kN6iHPhSMV2NDw...,[missing]
1,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.001721,SOL,0.175936,0.0,[missing],0.0,0.175936,0.0,[missing],[missing],269CiLPaFK55QqiVbsJupN6BSPUHQ3x7kN6iHPhSMV2NDw...,[missing]
2,crypto_withdrawal,[missing],Solana (SOL) 10,0.001736,SOL,0.180755,[missing],0.0,[missing],0.0,0.0,[missing],-0.003286,0.17747,0.0,[missing],[missing],FaDbep4DQ7pJgHdvAoxZ7wvkfjHxLn99aZ8oT1EyaNE6L3...,[missing]
3,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.3735,NOS,0.264257,0.0,[missing],0.0,0.264257,0.0,[missing],[missing],4jfNX5Ja3CnxMLHZCevfLw4k1A2SQxrMh6r2K3mZKz5QfQ...,[missing]
4,crypto_deposit,[missing],[missing],0.0,[missing],0.0,Solana (SOL) 10,0.001721,SOL,0.175936,0.0,[missing],0.0,0.175936,0.0,[missing],[missing],4jfNX5Ja3CnxMLHZCevfLw4k1A2SQxrMh6r2K3mZKz5QfQ...,[missing]


In [7]:
# Every column whose name contains "tx" is treated as a transaction-hash column.
hash_columns = [name for name in df.columns if "tx" in name]

# Replace each raw hash with its deterministic three-word phrase.
for col in hash_columns:
    df[col] = df[col].apply(hash_to_words)

# Drop hash columns for testing
# (the column names are unchanged by the conversion above, so the same list applies)
df = df.drop(columns=hash_columns)

In [8]:
# Check the dtypes of the columns that remain after the "tx" columns were dropped.
df.dtypes

type                    object
tag                     object
sending_wallet          object
sent_amount            float64
sent_currency           object
sent_cost_basis        float64
receiving_wallet        object
received_amount        float64
received_currency       object
received_cost_basis    float64
fee_amount             float64
fee_currency            object
gain_usd               float64
net_value_usd          float64
fee_value_usd          float64
description             object
dtype: object

In [9]:
# Inspect the index; the recorded output is a plain RangeIndex, which the
# integer-division bucketing in the next cell operates on.
df.index.unique()

RangeIndex(start=0, stop=12296, step=1)

In [10]:
# Give every run of 32 consecutive rows the same sequence id in `idx`.
# NOTE(review): 32 matches the last dimension of the batch shapes printed
# later (4, 8, 32) — presumably the per-sequence length; confirm.
df["idx"] = df.index // 32
# df.index = df.index % 32
# df = df.loc[
#     :,
#     [
#         "idx",
#         # "type",
#         # "sending_wallet",
#         # "sent_amount",
#         # "sent_currency",
#         # "sent_cost_basis",
#     ],
# ]
# df["test_index"] = df.index
# df["test_index_2"] = df.index * 2
# # make new column for if test_index is even or odd
# df["even_odd"] = df["test_index"] % 2
# df["even_odd_str"] = df["even_odd"].apply(lambda x: "even" if x == 0 else "odd")
# # Make a new column for sin and cos for each index
# df["sin"] = np.sin(2 * np.pi * df["test_index"])
# df.head()
# df.head(45)

In [11]:
# Get train test split at 80/20
time_series_config = hp.TimeSeriesConfig.generate(df=df)
train_idx = int(df.idx.max() * 0.8)
train_df = df.loc[df.idx < train_idx].copy()
test_df = df.loc[df.idx >= train_idx].copy()
# del df
train_ds = hp.TimeSeriesDS(train_df, time_series_config)
test_ds = hp.TimeSeriesDS(test_df, time_series_config)
len(train_ds), len(test_ds)

(307, 78)

In [12]:
def make_batch(ds: "hp.TimeSeriesDS", start: int, length: int):
    """Stack ``length`` consecutive samples of ``ds`` into one batch.

    Each sample is fetched exactly once; element 0 of the sample is treated as
    the numeric part and element 1 as the categorical part.

    Args:
        ds: Indexable dataset whose items are ``(numeric, categorical, ...)``.
        start: Index of the first sample to include.
        length: Number of consecutive samples to include.

    Returns:
        A dict with keys ``"numeric"`` and ``"categorical"``, each a
        ``jnp`` array with the samples stacked along a new leading axis.
    """
    numeric = []
    categorical = []
    for i in range(start, start + length):
        # Index the dataset once per sample; the previous version called
        # ds[i] twice, doing the __getitem__ work twice for every item.
        sample = ds[i]
        numeric.append(sample[0])
        categorical.append(sample[1])
    return {"numeric": jnp.array(numeric), "categorical": jnp.array(categorical)}


# Build a small batch from the first four training samples to check shapes.
batch = make_batch(train_ds, 0, 4)
print(batch["numeric"].shape, batch["categorical"].shape)
# Recorded output for this run: (4, 8, 32) (4, 8, 32)
# batch

(4, 8, 32) (4, 8, 32)


In [13]:
# Scale factor applied to the base of 8 attention heads.
multiplier = 4

# Decoder under test; the RNG is seeded with 0 for parameter initialisation.
time_series_regressor = hp.TimeSeriesDecoder(
    time_series_config,
    d_model=512,
    n_heads=multiplier * 8,
    rngs=nnx.Rngs(0),
)
# nnx.display(time_series_regressor)

In [14]:
# Smoke-test forward pass on the hand-built batch.
# NOTE(review): deterministic=False presumably keeps stochastic layers such as
# dropout active — confirm that is intended for a sanity check.
res = time_series_regressor(
    numeric_inputs=batch["numeric"],
    categorical_inputs=batch["categorical"],
    deterministic=False,
)

ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:697 in __call__()
    numeric_inputs.shape: (4, 8, 32)
    categorical_inputs.shape: (4, 8, 32)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:544 in process_numeric()
    "col_token type": 'col_token type'
    numeric_col_embeddings.dtype: dtype('float32')
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:548 in process_numeric()
    numeric_embedding.shape: (512,)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:551 in process_numeric()
    numeric_embedding.shape: (4, 8, 32, 512)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:559 in process_numeric()
    numeric_embedding.shape: (4, 8, 32, 512)
ic| /Users/kailukowiak/Hephaestus/hephaestus/models/time_series_decoder.py:587 in process_categorical()
    "Issue here": 'Issue here'
    categorical_inputs.shape: (4, 8, 32)
    "args": 'args'
    categorical_in

In [15]:
# Shapes of the two output heads. In the recorded run numeric_out matches the
# numeric input shape and categorical_out has one extra trailing axis of size
# 205 (presumably the number of categorical classes — confirm).
res["numeric_out"].shape, res["categorical_out"].shape

((4, 8, 32), (4, 8, 32, 205))

In [16]:
# Test for NaN
if jnp.isnan(res["numeric_out"]).any():
    raise ValueError("NaN in numeric_out")
if jnp.isnan(res["categorical_out"]).any():
    raise ValueError("NaN in categorical_out")

In [17]:
# Silence icecream debug output before the training loop.
ic.disable()
# NOTE(review): `causal_mask` is not referenced by any other visible cell.
causal_mask = True
# time_series_regressor.train()

In [19]:
# --- Training setup ---------------------------------------------------------
metric_history = ht.create_metric_history()

learning_rate = 1e-3
momentum = 0.9
optimizer = ht.create_optimizer(time_series_regressor, learning_rate, momentum)

metrics = ht.create_metrics()
writer_name = "CryptoLoss"

# TensorBoard run directory: runs/<timestamp><writer_name>.
writer_time = dt.now().strftime("%Y-%m-%dT%H:%M:%S")
# NOTE(review): this rebinds `model_name`, which the BERT cell set to the
# checkpoint name; re-running that cell's loader lines afterwards would pick up
# this run name instead.
model_name = writer_time + writer_name
summary_writer = SummaryWriter("runs/" + model_name)


train_data_loader = DataLoader(train_ds, batch_size=16, shuffle=True)
train_step = ht.create_train_step(
    model=time_series_regressor, optimizer=optimizer, metrics=metrics
)

# --- Single pass over the training data -------------------------------------
# NOTE(review): the loop variable `batch` replaces the hand-built `batch` from
# the make_batch cell.
try:
    for step, batch in enumerate(tqdm(train_data_loader)):
        # Convert the loader's (numeric, categorical) pair into jnp arrays.
        batch = {"numeric": jnp.array(batch[0]), "categorical": jnp.array(batch[1])}
        train_step(time_series_regressor, batch, optimizer, metrics)
        for metric, value in metrics.compute().items():
            # Only shows `loss`

            metric_history[metric].append(value)
            if jnp.isnan(value).any():
                raise ValueError("Nan Values")
            summary_writer.add_scalar(f"train/{metric}", np.array(value), step)
        # Start the next step with fresh accumulators.
        metrics.reset()
finally:
    # Write buffered events to disk even when the NaN guard (or anything else)
    # aborts the loop; flush() keeps the writer usable by later cells.
    summary_writer.flush()

  0%|          | 0/20 [00:00<?, ?it/s]