In [None]:
!pip install tqdm polars scikit-learn matplotlib

In [None]:
import torch
from data_loader import S3ParquetReader
from config import USER
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from models import AutoEncoder, train_model, CombinedLoss
from torch.optim import Adam

In [None]:
# Object key of the parquet file to load, relative to the bucket.
FILE_KEY_S3 = "preprocessed.parquet/data_clean_symb_1.parquet"

# Per-user bucket path and the reader bound to it.
BUCKET = f"/{USER}/jane_street_data"
reader = S3ParquetReader(bucket=BUCKET)

In [None]:
# Read the parquet file through the reader, then preview the first rows.
data = reader.read_parquet(FILE_KEY_S3)
data.head(n=5)

In [None]:
# Order the rows by time_id; the train/validation split below is positional.
# NOTE(review): if time_id repeats across days, sorting on it alone does not
# give a chronological order — confirm against the preprocessing step.
data = data.sort("time_id")

In [None]:
# Column to predict.
target = "responder_6"

# Every column whose name contains "feature" is used as a model input.
features = [name for name in data.columns if "feature" in name]

X = data[features]
y = data[target]

In [None]:
# Positional split: the first 80% of rows (in their current order) go to
# training, the remainder to validation. No shuffling is applied.
n = X.height
n_train = int(n * 0.8)

X_train, y_train = X.slice(0, n_train), y.slice(0, n_train)
X_val, y_val = X.slice(n_train), y.slice(n_train)

In [None]:
class TimeSeriesDataset(Dataset):
    """In-memory dataset pairing each feature row with its target.

    Both inputs are converted once, at construction time, to ``float32``
    tensors via their ``to_numpy()`` method.

    Args:
        X: Feature table exposing ``to_numpy()``; one row per sample.
        y: Targets exposing ``to_numpy()``. A 1-D target is reshaped to a
            column of shape ``(n_samples, 1)``.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        features = torch.tensor(X.to_numpy(), dtype=torch.float32)
        targets = torch.tensor(y.to_numpy(), dtype=torch.float32)
        if targets.ndim == 1:
            # Keep targets 2-D so each sample's target has shape (1,).
            targets = targets.view(-1, 1)

        # Fail early: a mismatch would otherwise surface later as an
        # IndexError during iteration, or silently ignore extra targets.
        if len(features) != len(targets):
            raise ValueError(
                f"X and y must have the same number of rows, "
                f"got {len(features)} and {len(targets)}"
            )

        self.X = features
        self.y = targets

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx], self.y[idx]

In [None]:
# Wrap each split in a tensor-backed dataset.
train_data, val_data = (
    TimeSeriesDataset(X_train, y_train),
    TimeSeriesDataset(X_val, y_val),
)

In [None]:
# Both loaders use the same batching settings and keep the rows in their
# stored order (shuffle=False).
train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=2048,
    shuffle=False,
    num_workers=8,
)
val_data_loader = DataLoader(
    dataset=val_data,
    batch_size=2048,
    shuffle=False,
    num_workers=8,
)

In [None]:
# Display how many feature columns were selected.
len(features)

In [None]:
# Sizes passed to the AutoEncoder constructor.
n_feat, n_latent = len(features), 16

# Hidden-layer widths for the encoder, decoder and head respectively.
encoder_hidden, decoder_hidden, head_hidden = [64, 32], [32], [8]

In [None]:
# Seed torch's RNG before building the model so that any random weight
# initialisation inside AutoEncoder is repeatable across
# "Restart Kernel -> Run All" executions.
torch.manual_seed(42)

model = AutoEncoder(
    n_feat=n_feat,
    n_latent=n_latent,
    encoder_hidden=encoder_hidden,
    decoder_hidden=decoder_hidden,
    head_hidden=head_hidden,
)

In [None]:
# Learning rate handed to the optimizer.
lr = 1e-2

# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA instead of failing when tensors are moved.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Weights passed to CombinedLoss.
alpha = 1.0
beta = 1.0

In [None]:
# Display the torch device that was configured for training.
device

In [None]:
# Loss built from the alpha/beta weights, and an Adam optimizer over all
# model parameters.
criterion = CombinedLoss(alpha=alpha, beta=beta)
optimizer = Adam(params=model.parameters(), lr=lr)

In [None]:
# Launch training for 10 epochs with the objects configured above.
train_model(
    # what to train and how to optimise it
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    # data
    train_loader=train_data_loader,
    val_loader=val_data_loader,
    # run settings
    n_epochs=10,
    device=device,
)