In [300]:
!pwd
!pip install datasets
!pip install transformers

/Users/evanwyf/Downloads/linear_test


In [301]:
# set up auto load
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [302]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, RobertaConfig,AutoModelForSequenceClassification
from datasets import load_dataset
from linearmodel import *

# Build the network that the later cells move to the device, train and evaluate.
# NOTE(review): `Model` is not imported by name; presumably it is provided by the
# wildcard `from linearmodel import *` -- confirm.
model = Model()

def preprocess_dataset(batch):
    """Turn the raw columns of ``batch`` into float tensors.

    Each feature column is converted to a float tensor and given one extra
    trailing dimension of size 1 (``unsqueeze(-1)``). The ``label`` column is
    converted to a float tensor without the extra dimension. ``adj_close``,
    ``volume`` and ``label`` are emitted as ``prices``, ``volumes`` and
    ``labels``; every other column keeps its name.

    Args:
        batch: mapping from column name to that column's values.
            NOTE(review): the notebook maps this function with ``batched=True``,
            so each value is presumably a list over examples -- confirm nesting.

    Returns:
        dict from output column name to tensor, ordered ``prices``, ``volumes``,
        ``labels`` and then the remaining feature columns.
    """
    # (output key, source column) pairs, listed in the order they are read.
    column_map = (
        ("prices", "adj_close"),
        ("volumes", "volume"),
        ("open", "open"),
        ("high", "high"),
        ("low", "low"),
        ("macd", "macd"),
        ("boll_ub", "boll_ub"),
        ("boll_lb", "boll_lb"),
        ("rsi_30", "rsi_30"),
        ("cci_30", "cci_30"),
        ("dx_30", "dx_30"),
        ("close_30_sma", "close_30_sma"),
        ("close_60_sma", "close_60_sma"),
        ("arima_res", "arima_res"),
        ("residuals", "residuals"),
    )

    converted = {}
    for out_key, source_column in column_map:
        converted[out_key] = torch.tensor(batch[source_column]).unsqueeze(-1).float()

    label_tensor = torch.tensor(batch["label"]).float()

    # Re-assemble in the output order: prices, volumes, labels, then the rest.
    result = {
        "prices": converted.pop("prices"),
        "volumes": converted.pop("volumes"),
        "labels": label_tensor,
    }
    result.update(converted)
    return result


def custom_collate_fn(batch):
    """Collate a list of examples into a single dict of batched tensors.

    For every feature key, the per-example values are converted to tensors and
    combined with ``pad_sequence(batch_first=True, padding_value=0)``, so shorter
    sequences are zero-padded to the longest one in the batch. The per-example
    ``labels`` values are converted to tensors and stacked.

    Args:
        batch: list of per-example dicts holding the feature keys below plus
            ``labels``.

    Returns:
        dict with the same keys, ordered ``prices``, ``volumes``, ``labels`` and
        then the remaining feature keys.
    """
    sequence_keys = (
        "prices", "volumes", "open", "high", "low", "macd", "boll_ub", "boll_lb",
        "rsi_30", "cci_30", "dx_30", "close_30_sma", "close_60_sma", "arima_res",
        "residuals",
    )

    padded = {}
    for key in sequence_keys:
        per_example = [torch.tensor(example[key]) for example in batch]
        padded[key] = torch.nn.utils.rnn.pad_sequence(
            per_example, batch_first=True, padding_value=0
        )

    stacked_labels = torch.stack([torch.tensor(example["labels"]) for example in batch])

    # Re-assemble in the output order: prices, volumes, labels, then the rest.
    collated = {
        "prices": padded.pop("prices"),
        "volumes": padded.pop("volumes"),
        "labels": stacked_labels,
    }
    collated.update(padded)
    return collated
from tqdm import tqdm

def train(model, dataloader, criterion, optimizer, device):
    """Run one training pass over ``dataloader`` and return the mean batch loss.

    For every batch the 15 feature tensors are moved to ``device`` and
    concatenated along the last dimension, the model output and the labels are
    flattened, and one optimizer step is taken on ``criterion(outputs, labels)``.

    Args:
        model: module to train; switched to training mode at the start.
        dataloader: sized iterable of dicts holding the feature keys listed in
            ``feature_keys`` plus ``labels``.
        criterion: callable computing the loss from flattened outputs and labels.
        optimizer: optimizer updating ``model``'s parameters.
        device: device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(dataloader)``.
    """
    # Order matters: it fixes the feature layout along the last dimension.
    feature_keys = (
        "prices", "volumes", "open", "high", "low", "macd", "boll_ub", "boll_lb",
        "rsi_30", "cci_30", "dx_30", "close_30_sma", "close_60_sma", "arima_res",
        "residuals",
    )

    model.train()
    running_loss = 0.0

    progress_bar = tqdm(dataloader, desc="Training", unit="batch")
    # Count batches explicitly: tqdm only refreshes its `n` attribute at display
    # intervals, so dividing by `progress_bar.n + 1` showed a skewed average.
    for batches_seen, inputs in enumerate(progress_bar, start=1):
        input_batch = torch.cat([inputs[key].to(device) for key in feature_keys], -1)
        labels = inputs["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_batch)
        loss = criterion(torch.flatten(outputs), torch.flatten(labels))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        progress_bar.set_postfix({"Running Loss": running_loss / batches_seen})

    return running_loss / len(dataloader)

def evaluate(model, dataloader, criterion, device):
    """Compute the mean batch loss and the accuracy of ``model`` on ``dataloader``.

    Runs under ``torch.no_grad()`` with the model in eval mode. For every batch
    the 15 feature tensors are moved to ``device`` and concatenated along the
    last dimension; model outputs and labels are both flattened before the loss
    and the accuracy are computed. An output counts as a positive prediction
    when it is greater than 0.

    Args:
        model: module to evaluate; switched to eval mode at the start.
        dataloader: sized iterable of dicts holding the feature keys listed in
            ``feature_keys`` plus ``labels``.
        criterion: callable computing the loss from flattened outputs and labels.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the summed batch losses divided by
        ``len(dataloader)``, and correct predictions divided by label count.
    """
    # Order matters: it fixes the feature layout along the last dimension.
    feature_keys = (
        "prices", "volumes", "open", "high", "low", "macd", "boll_ub", "boll_lb",
        "rsi_30", "cci_30", "dx_30", "close_30_sma", "close_60_sma", "arima_res",
        "residuals",
    )

    model.eval()
    running_loss = 0.0
    total_predictions = 0
    correct_predictions = 0

    with torch.no_grad():
        for inputs in dataloader:
            input_batch = torch.cat([inputs[key].to(device) for key in feature_keys], -1)

            # Flatten once and reuse for both loss and accuracy. Comparing flat
            # predictions against unflattened (B, 1) labels would broadcast to a
            # (B, B) matrix and overcount the correct predictions.
            labels = torch.flatten(inputs["labels"].to(device))
            outputs = torch.flatten(model(input_batch))

            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # Calculate accuracy
            predictions = (outputs > 0).int()
            total_predictions += labels.numel()
            correct_predictions += (predictions == labels).sum().item()
    return running_loss / len(dataloader), correct_predictions / total_predictions





In [303]:
time_window = 5
batch_size = 4
# Fixed seed so the train/validation split and the training batch order are the
# same on every run; without it the validation metrics are not comparable.
SEED = 42

# Load the JSON records, convert the columns with preprocess_dataset, and hold
# out 10% of the examples for validation.
hf_dataset = load_dataset("json", data_files={"train": "tw5.json"}, split="train")
hf_dataset = hf_dataset.map(preprocess_dataset, batched=True)
train_val_split = hf_dataset.train_test_split(test_size=0.1, seed=SEED)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=custom_collate_fn,
    generator=torch.Generator().manual_seed(SEED),  # reproducible shuffling
)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=custom_collate_fn)



Found cached dataset json (/Users/evanwyf/.cache/huggingface/datasets/json/default-863568d442456b93/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e)
Loading cached processed dataset at /Users/evanwyf/.cache/huggingface/datasets/json/default-863568d442456b93/0.0.0/fe5dd6ea2639a6df622901539cb550cf8797e5a6b2dd7af1cf934bed8e233e6e/cache-7023d5019e50ad99.arrow


In [304]:
import copy

# Everything runs on the CPU in this notebook.
device = "cpu"
print(f"Using device: {device}")

# Loss, optimizer and epoch budget for the run.
model.to(device)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
num_epochs = 10

# Lowest validation loss seen so far, and a snapshot of the weights that gave it.
min_val_loss = float("inf")
best_model = None

for epoch in range(num_epochs):
    train_loss = train(model, train_dataloader, criterion, optimizer, device)
    val_loss, val_accuracy = evaluate(model, val_dataloader, criterion, device)

    print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {train_loss:.4f} - Val Loss: {val_loss:.4f} - Val Accuracy: {val_accuracy:.4f}")

    # Deep-copy the state dict so later optimizer steps cannot change the snapshot.
    improved = val_loss < min_val_loss
    if improved:
        min_val_loss = val_loss
        best_model = copy.deepcopy(model.state_dict())

# The best weights are kept in memory in `best_model` only; this cell does not
# write them to disk.




Using device: cpu


Training: 100%|██████████| 2168/2168 [00:05<00:00, 394.42batch/s, Running Loss=1.08e+3]


Epoch 1/10 - Train Loss: 1071.9208 - Val Loss: 4.1882 - Val Accuracy: 0.5270


Training: 100%|██████████| 2168/2168 [00:05<00:00, 395.53batch/s, Running Loss=50.2]


Epoch 2/10 - Train Loss: 49.9156 - Val Loss: 9.7470 - Val Accuracy: 0.5104


Training: 100%|██████████| 2168/2168 [00:05<00:00, 392.78batch/s, Running Loss=46.9] 


Epoch 3/10 - Train Loss: 46.2673 - Val Loss: 9.1488 - Val Accuracy: 0.5052


Training: 100%|██████████| 2168/2168 [00:06<00:00, 346.88batch/s, Running Loss=48.7]


Epoch 4/10 - Train Loss: 48.5412 - Val Loss: 23.5872 - Val Accuracy: 0.5052


Training: 100%|██████████| 2168/2168 [00:06<00:00, 356.45batch/s, Running Loss=44.6] 


Epoch 5/10 - Train Loss: 44.5828 - Val Loss: 49.1528 - Val Accuracy: 0.4865


Training: 100%|██████████| 2168/2168 [00:06<00:00, 339.65batch/s, Running Loss=47.6] 


Epoch 6/10 - Train Loss: 47.0667 - Val Loss: 13.2760 - Val Accuracy: 0.5135


Training: 100%|██████████| 2168/2168 [00:06<00:00, 353.18batch/s, Running Loss=45.5]


Epoch 7/10 - Train Loss: 45.1247 - Val Loss: 3.4320 - Val Accuracy: 0.5062


Training: 100%|██████████| 2168/2168 [00:06<00:00, 343.43batch/s, Running Loss=34.6]


Epoch 8/10 - Train Loss: 34.5718 - Val Loss: 30.4830 - Val Accuracy: 0.4886


Training: 100%|██████████| 2168/2168 [00:06<00:00, 352.68batch/s, Running Loss=35.2] 


Epoch 9/10 - Train Loss: 34.7801 - Val Loss: 30.8858 - Val Accuracy: 0.4886


Training: 100%|██████████| 2168/2168 [00:06<00:00, 351.18batch/s, Running Loss=35.4]


Epoch 10/10 - Train Loss: 35.0081 - Val Loss: 4.7262 - Val Accuracy: 0.5207
