In [144]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.nn import functional as F
from torch.autograd import Variable
from torch import Tensor, optim, nn
import wandb
from tqdm import tqdm

wandb.login()

True

In [145]:
config = {
    "learning_rate": 0.005,
    "architecture": "GRU",
    "dataset": "timeline_1.0",
    "epochs": 50,
    "classes": 2,
    "batch_size": 32,
    "num_layers": 2,
    "hidden_size": 64,
    "dropout_prob": 0,
    "input_size": 381,
    "output_size": 2,
    "optimizer": "Adam",
    "loss": "CrossEntropyLoss",
    "activation": "ReLU",
    "initializer": "Xavier",
    "regularization": "L2",
    "regularization_lambda": 0.01,
    "gru_layers": 1,
    "sequence_length": 16,

}

In [146]:
def model_pipeline(hyperparameters):
    with wandb.init(project="leaguify", config=hyperparameters):
        # access all HPs through wandb.config, so logging matches execution!
        config = wandb.config

        # make the model, data, and optimization problem
        model, train_loader, val_loader, test_loader, criterion, optimizer = make(config)
        print(model)

        # and use them to train the model
        train(model, train_loader, criterion, optimizer, config)

        # and test its final performance
        test(model, val_loader)
        test(model, test_loader)
    return model

In [147]:
class TimelineDataset(Dataset):
    def __init__(self, data_dir, sequence_length, transform=None, target_transform=None):
        self.data = torch.tensor(np.load(data_dir)[:, :-1], dtype=torch.float32, device=device)
        self.labels = torch.tensor(np.load(data_dir)[:, -1], dtype=torch.int64, device=device)
        self.sequence_length = sequence_length
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.data) - self.sequence_length

    def __getitem__(self, idx):
        sample = self.data[idx:idx + self.sequence_length, :]
        label = self.labels[idx]
        if self.transform:
            sample = self.transform(sample)
        if self.target_transform:
            label = self.target_transform(label)
        return sample, label

In [148]:
def make(config):
    train, val = get_train_data(sequence_length=config.sequence_length, slice=2)
    test = get_test_data(sequence_length=config.sequence_length)
    train_loader = make_loader(train, batch_size=config.batch_size)
    val_loader = make_loader(val, batch_size=config.batch_size)
    test_loader = make_loader(test, batch_size=config.batch_size)

    model = GRU(config.input_size, config.hidden_size, config.classes, config.num_layers, config
                .gru_layers, drop_prob=config.dropout_prob).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config.learning_rate)

    return model, train_loader, val_loader, test_loader, criterion, optimizer

In [149]:
def get_train_data(sequence_length=16, val_split=0.8, slice=1) -> (torch.utils.data.Dataset, torch.utils.data.Dataset):
    dataset = TimelineDataset('../data/processed/train_timeline.npy', sequence_length)
    print(f'len(dataset): {len(dataset)}')
    train_len = int(len(dataset) * val_split // sequence_length * sequence_length)
    print(f'train_len: {train_len}')
    assert train_len % sequence_length == 0
    val_len = len(dataset) - train_len
    assert val_len % sequence_length == 0
    print(f'train_len: {train_len}, val_len: {val_len}')
    train_data, val_data = torch.utils.data.random_split(dataset, [train_len, val_len])
    train_slice = torch.utils.data.Subset(
        train_data, indices=range(0, len(train_data), slice))
    check_correct_split(dataset, train_data, val_data, sequence_length=sequence_length)
    return train_slice, val_data

In [150]:
def check_correct_split(full_dataset, train_data, val_data, sequence_length=16):
    """
    Check that the train/val split is correct and no data is leaked between the two
    appends both datasets together, removes duplicates and checks that the original dataset is the same as the combined
    :param full_dataset: 
    :param train_data: 
    :param val_data: 
    :param sequence_length: 
    :return: 
    """
    len_combined = len(full_dataset)
    full_no_dup = torch.unique(full_dataset, dim=0)
    len_no_dup = len(full_no_dup)
    original = TimelineDataset('../data/processed/train_timeline.npy', sequence_length)
    len_original = len(original)
    print(f'len_combined: {len_combined}, len_no_dup: {len_no_dup}, len_original: {len_original}')
    assert len_combined == len_no_dup
    assert len_original == len_combined



In [151]:
def get_test_data(sequence_length=16):
    full_dataset = TimelineDataset('../data/processed/test_timeline.npy', sequence_length)
    return full_dataset

In [152]:
def make_loader(dataset, batch_size=64):
    return DataLoader(dataset, batch_size=batch_size, num_workers=0, drop_last=False)

In [153]:
device = (
    "cuda" if torch.cuda.is_available()
    else "cpu"
)
if torch.cuda.is_available():
    print(f'PyTorch version: {torch.__version__}')
    print('*' * 10)
    print(f'_CUDA version: ')
    !nvcc --version
    print('*' * 10)
    print(f'CUDNN version: {torch.backends.cudnn.version()}')
    print(f'Available GPU devices: {torch.cuda.device_count()}')
    print(f'Device Name: {torch.cuda.get_device_name()}')
print(f"Using {device} device")

PyTorch version: 2.1.0+cu121
**********
_CUDA version: 
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:09:35_Pacific_Daylight_Time_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
**********
CUDNN version: 8801
Available GPU devices: 1
Device Name: NVIDIA GeForce RTX 2080
Using cuda device


In [154]:
class GRU(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, n_layers, gru_layers, drop_prob=0.2):
        super(GRU, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.gru = nn.GRU(input_dim, hidden_dim, gru_layers, batch_first=True, dropout=drop_prob)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()

    def forward(self, x, h=None):
        out, h = self.gru(x, h)
        out = self.fc(self.relu(out[:, -1]))
        return out, h

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = weight.new(1, batch_size, self.hidden_dim).zero_().to(device)
        return hidden

In [155]:
def train(model, loader, criterion, optimizer, config):
    wandb.watch(model, criterion, log='all', log_freq=10)

    total_batches = len(loader) * config.epochs
    example_count = 0
    batch_count = 0
    loss_vals = []
    for epoch in tqdm(range(config.epochs)):
        h = model.init_hidden(config.batch_size)
        for _, (matches, labels) in enumerate(loader):
            output, h = model(matches)  # hidden state is not passed to re-init at each batch
            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            example_count += len(matches)
            batch_count += 1
            if (batch_count + 1) % 25 == 0:
                train_log(loss, example_count, epoch)

In [156]:
def train_log(loss, example_count, epoch):
    wandb.log({"epoch": epoch, "loss": loss}, step=example_count)
    print(f"Loss after {str(example_count).zfill(5)} examples: {loss:.3f}")

In [157]:
train_data, val_data = get_train_data(sequence_length=16)
print(f'train_data: {len(train_data)}')
print(f'val_data: {len(val_data)}')
for matches, labels in make_loader(val_data, batch_size=1):
    if matches.shape[1] != 16:
        print(f'matches: {matches.shape}, labels: {labels.shape}')

len(dataset): 12784
train_len: 10224
train_len: 10224, val_len: 2560


AttributeError: 'ConcatDataset' object has no attribute 'tensor'

In [None]:
def test(model, test_loader):
    # Run the model on some test examples
    with torch.no_grad():
        correct, total = 0, 0
        for matches, labels in test_loader:
            print(f'matches: {matches.shape}, labels: {labels.shape}')
            matches, labels = matches.to(device), labels.to(device)
            model.eval()
            output, h = model(matches)
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        print(f"Accuracy of the model on the {total} " +
              f"test matches: {correct / total:%}")

        wandb.log({"test_accuracy": correct / total})

In [None]:
model = model_pipeline(config)