In [10]:
# Imports

import os
import re
from argparse import ArgumentParser

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# PyTorch Lightning
import pytorch_lightning as pl
import seaborn as sns

# PyTorch
import torch
from torch import Tensor, nn, optim
import torch.nn.functional as F
import torch.utils.data as data

import torchmetrics.functional as metrics

from tqdm.notebook import tqdm
from IPython.display import set_matplotlib_formats
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger

import wandb

# Default to the CPU; switched to the first CUDA device below when one is present.
DEVICE = torch.device("cpu")

# GPU-specific setup
if torch.cuda.is_available():
    # Ensure that all operations are deterministic on GPU (if used) for reproducibility.
    # The flag must be spelled exactly `deterministic`: assigning to a misspelled
    # attribute name raises no error, it just creates an unused attribute and
    # leaves the real flag at its default.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    DEVICE = torch.device("cuda:0")

# Plotting
# The statements below mutate global matplotlib/seaborn state, so their order matters.
plt.set_cmap("cividis")
#%matplotlib inline
# NOTE(review): the recorded cell output echoes the next line as a warning --
# presumably a deprecation notice for this helper; confirm against the installed IPython.
set_matplotlib_formats("svg", "pdf")  # For export
matplotlib.rcParams["lines.linewidth"] = 2.0
# NOTE(review): sns.reset_orig() restores matplotlib rc params, so it may undo the
# linewidth override set on the previous line -- confirm the intended order.
sns.reset_orig()

# Path to the folder where the datasets are/should be downloaded (e.g. CIFAR10)
# (overridable through the PATH_DATASETS environment variable)
DATASET_PATH = os.environ.get("PATH_DATASETS", "data/")
# Path to the folder where the pretrained models are saved
# (overridable through the PATH_CHECKPOINT environment variable)
CHECKPOINT_PATH = os.environ.get("PATH_CHECKPOINT", "saved_models/")

# Setting the seed
pl.seed_everything(42)

# Report which device the earlier setup selected.
print('CUDA:', torch.cuda.is_available())
print("Device:", DEVICE)

  set_matplotlib_formats("svg", "pdf")  # For export
Global seed set to 42


CUDA: True
Device: cuda:0


In [2]:
def in_ipython():
    """Return the value of ``__IPYTHON__`` if that name is defined, else ``False``.

    Used to tell an interactive (notebook) session apart from a plain script run.
    """
    try:
        marker = __IPYTHON__
    except NameError:
        # The name does not exist in this interpreter: not an IPython session.
        return False
    else:
        return marker

In [3]:
# Load the raw table; the path is relative to the notebook's working directory.
# Later cells read the `type` (target) and `name` columns from this frame.
df = pd.read_csv('./data/data.csv')

In [4]:
# Features: every column except the target (`type`) and the raw `name` column,
# which is re-added below in one-hot encoded form.
X = df.drop(columns=['type', 'name'])
y = df['type']

X_encoder = preprocessing.LabelBinarizer()
y_encoder = preprocessing.LabelBinarizer()

name_trans = X_encoder.fit_transform(df['name'].to_numpy().reshape(-1, 1))
# Build the encoded frame on X's own index so the column-wise concat pairs rows
# exactly, and give the new columns string labels (`name_0`, `name_1`, ...).
# Mixing integer and string column labels makes recent scikit-learn
# transformers reject the frame when they validate feature names.
name_onehot = pd.DataFrame(name_trans, index=X.index).add_prefix('name_')
X = pd.concat([X, name_onehot], axis=1)

# One-hot encode the target; `y` becomes a 2-D indicator array.
y = y_encoder.fit_transform(y.to_numpy().reshape(-1, 1))

In [5]:
# Split into train+val and test
# (20% of all rows are held out as the test set; random_state fixes the shuffle)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Split train into train-val
# (10% of the remaining train+val rows become the validation set)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.1, random_state=0)

In [6]:
# Learn the min-max ranges from the training split only, then apply the same
# fitted scaler to all three splits so validation/test statistics never leak in.
scaler = preprocessing.MinMaxScaler()
scaler.fit(X_train)

# Every split ends up as a plain NumPy array (features and labels alike).
X_train, X_val, X_test = (
    np.array(scaler.transform(split)) for split in (X_train, X_val, X_test)
)
y_train, y_val, y_test = (
    np.array(labels) for labels in (y_train, y_val, y_test)
)



In [13]:
class CustomDataset(data.Dataset):
    """Pairs an indexable collection of features with an indexable collection of labels.

    Args:
        X: Features; must support ``len()`` and integer indexing.
        y: Labels; indexed with the same positions as ``X``.
    """

    def __init__(self, X, y):
        super().__init__()
        self.X, self.y = X, y

    def __len__(self):
        """Number of samples, taken from the feature collection."""
        n_samples = len(self.X)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

In [22]:
class FFNetwork(pl.LightningModule):
    """Feed-forward classifier trained with cross-entropy; loss and F1 are logged per step.

    The network is two stacked linear layers
    (``num_features -> num_features -> num_classes``). ``forward`` returns
    class probabilities, while the loss is computed from the raw logits.

    Args:
        num_features: Width of the input vectors (also the hidden-layer width).
        num_classes: Number of output classes.
        dropout: Recorded with the other hyperparameters by
            ``save_hyperparameters``; no dropout layer is built from it.

    Note:
        The dataloader hooks read the module-level arrays ``X_train``/``y_train``,
        ``X_val``/``y_val`` and ``X_test``/``y_test``; these must be defined
        before the trainer requests a dataloader.
    """

    def __init__(self, num_features, num_classes, dropout: float = 0.1):
        super().__init__()

        self.save_hyperparameters()

        # This stack emits raw logits. Softmax is deliberately kept out of it:
        # F.cross_entropy applies log-softmax internally, so feeding it
        # already-normalised probabilities would squash the gradients.
        self.sequential = nn.Sequential(
            nn.Linear(num_features, num_features),
            nn.Linear(num_features, num_classes),
        )

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        loss = self._calculate_loss(batch, mode="train")
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss/F1 for one batch (nothing is returned)."""
        _ = self._calculate_loss(batch, mode="val")

    def test_step(self, batch, batch_idx):
        """Log test loss/F1 for one batch (nothing is returned)."""
        _ = self._calculate_loss(batch, mode="test")

    def forward(self, X):
        """Return class probabilities (softmax over dim 1 of the logits)."""
        return F.softmax(self.sequential(X), dim=1)

    def _calculate_loss(self, batch, mode="train"):
        """Run one batch through the network and log ``{mode}_loss`` / ``{mode}_f1``.

        Args:
            batch: ``(X, y)`` pair as produced by the dataloader hooks; ``y``
                is the float indicator target built there.
            mode: Prefix for the logged metric names.

        Returns:
            The cross-entropy loss tensor for the batch.
        """
        X, y = batch
        logits = self.sequential(X)

        # Cross-entropy must see unnormalised scores, not probabilities.
        loss = F.cross_entropy(logits, y)

        # The metric receives the same probabilities that forward() returns.
        preds = F.softmax(logits, dim=1)

        # Logging to WANDB
        self.log(f"{mode}_loss", loss)
        # NOTE(review): the target is passed as an integer indicator matrix;
        # depending on the torchmetrics version this may be scored as a
        # multilabel problem -- confirm this is the intended F1 variant.
        self.log(f'{mode}_f1', metrics.f1_score(preds, y.long()))
        return loss

    def configure_optimizers(self):
        """Use Adam over all parameters with a fixed learning rate of 0.001."""
        return optim.Adam(self.parameters(), lr=0.001)

    @staticmethod
    def _build_loader(features, labels, batch_size, shuffle=False):
        """Wrap two NumPy arrays as float tensors in a ``DataLoader``."""
        dataset = CustomDataset(
            torch.from_numpy(features).float(),
            torch.from_numpy(labels).float(),
        )
        return data.DataLoader(
            dataset,
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=8,
        )

    def train_dataloader(self):
        """Shuffled mini-batches of 32 from the module-level training arrays."""
        return self._build_loader(X_train, y_train, batch_size=32, shuffle=True)

    def val_dataloader(self):
        """One sample per batch from the module-level validation arrays."""
        return self._build_loader(X_val, y_val, batch_size=1)

    def test_dataloader(self):
        """One sample per batch from the module-level test arrays."""
        return self._build_loader(X_test, y_test, batch_size=1)

In [23]:
# train.py
def main(hparams):
    """Train an ``FFNetwork`` for one epoch and log the run to Weights & Biases.

    Reads the module-level ``X`` and ``y`` to size the network; the training
    data itself is supplied by the model's own dataloader hooks.

    Args:
        hparams: Parsed command-line arguments, or ``None`` when called from a
            notebook. Currently unused.
    """
    # Finish any W&B run still open (e.g. from a previous execution of this
    # cell) before the logger starts a new one.
    wandb.finish()
    wandb_logger = WandbLogger(project="bachelor")

    print('Loading data..')
    print(f'X shape: {X.shape[1]}')
    print(f'y shape: {y.shape[1]}')

    model = FFNetwork(
        num_features=X.shape[1],
        num_classes=y.shape[1],
        dropout=0.1
    )

    # Follow the same CUDA check used for DEVICE instead of hard-coding the
    # GPU, so the trainer can still be built on a CPU-only machine.
    # 16-bit mixed precision is only requested when a GPU is actually used.
    use_gpu = torch.cuda.is_available()

    # train the model
    trainer = pl.Trainer(
        devices=1,
        accelerator="gpu" if use_gpu else "cpu",
        precision=16 if use_gpu else 32,
        max_epochs=1,
        min_epochs=1,
        logger=wandb_logger
    )

    trainer.fit(model=model)


if __name__ == "__main__":
    if in_ipython():
        # Interactive session: there is no command line to parse.
        main(None)
    else:
        # Script run: resolve the script's own directory and parse the CLI.
        root_dir = os.path.dirname(os.path.realpath(__file__))
        parser = ArgumentParser(add_help=False)
        hyperparams = parser.parse_args()

        # TRAIN
        main(hyperparams)

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_f1,▁▅▅▄▄▄█▃▅▄▄▄▅▆▄▅▄▂▂▃▇▅▅▄▂▃█▃▇▂▅▄▁▂▁▅▃▇▂▅
train_loss,▇▆▃▆▆▅▃▄▄▅▅▇▅▅▄▃▆█▆▄▁▁▃▇█▄▄▅▄▄▅▄█▅▆▄▇▄▃▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_f1,▁
val_loss,▁

0,1
epoch,0.0
train_f1,0.2
train_loss,2.75963
trainer/global_step,56662.0
val_f1,0.24279
val_loss,2.48702


Loading data..
X shape: 311
y shape: 82


Using 16bit native Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name       | Type       | Params
------------------------------------------
0 | sequential | Sequential | 122 K 
------------------------------------------
122 K     Trainable params
0         Non-trainable params
122 K     Total params
0.245     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
