In [1]:
import torch
from torcheval import metrics
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from torchtext import vocab
from torchtext.transforms import ToTensor
import os
import pandas as pd
from transformers import AutoTokenizer
from functools import partial
import tweets_classifier_torch
from typing import List, Dict

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Filesystem locations ---
DATA_DIR = "../../Datasets/disaster_tweets"  # folder with the preprocessed parquet files
CHECKPOINT_DIR = "./tweets_classifier/checkpoints"
LOGS_FOLDER = "./logs"  # root folder handed to the TensorBoard run logger

# --- Hardware ---
# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# --- Training settings ---
EPOCHS = 10
BATCH = 32
SHUFFLE_SEED = 42

# --- Input / model dimensions ---
MAX_SENTENCE_LENGTH = 165  # vector length of a tweet, also the model's sentence length
EMBEDDING_SIZE = 256       # passed to the model as its embedding dimension

In [3]:
def create_run_logger(folder, model):
    """Create a TensorBoard writer that logs into a fresh, numbered run folder.

    Runs live under ``{folder}/{ModelClassName}/{run_number}``. The new run
    number is one more than the highest integer-named entry already present
    in that folder, or 0 when the folder is missing or holds no such entry.

    Args:
        folder: Root directory holding the logs of all models.
        model: Object whose class name selects the per-model sub-folder.

    Returns:
        A ``SummaryWriter`` pointed at the new run folder.
    """
    run_folder = f"{folder}/{model.__class__.__name__}"
    try:
        entries = os.listdir(run_folder)
    except FileNotFoundError:
        # Nothing has been logged for this model yet.
        entries = []
    # Compare run numbers as integers: comparing the names as strings ranks
    # "9" above "10" and would hand out an already used run number.
    # Entries that are not plain decimal numbers are ignored.
    previous_runs = [int(entry) for entry in entries if entry.isdecimal()]
    run_n = max(previous_runs) + 1 if previous_runs else 0
    return SummaryWriter(f"{run_folder}/{run_n}")
    
def train_step(
        model: tweets_classifier_torch.model.TweetsDisasterClassifier,
        data: DataLoader,
        optimizer: torch.optim.Optimizer,
        loss_fn: torch.nn.Module,
        metrics_dict: Dict[str, metrics.Metric],
        device: str):
    """Train ``model`` for one pass over ``data`` and return the mean batch loss.

    Every batch is moved to ``device``, run through the model, scored with
    ``loss_fn`` and used for one optimizer step. Each metric in
    ``metrics_dict`` is updated with the batch predictions and targets; the
    metrics are neither computed nor reset here.

    Args:
        model: Model to optimise; moved to ``device`` and put in training mode.
        data: Iterable of ``(X, y)`` batches that supports ``len()``.
        optimizer: Optimizer holding the model's parameters.
        loss_fn: Callable taking ``(predictions, targets)`` and returning a
            scalar loss tensor.
        metrics_dict: Metrics to update; each one is moved to ``device``.
        device: Device on which the step is run.

    Returns:
        The sum of the per-batch losses divided by the number of batches, as
        a tensor detached from the autograd graph.

    Raises:
        ZeroDivisionError: If ``data`` contains no batches.
    """
    model.to(device)
    # A training step should not depend on whatever mode the model was left
    # in (e.g. after an evaluation pass elsewhere).
    model.train()

    # Moving the metrics does not depend on the batch, so do it once.
    for metric in metrics_dict.values():
        metric.to(device)

    ov_loss = 0
    for X, y in data:
        X, y = X.to(device), y.to(device)

        # Shape the predictions like the targets. A bare squeeze() would also
        # drop the batch dimension of a single-sample batch, leaving a 0-d
        # prediction that no longer matches y.
        y_pred = model(X).reshape(y.shape)

        loss = loss_fn(y_pred, y)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        # Accumulate a detached copy so the running total does not keep the
        # autograd history of every batch alive.
        ov_loss += loss.detach()

        # Metrics only need the values, not the autograd history.
        batch_pred = y_pred.detach()
        for metric in metrics_dict.values():
            metric.update(batch_pred, y)

    return ov_loss / len(data)

In [4]:
# Pretrained BERT tokenizer; the model below takes its vocabulary size from it.
pretrained_tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Training set read from the preprocessed parquet file; inputs and targets
# are both converted to float32 tensors.
train_data = tweets_classifier_torch.datasets.TweetsV2(
    file = f"{DATA_DIR}/train_preprocessed.parquet",
    transform = ToTensor(dtype = torch.float32),
    target_transform = partial(torch.tensor, dtype = torch.float32),
    concat_cols = True,
    vectorize = True,
    max_vector_length = MAX_SENTENCE_LENGTH 
)

# Timings noted while trying different loader settings
# (what exactly was timed is not recorded here):
#num_workers = None - 1m52s
#num_workers = 8 - too long 
#num_workers = 8, persistent_workers = True - 1m15s
#num_workers = 12, persistent_workers = True - 1m25s
#num_workers = 8, persistent_workers = True, prefetch_factor = 4 - 1m25s
train_dataloader = DataLoader(
    train_data,
    # Use the configured batch size instead of repeating the literal.
    batch_size = BATCH,
    shuffle = True,
    # Seed the shuffling so the batch order is the same on every fresh run.
    generator = torch.Generator().manual_seed(SHUFFLE_SEED),
    num_workers = 8,
    persistent_workers = True,
)

In [9]:
# Classifier sized from the notebook constants and the tokenizer vocabulary,
# placed on the selected device.
model = tweets_classifier_torch.model.TweetsDisasterClassifier(
    sentence_length=MAX_SENTENCE_LENGTH,
    vocab_size=pretrained_tokenizer.vocab_size,
    embed_dim=EMBEDDING_SIZE,
    ff_dim=32,
    num_attention_heads=8,
)
model = model.to(DEVICE)

# Loss function and the metrics that are accumulated during training.
loss_fn = torch.nn.BCELoss()
metrics_dict = dict(F1=metrics.BinaryF1Score(), AUC=metrics.BinaryAUROC())

optim = torch.optim.Adam(model.parameters(), lr=1e-5, betas=(0.9, 0.98), eps=1e-9)

# TensorBoard writer for this run; the class name is reused as the tag prefix.
model_name = type(model).__name__
writer = create_run_logger(LOGS_FOLDER, model)

In [None]:
# Train for EPOCHS passes over the training data, printing and logging the
# loss and every metric once per epoch.
for epoch in range(EPOCHS):
    epoch_loss = train_step(
        model = model,
        data = train_dataloader,
        optimizer = optim,
        loss_fn = loss_fn,
        metrics_dict = metrics_dict,
        device = DEVICE
    )
    print(f"Epoch: {epoch};", end = ' ')

    # Compute metrics of epoch
    for name, metric in metrics_dict.items():
        # Compute metric
        metric_value = metric.compute()
        print(f"{name}: {metric_value};", end = ' ')
        # Add metric to tensorboard
        writer.add_scalar(f"{model_name}_{name}", metric_value, epoch)
        # Start the next epoch from a clean metric state
        metric.reset()
    print(f"Loss: {epoch_loss};")
    # Add loss to tensorboard
    writer.add_scalar(f"{model_name}_loss", epoch_loss, epoch)
    # Write the buffered events to disk now, so a finished epoch is not lost
    # if the kernel is interrupted before the writer flushes on its own.
    writer.flush()
    
# torch.save(model.state_dict(), './tweets_classifier_torch/weights.pth')