In [1]:
from pathlib import Path

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold
import torch
from torch import nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

from src.models import Tokenizer, Vocabulary, VectorizerFactory
from src.lightning import SentenceClassifier
from src.data.datasets import SparseDatasetFactory, DenseDatasetFactory
from src.data.datamodules import TrainValDataModule

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Paths are anchored at the parent of the current working directory.
BASE_DIR = Path().resolve().parent
CONVERTED = "data/converted"

# Build each CSV location once, then hand pandas a plain string path.
train_csv_path = str(BASE_DIR / CONVERTED / "train.csv")
test_csv_path = str(BASE_DIR / CONVERTED / "test.csv")

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [3]:
# Text-preprocessing hyperparameters, kept together so they are easy to tune.
# NOTE(review): exact semantics live in src.models — presumably a document-
# frequency ceiling, a minimum token count and a minimum token length; confirm.
MIN_TOKEN_SIZE = 4
MIN_COUNT = 5
MAX_DF = 0.8

vocabulary = Vocabulary(
    min_count=MIN_COUNT,
    max_doc_freq=MAX_DF,
)
tokenizer = Tokenizer(
    min_token_size=MIN_TOKEN_SIZE,
)

In [4]:
# Pull the raw text columns out of the frames as plain Python lists.
raw_texts_train = df_train["text"].tolist()
raw_texts_test = df_test["text"].tolist()

# Tokenize both splits with the same tokenizer.
tokenized_texts_train = tokenizer.tokenize_corpus(raw_texts_train)
tokenized_texts_test = tokenizer.tokenize_corpus(raw_texts_test)

# The vocabulary is built from the training tokens only.
vocabulary.build(tokenized_texts_train)

In [5]:
# Single switch for the feature representation: it is forwarded to the
# VectorizerFactory and also decides which dataset factory (sparse vs. dense)
# is instantiated further down.
use_sparse = True

In [6]:
# Configure the factory (TF-IDF mode, min-max scaling, sparse/dense flag taken
# from `use_sparse`) and ask it for the concrete vectorizer.
vectorizer_factory = VectorizerFactory(
    vocabulary,
    mode="tfidf",
    scale="minmax",
    use_sparse=use_sparse,
)
vectorizer = vectorizer_factory.get_vectorizer()

In [7]:
# Vectorize the tokenized splits in order (train first, then test).
train_vectors, test_vectors = [
    vectorizer.vectorize(tokenized_split)
    for tokenized_split in (tokenized_texts_train, tokenized_texts_test)
]

# Label indices as NumPy arrays, one entry per row of the matching frame.
train_targets, test_targets = [
    frame["label_index"].to_numpy() for frame in (df_train, df_test)
]

In [8]:
# Pick the dataset factory that matches the feature representation.
if use_sparse:
    dataset_factory = SparseDatasetFactory()
else:
    dataset_factory = DenseDatasetFactory()

train_dataset = dataset_factory.create_dataset(train_vectors, train_targets)
test_dataset = dataset_factory.create_dataset(test_vectors, test_targets)

# NOTE(review): the test split is passed as the second dataset of a
# train/val data module — presumably it acts as the validation set that drives
# early stopping and checkpoint selection; confirm this is intended.
loader_options = {"batch_size": 32, "num_workers": 1}
data_module = TrainValDataModule(train_dataset, test_dataset, **loader_options)

In [9]:
# Both callbacks watch the same quantity: validation loss, lower is better.
monitor_settings = {"monitor": "val_loss", "mode": "min"}

# Early stopping with a patience of 3.
early_stop_callback = EarlyStopping(
    patience=3,
    verbose=False,
    **monitor_settings,
)

# Keep only the single best checkpoint under the experiment directory.
checkpoint_callback = ModelCheckpoint(
    dirpath="../experiments/exp01",
    filename="models/{epoch:02d}-{val_loss:.2f}",
    save_top_k=1,
    **monitor_settings,
)

In [10]:
# Single-GPU trainer, capped at 10 epochs, wired to the callbacks defined above.
trainer_callbacks = [early_stop_callback, checkpoint_callback]
trainer = pl.Trainer(
    accelerator="gpu",
    devices=1,
    max_epochs=10,
    callbacks=trainer_callbacks,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [11]:
# Model dimensions: one input per vocabulary entry, one output per distinct
# training label.
UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_targets))

# A single linear layer wrapped in the Lightning classifier and trained with
# cross-entropy.
linear_layer = nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N)
loss_function = nn.CrossEntropyLoss()
lightning_model = SentenceClassifier(
    model=linear_layer,
    criterion=loss_function,
    num_classes=UNIQUE_LABELS_N,
)

In [12]:
# Train the classifier on the data module. The trainer above is configured
# with max_epochs=10 plus the early-stopping and checkpoint callbacks.
trainer.fit(lightning_model, data_module)

You are using a CUDA device ('NVIDIA A100 80GB PCIe') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name      | Type               | Params
-------------------------------------------------
0 | model     | Linear             | 432 K 
1 | criterion | CrossEntropyLoss   | 0     
2 | train_acc | MulticlassAccuracy | 0     
3 | val_acc   | MulticlassAccuracy | 0     
-------------------------------------------------
432 K     Trainable params
0         Non-trainable params
432 K     Total params
1.730     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 9: 100%|██████████| 354/354 [00:02<00:00, 132.20it/s, v_num=13, train_loss=2.950, val_loss=2.980, train_acc=0.465]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 354/354 [00:02<00:00, 131.28it/s, v_num=13, train_loss=2.950, val_loss=2.980, train_acc=0.465]


In [13]:
# Environment diagnostics: run PyTorch's collect_env module through the shell
# escape, then query NCCL from Python.
! python -m torch.utils.collect_env
# The probe tensor is moved to CUDA first, so this line needs a usable GPU.
print(torch.cuda.nccl.is_available(torch.randn(1).cuda()))
print(torch.cuda.nccl.version())

Collecting environment information...
PyTorch version: 1.13.1+cu117
Is debug build: False
CUDA used to build PyTorch: 11.7
ROCM used to build PyTorch: N/A

OS: Ubuntu 22.04.1 LTS (x86_64)
GCC version: (Ubuntu 11.3.0-1ubuntu1~22.04.1) 11.3.0
Clang version: Could not collect
CMake version: version 3.22.1
Libc version: glibc-2.35

Python version: 3.10.0 | packaged by conda-forge | (default, Nov 20 2021, 02:24:10) [GCC 9.4.0] (64-bit runtime)
Python platform: Linux-5.15.0-71-generic-x86_64-with-glibc2.35
Is CUDA available: True
CUDA runtime version: Could not collect
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration: 
GPU 0: NVIDIA A100 80GB PCIe
GPU 1: NVIDIA A100 80GB PCIe
GPU 2: NVIDIA A100 80GB PCIe

Nvidia driver version: 525.60.13
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True

Versions of relevant libraries:
[pip3] mypy==1.4.1
[pip3] mypy-extensions==1.0.0
[pip3] numpy==1.25.1
[pip3] pytorch-lightning==2.0

In [None]:
# Stratified k-fold cross-validation over the training split.
#
# For each fold a fresh model, fresh callbacks and a fresh trainer are created,
# the model is fitted on the fold's training part and the metrics logged by the
# trainer are collected. After the loop the per-fold metrics are averaged.
num_folds = 5
skf = StratifiedKFold(n_splits=num_folds)

# Model dimensions come from the FULL training data, so every fold trains a
# classifier with the same input/output sizes even if a fold misses a label.
UNIQUE_WORDS_N = len(vocabulary)
UNIQUE_LABELS_N = len(set(train_targets))

# One dict of plain-float metrics per fold.
metrics = []

for fold, (train_idx, val_idx) in enumerate(skf.split(train_vectors, train_targets)):
    print(f"Processing Fold {fold}")

    # Fold-local names only: rebinding `train_targets` here (as the previous
    # version did) made the validation slice index into the already-reduced
    # array and corrupted every later fold and any re-run of this cell.
    fold_train_features = train_vectors[train_idx]
    fold_train_targets = train_targets[train_idx]
    fold_val_features = train_vectors[val_idx]
    fold_val_targets = train_targets[val_idx]

    # Create the fold's datasets using the factory chosen earlier.
    fold_train_dataset = dataset_factory.create_dataset(
        features=fold_train_features, targets=fold_train_targets
    )
    fold_val_dataset = dataset_factory.create_dataset(
        features=fold_val_features, targets=fold_val_targets
    )

    # Same loader settings as the single-split run, so results are comparable.
    fold_data_module = TrainValDataModule(
        train_dataset=fold_train_dataset,
        val_dataset=fold_val_dataset,
        batch_size=32,
        num_workers=1,
    )

    # Fresh model per fold so no weights carry over between folds.
    fold_model = SentenceClassifier(
        model=nn.Linear(UNIQUE_WORDS_N, UNIQUE_LABELS_N),
        criterion=nn.CrossEntropyLoss(),
        num_classes=UNIQUE_LABELS_N,
    )

    # Fresh callbacks per fold: callback instances keep state (best score,
    # patience counter, best checkpoint), which must not leak across folds.
    fold_early_stop = EarlyStopping(
        monitor="val_loss", patience=3, verbose=False, mode="min"
    )
    fold_checkpoint = ModelCheckpoint(
        monitor="val_loss",
        dirpath="../experiments/exp01",
        # The fold number is baked into the file name so folds do not collide.
        filename=f"models/fold{fold}-" + "{epoch:02d}-{val_loss:.2f}",
        save_top_k=1,
        mode="min",
    )

    fold_trainer = pl.Trainer(
        max_epochs=10,
        accelerator="gpu",
        devices=1,
        callbacks=[fold_early_stop, fold_checkpoint],
    )

    # Fit the model on this fold.
    fold_trainer.fit(fold_model, fold_data_module)

    # Convert logged values to plain floats so they can be averaged with NumPy
    # whether or not the underlying tensors live on the GPU.
    fold_metrics = {
        name: float(value) for name, value in fold_trainer.logged_metrics.items()
    }
    print(f"Fold {fold} metrics: {fold_metrics}")

    metrics.append(fold_metrics)

# Average every metric reported by the first fold over the folds that have it.
average_metrics = {
    metric: float(np.mean([fold[metric] for fold in metrics if metric in fold]))
    for metric in metrics[0]
}
print(f"Average metrics: {average_metrics}")