<a href="https://colab.research.google.com/github/Ibraheem101/mlops/blob/main/foundations/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import re
import json
import math
import nltk
import torch
import random
import itertools
import collections
import numpy as np
import pandas as pd
import seaborn as sns
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
seed = 1234

In [3]:
def set_seeds(seed=1234):
    """Set seeds for reproducibility."""
    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU

In [4]:
set_seeds(seed = seed)

In [5]:
# Set device
cuda = True
device = torch.device("cuda" if (
    torch.cuda.is_available() and cuda) else "cpu")
torch.set_default_tensor_type("torch.FloatTensor")
if device.type == "cuda":
    torch.set_default_tensor_type("torch.cuda.FloatTensor")
print (device)

cpu


In [6]:
# Load data
url = "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/news.csv"
df = pd.read_csv(url, header=0) # load
df = df.sample(frac=1).reset_index(drop=True) # shuffle
df.head()

Unnamed: 0,title,category
0,Sharon Accepts Plan to Reduce Gaza Army Operat...,World
1,Internet Key Battleground in Wildlife Crime Fight,Sci/Tech
2,July Durable Good Orders Rise 1.7 Percent,Business
3,Growing Signs of a Slowing on Wall Street,Business
4,The New Faces of Reality TV,World


### Preprocessing

In [7]:
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

In [8]:
nltk.download("stopwords")
stop_words = stopwords.words("english")
print (stop_words[:5])
sstm = SnowballStemmer('english')

['i', 'me', 'my', 'myself', 'we']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [9]:
def preprocess(text, stopwords = stop_words):

    # lowercasing
    text = text.lower()

    # Remove stopwords
    pattern = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
    text = pattern.sub("", text)

    # Remove words in parenthesis
    text = re.sub(r"\([^)]*\)", "", text)

    # Spacing and filters
    text = re.sub(r"([-;;.,!?<=>])", r" \1 ", text)  # separate punctuation tied to words
    text = re.sub("[^A-Za-z0-9]+", " ", text)  # remove non alphanumeric chars
    text = re.sub(" +", " ", text)  # remove multiple spaces
    text = text.strip()

    return text



In [10]:
# Sample
text = "US and China take a potentially crucial step"
preprocess(text=text)


'us china take potentially crucial step'

In [11]:
# Apply to dataframe
preprocessed_df = df.copy()
preprocessed_df.title = preprocessed_df.title.apply(preprocess)

preprocessed_df.head()

Unnamed: 0,title,category
0,sharon accepts plan reduce gaza army operation...,World
1,internet key battleground wildlife crime fight,Sci/Tech
2,july durable good orders rise 1 7 percent,Business
3,growing signs slowing wall street,Business
4,new faces reality tv,World


In [12]:
from sklearn.model_selection import train_test_split

In [13]:
TRAIN_SIZE = 0.7
VAL_SIZE = 0.15
TEST_SIZE = 0.15

In [14]:
def train_val_test_split(X, y, train_size):
    """Split dataset into data splits."""
    X_train, X_, y_train, y_ = train_test_split(X, y, train_size=TRAIN_SIZE, stratify=y)
    X_val, X_test, y_val, y_test = train_test_split(X_, y_, train_size=0.5, stratify=y_)
    return X_train, X_val, X_test, y_train, y_val, y_test

In [15]:
# Data
X = preprocessed_df["title"].values
y = preprocessed_df["category"].values

In [16]:
# Create data splits
X_train, X_val, X_test, y_train, y_val, y_test = train_val_test_split(
    X=X, y=y, train_size=TRAIN_SIZE)
print (f"X_train: {X_train.shape}, y_train: {y_train.shape}")
print (f"X_val: {X_val.shape}, y_val: {y_val.shape}")
print (f"X_test: {X_test.shape}, y_test: {y_test.shape}")
print (f"Sample point: {X_train[10]} → {y_train[10]}")

X_train: (84000,), y_train: (84000,)
X_val: (18000,), y_val: (18000,)
X_test: (18000,), y_test: (18000,)
Sample point: air france klm sales rise 6 4 passenger increase → Business


### Label encoding

In [17]:
from collections import OrderedDict

In [18]:
class LabelEncoder(object):
    """
        Class to transform categorical labels into numerical values.

        Attributes:
            mapping (dict): A dictionary that maps labels to their corresponding numerical values.
            reverse_mapping (dict): A dictionary that maps numerical values back to their original labels.
            classes (list): A list of unique labels.

        Methods:
            fit(data): Fit the encoder to the given data by creating the mapping and reverse_mapping dictionaries.
            encode(data): Encode the given data by replacing labels with their corresponding numerical values.
            decode(data): Decode the given data by replacing numerical values with their original labels.
            __len__(): Return the number of unique labels in the encoder.
            __str__(): Return a string representation of the encoder.
            save(fp): Save the encoder's mapping dictionary to a JSON file.
            load(fp): Load a saved encoder from a JSON file.

    """


    def __init__(self):
        self.mapping = {}
        self.reverse_mapping = {}
        self.classes = []

    def fit(self, data):
        unique_labels = list(OrderedDict.fromkeys(data))
        for value, label in enumerate(unique_labels):
            self.mapping[label] = value
            self.reverse_mapping[value] = label
            self.classes.append(label)

    def encode(self, data):
        return [self.mapping[i] for i in data]

    def decode(self, data):
        return [self.reverse_mapping[j] for j in data]

    def __len__(self):
        return len(self.mapping)

    def __str__(self):
        return f"<LabelEncoder(num_classes={len(self)})>"

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {'class_to_index': self.class_to_index}
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)

In [19]:
# Encode
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
num_classes = len(label_encoder)
label_encoder.mapping

{'World': 0, 'Sports': 1, 'Business': 2, 'Sci/Tech': 3}

In [20]:
# Convert labels to tokens
print (f"y_train[0]: {y_train[0]}")
y_train_enc = label_encoder.encode(y_train)
y_val_enc = label_encoder.encode(y_val)
y_test_enc = label_encoder.encode(y_test)
print (f"y_train[0]: {y_train_enc[0]}")

y_train[0]: World
y_train[0]: 0


### Class weights

In [21]:
# Calculate class weights
class_weights = {}
total_samples = len(y_train_enc)
num_classes = len(np.unique(y_train_enc))
class_samples = np.bincount(y_train_enc)
for i in range(num_classes):
    class_weights[i] = total_samples / (num_classes * class_samples[i])

print(f"Class weights: {class_weights}")

Class weights: {0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0}


### Tokenizer

In [22]:
from collections import Counter
from more_itertools import take

In [23]:
class Tokenizer(object):
    def __init__(self, char_level, num_tokens=None,
                 pad_token="<PAD>", oov_token="<UNK>",
                 token_to_index=None):
        self.char_level = char_level
        self.separator = "" if self.char_level else " "
        if num_tokens: num_tokens -= 2 # pad + unk tokens
        self.num_tokens = num_tokens
        self.pad_token = pad_token
        self.oov_token = oov_token
        if not token_to_index:
            token_to_index = {pad_token: 0, oov_token: 1}
        self.token_to_index = token_to_index
        self.index_to_token = {v: k for k, v in self.token_to_index.items()}

    def __len__(self):
        return len(self.token_to_index)

    def __str__(self):
        return f"<Tokenizer(num_tokens={len(self)})>"

    def fit_on_texts(self, texts):
        if not self.char_level:
            texts = [text.split(" ") for text in texts]
        all_tokens = [token for text in texts for token in text]
        counts = Counter(all_tokens).most_common(self.num_tokens)
        self.min_token_freq = counts[-1][1]
        for token, count in counts:
            index = len(self)
            self.token_to_index[token] = index
            self.index_to_token[index] = token
        return self

    def texts_to_sequences(self, texts):
        sequences = []
        for text in texts:
            if not self.char_level:
                text = text.split(" ")
            sequence = []
            for token in text:
                sequence.append(self.token_to_index.get(
                    token, self.token_to_index[self.oov_token]))
            sequences.append(np.asarray(sequence))
        return sequences

    def sequences_to_texts(self, sequences):
        texts = []
        for sequence in sequences:
            text = []
            for index in sequence:
                text.append(self.index_to_token.get(index, self.oov_token))
            texts.append(self.separator.join([token for token in text]))
        return texts

    def save(self, fp):
        with open(fp, "w") as fp:
            contents = {
                "char_level": self.char_level,
                "oov_token": self.oov_token,
                "token_to_index": self.token_to_index
            }
            json.dump(contents, fp, indent=4, sort_keys=False)

    @classmethod
    def load(cls, fp):
        with open(fp, "r") as fp:
            kwargs = json.load(fp=fp)
        return cls(**kwargs)


In [24]:
# Tokenize
tokenizer = Tokenizer(char_level=False, num_tokens=500)
tokenizer.fit_on_texts(texts=X_train)
VOCAB_SIZE = len(tokenizer)
print (tokenizer)

<Tokenizer(num_tokens=500)>


In [25]:
# Sample of tokens
print (take(5, tokenizer.token_to_index.items()))
print (f"least freq token's freq: {tokenizer.min_token_freq}") # use this to adjust num_tokens

[('<PAD>', 0), ('<UNK>', 1), ('39', 2), ('b', 3), ('gt', 4)]
least freq token's freq: 166


In [26]:
# Convert texts to sequences of indices
X_train_tok = tokenizer.texts_to_sequences(X_train)
X_val_tok = tokenizer.texts_to_sequences(X_val)
X_test_tok = tokenizer.texts_to_sequences(X_test)
preprocessed_text = tokenizer.sequences_to_texts([X_train_tok[0]])[0]
print ("Text to indices:\n"
    f"  (preprocessed) → {preprocessed_text}\n"
    f"  (tokenized) → {X_train_tok[0]}")

Text to indices:
  (preprocessed) → china <UNK> north korea nuclear talks
  (tokenized) → [ 16   1 285 142 114  24]


### One-hot encoding

In [27]:
def one_hot_encode(features, num_classes):
    num_samples = len(features)

    # Create an array of zeros with shape (num_samples, num_classes)
    one_hot_encoded = np.zeros((num_samples, num_classes))

    # Set the corresponding element to 1 for each sample and class
    for i, value in enumerate(features):
        one_hot_encoded[i, value] = 1

    return one_hot_encoded

In [28]:
# One-hot encoding
print (X_train_tok[0])
print (len(X_train_tok[0]))
cat = one_hot_encode(features=X_train_tok[0], num_classes=len(tokenizer))
print (cat)
print (cat.shape)

[ 16   1 285 142 114  24]
6
[[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(6, 500)


In [29]:
# Convert tokens to one-hot
vocab_size = len(tokenizer)
X_train_hot = [one_hot_encode(seq, num_classes=vocab_size) for seq in X_train_tok]
X_val_hot = [one_hot_encode(seq, num_classes=vocab_size) for seq in X_val_tok]
X_test_hot = [one_hot_encode(seq, num_classes=vocab_size) for seq in X_test_tok]

### Padding
In the context of Natural Language Processing (NLP), padding refers to the process of adding special tokens or characters to sequences in order to make them of equal length. It is necessary because many machine learning models require input data of consistent dimensions.

Padding is particularly relevant when working with sequential data, such as sentences or documents, where the length of the text varies. To ensure that all sequences have the same length, shorter sequences are padded with special tokens (such as <PAD>) to match the length of the longest sequence in the dataset.

In [30]:
len(X_train_hot[0]), len(X_train_hot[1]), len(X_train_hot[9]) # Varying lengths

(6, 5, 7)

In [31]:
def pad_sequences(sequences, max_seq_len=0):
    """Pad sequences to max length in sequence."""
    max_seq_len = max(max_seq_len, max(len(sequence) for sequence in sequences))
    num_classes = sequences[0].shape[-1]
    padded_sequences = np.zeros((len(sequences), max_seq_len, num_classes))
    for i, sequence in enumerate(sequences):
        padded_sequences[i][:len(sequence)] = sequence
    return padded_sequences

<details>
  <summary>Padding with zero</summary>

  Padding with 0 is a common practice and is generally a good idea in many cases. Here are a few reasons why padding with 0 is commonly used:

Compatibility: Padding with 0 is compatible with many machine learning frameworks and libraries. Most frameworks, including PyTorch and TensorFlow, handle 0-padding efficiently and have built-in functions to handle it.

Zero-padding doesn't introduce noise: When padding with zeros, you are essentially adding neutral values that do not carry any meaning or bias. This ensures that the padding does not introduce any unwanted noise or affect the data distribution.

Easy handling of variable-length sequences: Padding with 0 allows you to convert variable-length sequences into fixed-length sequences, which is often required for batch processing and model training. It simplifies the handling of sequences by ensuring consistent dimensions.

Memory efficiency: Padding with 0 doesn't require additional memory or storage for the padding values. Zeros are already present in the memory and can be easily allocated, making it memory-efficient.

However, it's important to note that the choice of padding value may depend on the specific problem and the data being used. In some cases, other padding values such as a specific value outside the range of the data or a special token may be more appropriate. It's always recommended to consider the characteristics of your data and the requirements of your model when deciding on the padding strategy.

</details>

In [32]:
# 3D sequences
print (X_train_hot[0].shape, X_train_hot[1].shape, X_train_hot[2].shape)
padded = pad_sequences(X_train_hot[0:3])
print (padded.shape)

(6, 500) (5, 500) (6, 500)
(3, 6, 500)


### Dataset

In [33]:
filter_size = 1 # unigram

In [34]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, X, y, max_filter_size):
        self.X = X
        self.y = y
        self.max_filter_size = max_filter_size

    def __len__(self):
        return len(self.y)

    def __str__(self):
        return f"<Dataset(N={len(self)})>"

    def __getitem__(self, index):
        X = self.X[index]
        y = self.y[index]
        return [X, y]

    def collate_fn(self, batch):
        """Processing on a batch."""
        # Get inputs
        batch = np.array(batch)
        X = batch[:, 0]
        y = batch[:, 1]

        # Pad sequences
        X = pad_sequences(X, max_seq_len=self.max_filter_size)

        # Cast
        X = torch.FloatTensor(X.astype(np.int32))
        y = torch.LongTensor(y.astype(np.int32))

        return X, y

    def create_dataloader(self, batch_size, shuffle=False, drop_last=False):
        return torch.utils.data.DataLoader(
            dataset=self, batch_size=batch_size, collate_fn=self.collate_fn,
            shuffle=shuffle, drop_last=drop_last, pin_memory=True)


In [35]:
# Create datasets
train_dataset = Dataset(X=X_train_hot, y=y_train_enc, max_filter_size=filter_size)
val_dataset = Dataset(X=X_val_hot, y=y_val_enc, max_filter_size=filter_size)
test_dataset = Dataset(X=X_test_hot, y=y_test_enc, max_filter_size=filter_size)
print ("Datasets:\n"
    f"  Train dataset:{train_dataset.__str__()}\n"
    f"  Val dataset: {val_dataset.__str__()}\n"
    f"  Test dataset: {test_dataset.__str__()}\n"
    "Sample point:\n"
    f"  X: {train_dataset[0][0]}\n"
    f"  y: {train_dataset[0][1]}")

Datasets:
  Train dataset:<Dataset(N=84000)>
  Val dataset: <Dataset(N=18000)>
  Test dataset: <Dataset(N=18000)>
Sample point:
  X: [[0. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
  y: 0


In [36]:
# Mini batch gradient descent with dataloader
batch_size = 64
train_dataloader = train_dataset.create_dataloader(batch_size=batch_size)
val_dataloader = val_dataset.create_dataloader(batch_size=batch_size)
test_dataloader = test_dataset.create_dataloader(batch_size=batch_size)
batch_X, batch_y = next(iter(test_dataloader))
print ("Sample batch:\n"
    f"  X: {list(batch_X.size())}\n"
    f"  y: {list(batch_y.size())}\n"
    "Sample point:\n"
    f"  X: {batch_X[0]}\n"
    f"  y: {batch_y[0]}")


Sample batch:
  X: [64, 14, 500]
  y: [64]
Sample point:
  X: tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
  y: 3


  batch = np.array(batch)


### CNN

In [37]:
NUM_FILTERS = 50
HIDDEN_DIM = 100
DROPOUT_P = 0.1

In [38]:
class CNN(nn.Module):
    def __init__(self, vocab_size, num_filters, filter_size,
                 hidden_dim, dropout_p, num_classes):
        super(CNN, self).__init__()

        # Convolutional filters
        self.filter_size = filter_size
        self.conv = nn.Conv1d(
            in_channels=vocab_size, out_channels=num_filters,
            kernel_size=filter_size, stride=1, padding=0, padding_mode="zeros")
        self.batch_norm = nn.BatchNorm1d(num_features=num_filters)

        # FC layers
        self.fc1 = nn.Linear(num_filters, hidden_dim)
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden_dim, num_classes)

    def forward(self, inputs, channel_first=False,):

        # Rearrange input so num_channels is in dim 1 (N, C, L)
        x_in, = inputs
        if not channel_first:
            x_in = x_in.transpose(1, 2)

        # Padding for `SAME` padding
        max_seq_len = x_in.shape[2]
        padding_left = int((self.conv.stride[0]*(max_seq_len-1) - max_seq_len + self.filter_size)/2)
        padding_right = int(math.ceil((self.conv.stride[0]*(max_seq_len-1) - max_seq_len + self.filter_size)/2))

        # Conv outputs
        z = self.conv(F.pad(x_in, (padding_left, padding_right)))
        z = F.max_pool1d(z, z.size(2)).squeeze(2)

        # FC layer
        z = self.fc1(z)
        z = self.dropout(z)
        z = self.fc2(z)
        return z


This code defines a CNN (Convolutional Neural Network) model for text classification. Here's a breakdown of the key components:

Convolutional Filters: The model uses a 1D convolutional layer (nn.Conv1d) to extract local features from the input text. The convolutional layer has vocab_size input channels, num_filters output channels, and a kernel size of filter_size. It applies convolutional filters over the input text to capture different patterns.

Batch Normalization: After the convolutional layer, the model applies batch normalization (nn.BatchNorm1d) to normalize the output activations. Batch normalization helps stabilize and speed up the training process by normalizing the activations across the mini-batches.

Fully Connected Layers: The model has two fully connected (FC) layers (nn.Linear) for further processing of the extracted features. The first FC layer takes the output of the convolutional layer and transforms it to a higher-dimensional space represented by hidden_dim. The second FC layer maps the hidden representation to the num_classes output classes.

Padding: To handle different input sequence lengths and maintain spatial resolution, the input sequences are padded with appropriate padding sizes. The padding_left and padding_right values are calculated based on the convolution stride and filter size to ensure "SAME" padding, where the output feature maps have the same size as the input.

Pooling: After the convolutional layer, the model applies max pooling (F.max_pool1d) to reduce the spatial dimension of the feature maps. This operation extracts the most important features and reduces the output size.

Dropout: Dropout regularization (nn.Dropout) is applied between the two FC layers. Dropout randomly sets a fraction of the input units to zero during training, which helps prevent overfitting and improves generalization.

Forward Pass: The forward method defines the forward pass of the model. It takes input inputs and applies the defined operations sequentially. The input inputs should be a tuple containing the input tensors. The method returns the output logits z, which represent the predicted class probabilities for each input.

The line x_in = x_in.transpose(1, 2) is used to transpose the dimensions of the input tensor x_in if channel_first is set to False.

By default, PyTorch assumes that the input tensor has dimensions in the order of (batch_size, num_channels, sequence_length) (also known as channel-first format). However, in some cases, the input data may be provided in a different format, such as (batch_size, sequence_length, num_channels) (channel-last format).

In [41]:
# Initialize model
model = CNN(vocab_size=VOCAB_SIZE, num_filters=NUM_FILTERS, filter_size=filter_size,
            hidden_dim=HIDDEN_DIM, dropout_p=DROPOUT_P, num_classes=num_classes)
model = model.to(device) # set device
print (model.named_parameters)

<bound method Module.named_parameters of CNN(
  (conv): Conv1d(500, 50, kernel_size=(1,), stride=(1,))
  (batch_norm): BatchNorm1d(50, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc1): Linear(in_features=50, out_features=100, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)>


### Training

In [42]:
from torch.optim import Adam

In [44]:
lr = 1e-3
patience = 5
epochs = 10

In [45]:
class Trainer(object):
    def __init__(self, model, device, loss_fn=None, optimizer=None, scheduler=None):

        # Set params
        self.model = model
        self.device = device
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler

    def train_step(self, dataloader):
        """Train step."""
        # Set model to train mode
        self.model.train()
        loss = 0.0

        # Iterate over train batches
        for i, batch in enumerate(dataloader):

            # Step
            batch = [item.to(self.device) for item in batch]  # Set device
            inputs, targets = batch[:-1], batch[-1]
            self.optimizer.zero_grad()  # Reset gradients
            z = self.model(inputs)  # Forward pass
            J = self.loss_fn(z, targets)  # Define loss
            J.backward()  # Backward pass
            self.optimizer.step()  # Update weights

            # Cumulative Metrics
            loss += (J.detach().item() - loss) / (i + 1)

        return loss

    def eval_step(self, dataloader):
        """Validation or test step."""
        # Set model to eval mode
        self.model.eval()
        loss = 0.0
        y_trues, y_probs = [], []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Step
                batch = [item.to(self.device) for item in batch]  # Set device
                inputs, y_true = batch[:-1], batch[-1]
                z = self.model(inputs)  # Forward pass
                J = self.loss_fn(z, y_true).item()

                # Cumulative Metrics
                loss += (J - loss) / (i + 1)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)
                y_trues.extend(y_true.cpu().numpy())

        return loss, np.vstack(y_trues), np.vstack(y_probs)

    def predict_step(self, dataloader):
        """Prediction step."""
        # Set model to eval mode
        self.model.eval()
        y_probs = []

        # Iterate over val batches
        with torch.inference_mode():
            for i, batch in enumerate(dataloader):

                # Forward pass w/ inputs
                inputs, targets = batch[:-1], batch[-1]
                z = self.model(inputs)

                # Store outputs
                y_prob = F.softmax(z).cpu().numpy()
                y_probs.extend(y_prob)

        return np.vstack(y_probs)

    def train(self, num_epochs, patience, train_dataloader, val_dataloader):
        best_val_loss = np.inf
        for epoch in range(num_epochs):
            # Steps
            train_loss = self.train_step(dataloader=train_dataloader)
            val_loss, _, _ = self.eval_step(dataloader=val_dataloader)
            self.scheduler.step(val_loss)

            # Early stopping
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                best_model = self.model
                _patience = patience  # reset _patience
            else:
                _patience -= 1
            if not _patience:  # 0
                print("Stopping early!")
                break

            # Logging
            print(
                f"Epoch: {epoch+1} | "
                f"train_loss: {train_loss:.5f}, "
                f"val_loss: {val_loss:.5f}, "
                f"lr: {self.optimizer.param_groups[0]['lr']:.2E}, "
                f"_patience: {_patience}"
            )
        return best_model


In [46]:
# Define Loss
class_weights_tensor = torch.Tensor(list(class_weights.values())).to(device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights_tensor)

In [47]:
# Define optimizer & scheduler
optimizer = Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.1, patience=3)

In [48]:
# Trainer module
trainer = Trainer(
    model=model, device=device, loss_fn=loss_fn,
    optimizer=optimizer, scheduler=scheduler)

In [50]:
# Train
best_model = trainer.train(
    epochs, patience, train_dataloader, val_dataloader)

  batch = np.array(batch)
  y_prob = F.softmax(z).cpu().numpy()


Epoch: 1 | train_loss: 0.87051, val_loss: 0.78786, lr: 1.00E-03, _patience: 5
Epoch: 2 | train_loss: 0.78283, val_loss: 0.78516, lr: 1.00E-03, _patience: 5
Epoch: 3 | train_loss: 0.77612, val_loss: 0.78323, lr: 1.00E-03, _patience: 5
Epoch: 4 | train_loss: 0.77166, val_loss: 0.78147, lr: 1.00E-03, _patience: 5
Epoch: 5 | train_loss: 0.76821, val_loss: 0.78127, lr: 1.00E-03, _patience: 5
Epoch: 6 | train_loss: 0.76566, val_loss: 0.78067, lr: 1.00E-03, _patience: 5
Epoch: 7 | train_loss: 0.76350, val_loss: 0.78052, lr: 1.00E-03, _patience: 5
Epoch: 8 | train_loss: 0.76140, val_loss: 0.77999, lr: 1.00E-03, _patience: 5
Epoch: 9 | train_loss: 0.75998, val_loss: 0.78012, lr: 1.00E-03, _patience: 4
Epoch: 10 | train_loss: 0.75859, val_loss: 0.78002, lr: 1.00E-03, _patience: 3
