In [1]:
import torch

class LoRALayer(torch.nn.Module):
    """Low-rank adapter computing ``alpha * (x @ A @ B)``.

    ``A`` has shape ``(in_dim, rank)`` and is initialised from a standard
    normal distribution scaled by ``1 / sqrt(rank)``. ``B`` has shape
    ``(rank, out_dim)`` and starts at zero, so ``A @ B`` is zero and a freshly
    built layer outputs zeros until training updates ``B``.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        rank: Inner dimension shared by ``A`` and ``B``.
        alpha: Constant multiplier applied to the low-rank product. A higher
            value means larger adjustments; a lower value, subtler ones.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        # Magnitude of the change the adapter contributes to the output.
        self.alpha = alpha
        # Standard deviation for A's random initialisation: 1 / sqrt(rank).
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        self.A = torch.nn.Parameter(init_scale * torch.randn(in_dim, rank))
        # B = 0 keeps A @ B = 0 before the first backpropagation update.
        self.B = torch.nn.Parameter(torch.zeros(rank, out_dim))

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        low_rank_out = x @ self.A @ self.B
        return self.alpha * low_rank_out

#replace each Linear layer with a LinearWithLoRA layer that combines the Linear layer with Lora
class LinearWithLoRA(torch.nn.Module):
    """Pair an existing linear layer with a ``LoRALayer`` of matching size.

    Args:
        linear: Layer to wrap; its ``in_features`` and ``out_features`` size
            the adapter.
        rank: Rank passed to the ``LoRALayer``.
        alpha: Scaling factor passed to the ``LoRALayer``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        in_dim, out_dim = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        """Return the wrapped layer's output plus the adapter's output."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out


"""In practice, to equip and finetune a model with LoRA,
 all we have to do is replace its pretrained Linear layers 
with our new LinearWithLoRA layer."""




'In practice, to equip and finetune a model with LoRA,\n all we have to do is replace its pretrained Linear layers \nwith our new LinearWithLoRA layer.'

## Finetuning with LoRA 


#### Train a small BERT model for text classification
We will use a pretrained DistilBERT (a smaller version of BERT) model from the transformers library:

In [2]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


We only want to train the new LoRA weights, so we freeze all model parameters by setting `requires_grad` to `False` for every parameter:

In [3]:
# Freeze every existing parameter; the trailing ";" keeps the returned
# module from being echoed as cell output.
model.requires_grad_(False);

In [4]:
#
print(f"Total number of parameters: {sum(p.numel() for p in model.parameters()):,}")


Total number of parameters: 66,955,010


In [5]:
for name, param in model.named_parameters():
    print(name, param.shape)

distilbert.embeddings.word_embeddings.weight torch.Size([30522, 768])
distilbert.embeddings.position_embeddings.weight torch.Size([512, 768])
distilbert.embeddings.LayerNorm.weight torch.Size([768])
distilbert.embeddings.LayerNorm.bias torch.Size([768])
distilbert.transformer.layer.0.attention.q_lin.weight torch.Size([768, 768])
distilbert.transformer.layer.0.attention.q_lin.bias torch.Size([768])
distilbert.transformer.layer.0.attention.k_lin.weight torch.Size([768, 768])
distilbert.transformer.layer.0.attention.k_lin.bias torch.Size([768])
distilbert.transformer.layer.0.attention.v_lin.weight torch.Size([768, 768])
distilbert.transformer.layer.0.attention.v_lin.bias torch.Size([768])
distilbert.transformer.layer.0.attention.out_lin.weight torch.Size([768, 768])
distilbert.transformer.layer.0.attention.out_lin.bias torch.Size([768])
distilbert.transformer.layer.0.sa_layer_norm.weight torch.Size([768])
distilbert.transformer.layer.0.sa_layer_norm.bias torch.Size([768])
distilbert.trans

In [6]:
# Element count of each parameter tensor (a list, despite the name);
# the cell displays their sum, i.e. the total parameter count.
param_dict = [weights.numel() for weights in model.parameters()]
sum(param_dict)

66955010

In [7]:
# numel() returns the total number of elements in a tensor.
# Record each parameter's size together with its requires_grad flag once,
# then derive the totals from that single pass.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_sizes)
trainable_params = sum(size for size, needs_grad in param_sizes if needs_grad)
non_trainable_params = total_params - trainable_params

print(f"Total Parameters: {total_params}")
# Only parameters with requires_grad=True are counted as trainable.
print(f"Trainable Parameters: {trainable_params}")
print(f"Non-trainable Parameters: {non_trainable_params}")

Total Parameters: 66955010
Trainable Parameters: 0
Non-trainable Parameters: 66955010


In [8]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [9]:
# Repeat the freeze right before the LoRA layers are attached; the trailing
# ";" keeps the returned module from being echoed as cell output.
model.requires_grad_(False);

In [10]:
from functools import partial

# Default hyperparameter choices
lora_r = 8
lora_alpha = 16
# Dropout is a regularization technique that randomly sets
# some of the activations to zero during training,
# which can help prevent overfitting. The default value is 0.05.
# NOTE(review): lora_dropout is only recorded here; nothing in this cell applies it.
lora_dropout = 0.05

lora_query = True  # whether the query matrix is adapted
lora_key = False  # whether the key matrix is adapted
lora_value = True  # whether the value matrix is adapted
lora_projection = False  # whether the attention output projection is adapted
lora_mlp = False  # whether the feed-forward (MLP) layers are adapted
lora_head = False  # whether the final classification head is adapted

layers = []  # kept as in the original cell; not referenced in this cell

assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)


def _assign_and_report(parent, attr_name, label):
    """Wrap ``parent.<attr_name>`` with LoRA and print the resulting module.

    A module that is already a ``LinearWithLoRA`` is left as it is, so
    re-running this cell neither fails (a wrapped layer has no
    ``in_features``) nor stacks a second adapter on top of the first.

    Args:
        parent: Module that owns the layer to replace.
        attr_name: Attribute name of the layer on ``parent``.
        label: Prefix used in the printed report line.
    """
    print("-----"*25)
    current = getattr(parent, attr_name)
    if not isinstance(current, LinearWithLoRA):
        setattr(parent, attr_name, assign_lora(current))
    print(f"{label}: {getattr(parent, attr_name)} assigned with LoRA")


for i, layer in enumerate(model.distilbert.transformer.layer):
    label = f"layer{i+1}"
    # (enabled flag, owning module, attribute name), in wrapping order.
    targets = [
        (lora_query, layer.attention, "q_lin"),
        (lora_key, layer.attention, "k_lin"),
        (lora_value, layer.attention, "v_lin"),
        (lora_projection, layer.attention, "out_lin"),
        (lora_mlp, layer.ffn, "lin1"),
        (lora_mlp, layer.ffn, "lin2"),
    ]
    for enabled, parent, attr_name in targets:
        if enabled:
            _assign_and_report(parent, attr_name, label)

# The classification head sits outside the transformer blocks, so it is
# handled once after the loop.
if lora_head:
    _assign_and_report(model, "pre_classifier", "head")
    _assign_and_report(model, "classifier", "head")



-----------------------------------------------------------------------------------------------------------------------------
layer1: LinearWithLoRA(
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (lora): LoRALayer()
) assigned with LoRA
-----------------------------------------------------------------------------------------------------------------------------
layer1: LinearWithLoRA(
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (lora): LoRALayer()
) assigned with LoRA
-----------------------------------------------------------------------------------------------------------------------------
layer2: LinearWithLoRA(
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (lora): LoRALayer()
) assigned with LoRA
-----------------------------------------------------------------------------------------------------------------------------
layer2: LinearWithLoRA(
  (linear): Linear(in_features=768, out_features=768, bias=True)
  (lora): L

### inspect the model again to check its updated structure using print(model):

In [11]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bias

# Train the model with the default hyperparameter choices above

#### DATASET - IMDb Movie Reviews classification dataset: https://ai.stanford.edu/~amaas/data/sentiment/

In [12]:
pip install transformers datasets lightning watermark


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [13]:
%load_ext watermark
%watermark --conda -p torch,transformers,datasets,lightning

torch       : 2.1.1+cu121
transformers: 4.36.2
datasets    : 2.16.1
lightning   : 2.1.2

conda environment: n/a



## 1 Loading the dataset into DataFrames

In [14]:
import os
from datasets import load_dataset

import lightning as L
from lightning.pytorch.loggers import CSVLogger
from lightning.pytorch.callbacks import ModelCheckpoint

import pandas as pd
import torch

from local_dataset_utilities import download_dataset, load_dataset_into_to_dataframe, partition_dataset
from local_dataset_utilities import IMDBDataset

In [15]:
if not torch.cuda.is_available():
    print("Please switch to a GPU machine before running this notebook.")


In [17]:
# The three split files that partition_dataset() writes into data/.
# ("val.csv" was previously misspelled "val.csb", so the check below could
# never succeed and the dataset was re-downloaded on every run.)
files = {"test.csv", "train.csv", "val.csv"}

# Expected split files that are not on disk yet.
missing_files = sorted(
    f for f in files if not os.path.exists(os.path.join("data", f))
)

# Same meaning as in the original cell: True when every file is already
# present, False when a download is required.
download = not missing_files

if missing_files:
    download_dataset()
    df = load_dataset_into_to_dataframe()
    partition_dataset(df) # returns "data/val.csv & data/train.csv & data/test.csv"



100%|██████████| 50000/50000 [00:51<00:00, 967.82it/s] 


Class distribution:


In [18]:
# Read the train / validation / test CSV files from data/ into DataFrames.
df_train, df_val, df_test = (
    pd.read_csv(os.path.join("data", csv_name))
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

In [19]:
df_train

Unnamed: 0,index,text,label
0,0,"When we started watching this series on cable,...",1
1,0,Steve Biko was a black activist who tried to r...,1
2,0,My short comment for this flick is go pick it ...,1
3,0,"As a serious horror fan, I get that certain ma...",0
4,0,"Robert Cummings, Laraine Day and Jean Muir sta...",1
...,...,...,...
34995,0,Frank Capra's creativity must have been just a...,0
34996,0,Just saw the film tonight in a preview and it'...,0
34997,0,"If you love Japanese monster movies, you'll lo...",1
34998,0,Because it came from HBO and based on the IMDb...,0


#  2 Tokenization and Numericalization

#### Load the dataset with load_dataset 


In [26]:
# Map each split name to its CSV file under data/.
# NOTE: the "validatiaon" spelling is kept on purpose; the DataLoader cell
# later looks the split up under this exact key.
split_files = {
    split_name: os.path.join("data", csv_name)
    for split_name, csv_name in [
        ("train", "train.csv"),
        ("validatiaon", "val.csv"),
        ("test", "test.csv"),
    ]
}
imdb_dataset = load_dataset("csv", data_files=split_files)

print(imdb_dataset)

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 35000
    })
    validatiaon: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label'],
        num_rows: 10000
    })
})


### Tokenize

In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
print("Tokenizer input max length:", tokenizer.model_max_length)
print("Tokenizer vocabulary size:", tokenizer.vocab_size)

Tokenizer input max length: 512
Tokenizer vocabulary size: 30522


In [28]:
def tokenize_text(batch):
    """Tokenize ``batch["text"]`` with the notebook-level ``tokenizer``.

    Truncation and padding are both switched on. The tokenizer's return
    value is passed back unchanged.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded
    

In [30]:
imdb_tokenized = imdb_dataset.map(tokenize_text, batched = True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Delete dataset 


In [31]:
del imdb_dataset

In [32]:
imdb_tokenized

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validatiaon: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [33]:
imdb_tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [63]:
imdb_tokenized

DatasetDict({
    train: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 35000
    })
    validatiaon: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    test: Dataset({
        features: ['index', 'text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 10000
    })
})

In [34]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# 3 Set Up DataLoaders

In [61]:
from torch.utils.data import DataLoader, Dataset


class IMDBDataset(Dataset):
    """Expose one partition of a split-keyed dataset mapping as a ``Dataset``.

    Args:
        dataset_dict: Mapping from partition name to a partition object that
            supports indexing and has a ``num_rows`` attribute.
        partition_key: Name of the partition to expose (default ``"train"``).
    """

    def __init__(self, dataset_dict, partition_key="train"):
        selected = dataset_dict[partition_key]
        self.partition = selected

    def __len__(self):
        """Return the partition's ``num_rows``."""
        n_rows = self.partition.num_rows
        return n_rows

    def __getitem__(self, index):
        """Return the partition's entry at ``index``."""
        item = self.partition[index]
        return item

In [64]:
# One Dataset wrapper per split. The "validatiaon" key matches the split name
# used when the CSV files were loaded.
train_dataset = IMDBDataset(imdb_tokenized, partition_key="train")
val_dataset = IMDBDataset(imdb_tokenized, partition_key="validatiaon")
test_dataset = IMDBDataset(imdb_tokenized, partition_key="test")

# Settings shared by all three loaders.
loader_settings = dict(batch_size=12, num_workers=4)

# Only the training loader has shuffling enabled.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, **loader_settings)
val_loader = DataLoader(dataset=val_dataset, **loader_settings)
test_loader = DataLoader(dataset=test_dataset, **loader_settings)

## Initializing DistilBERT

In [65]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


#### Freeze all layers

In [66]:
# Freeze every parameter of the freshly reloaded model; the trailing ";"
# keeps the returned module from being echoed as cell output.
model.requires_grad_(False);

### ADD LoRA layers

In [67]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [68]:
class LoRALayer(torch.nn.Module):
    """Low-rank adapter computing ``alpha * (x @ W_a @ W_b)``.

    ``W_a`` has shape ``(in_dim, rank)`` and is initialised from a standard
    normal distribution scaled by ``1 / sqrt(rank)``. ``W_b`` has shape
    ``(rank, out_dim)`` and starts at zero, so a freshly built layer outputs
    zeros.

    Args:
        in_dim: Size of the last dimension of the input.
        out_dim: Size of the last dimension of the output.
        rank: Inner dimension shared by ``W_a`` and ``W_b``.
        alpha: Constant multiplier applied to the low-rank product.
    """

    def __init__(self, in_dim, out_dim, rank, alpha):
        super().__init__()
        self.alpha = alpha
        # Standard deviation for W_a's random initialisation: 1 / sqrt(rank).
        init_scale = 1 / torch.sqrt(torch.tensor(rank).float())
        self.W_a = torch.nn.Parameter(init_scale * torch.randn(in_dim, rank))
        self.W_b = torch.nn.Parameter(torch.zeros(rank, out_dim))

    def forward(self, x):
        """Return the scaled low-rank projection of ``x``."""
        low_rank_out = x @ self.W_a @ self.W_b
        return self.alpha * low_rank_out


class LinearWithLoRA(torch.nn.Module):
    """Pair an existing linear layer with a ``LoRALayer`` of matching size.

    Args:
        linear: Layer to wrap; its ``in_features`` and ``out_features`` size
            the adapter.
        rank: Rank passed to the ``LoRALayer``.
        alpha: Scaling factor passed to the ``LoRALayer``.
    """

    def __init__(self, linear, rank, alpha):
        super().__init__()
        self.linear = linear
        in_dim, out_dim = linear.in_features, linear.out_features
        self.lora = LoRALayer(in_dim, out_dim, rank, alpha)

    def forward(self, x):
        """Return the wrapped layer's output plus the adapter's output."""
        base_out = self.linear(x)
        adapter_out = self.lora(x)
        return base_out + adapter_out

In [69]:

from functools import partial

# LoRA hyperparameters
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05

# Switches selecting which linear layers receive an adapter
lora_query = True
lora_key = False
lora_value = True
lora_projection = False
lora_mlp = False
lora_head = False

layers = []

assign_lora = partial(LinearWithLoRA, rank=lora_r, alpha=lora_alpha)

for layer in model.distilbert.transformer.layer:
    attention, ffn = layer.attention, layer.ffn
    # (enabled flag, owning module, attribute name), in wrapping order.
    wrap_plan = (
        (lora_query, attention, "q_lin"),
        (lora_key, attention, "k_lin"),
        (lora_value, attention, "v_lin"),
        (lora_projection, attention, "out_lin"),
        (lora_mlp, ffn, "lin1"),
        (lora_mlp, ffn, "lin2"),
    )
    for enabled, owner, attr_name in wrap_plan:
        if enabled:
            setattr(owner, attr_name, assign_lora(getattr(owner, attr_name)))

# The classification head sits outside the transformer blocks.
if lora_head:
    for head_attr in ("pre_classifier", "classifier"):
        setattr(model, head_attr, assign_lora(getattr(model, head_attr)))

In [70]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): LinearWithLoRA(
              (linear): Linear(in_features=768, out_features=768, bias=True)
              (lora): LoRALayer()
            )
            (out_lin): Linear(in_features=768, out_features=768, bias

In [71]:
for name, param in model.named_parameters():
    print(f"{name} : {param.requires_grad}")

distilbert.embeddings.word_embeddings.weight : False
distilbert.embeddings.position_embeddings.weight : False
distilbert.embeddings.LayerNorm.weight : False
distilbert.embeddings.LayerNorm.bias : False
distilbert.transformer.layer.0.attention.q_lin.linear.weight : False
distilbert.transformer.layer.0.attention.q_lin.linear.bias : False
distilbert.transformer.layer.0.attention.q_lin.lora.W_a : True
distilbert.transformer.layer.0.attention.q_lin.lora.W_b : True
distilbert.transformer.layer.0.attention.k_lin.weight : False
distilbert.transformer.layer.0.attention.k_lin.bias : False
distilbert.transformer.layer.0.attention.v_lin.linear.weight : False
distilbert.transformer.layer.0.attention.v_lin.linear.bias : False
distilbert.transformer.layer.0.attention.v_lin.lora.W_a : True
distilbert.transformer.layer.0.attention.v_lin.lora.W_b : True
distilbert.transformer.layer.0.attention.out_lin.weight : False
distilbert.transformer.layer.0.attention.out_lin.bias : False
distilbert.transformer.lay

In [72]:
def count_parameters(model):
    """Return the total element count of ``model``'s trainable parameters.

    Only parameters whose ``requires_grad`` is true are counted.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

In [73]:
print("Total number of trainable parameters:", count_parameters(model))

Total number of trainable parameters: 147456


# FineTuning

In [74]:
from local_model_utilities import CustomLightningModule

lightning_model = CustomLightningModule(model)

In [75]:
callbacks = [
    ModelCheckpoint(
        save_top_k=1, mode = "max", monitor = "val_acc"
    )
]
logger = CSVLogger(save_dir = "logs/", name = "my-model")

In [76]:
trainer = L.Trainer(
    max_epochs=3,
    callbacks=callbacks,
    accelerator="gpu",
    precision="16-mixed",
    devices=1,
    logger=logger,
    log_every_n_steps=10,
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [77]:
import time

# Wall-clock timer around training; previously `start` was captured but the
# elapsed time was never computed or shown.
start = time.time()

trainer.fit(
    model=lightning_model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader,
)

elapsed = time.time() - start
print(f"Time elapsed {elapsed/60:.2f} min")

You are using a CUDA device ('NVIDIA A10G') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
Missing logger folder: logs/my-model
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                                | Params
-----------------------------------------------------------------
0 | model    | DistilBertForSequenceClassification | 67.1 M
1 | val_acc  | MulticlassAccuracy                  | 0     
2 | test_acc | MulticlassAccuracy                  | 0     
-----------------------------------------------------------------
147 K     Trainable params
67.0 M    Non-trainable params
67.1 M    Total params
268.410   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


In [78]:
# Evaluate the best checkpoint on each split, in train / val / test order.
train_acc, val_acc, test_acc = (
    trainer.test(lightning_model, dataloaders=loader, ckpt_path="best", verbose=False)
    for loader in (train_loader, val_loader, test_loader)
)

Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt
/home/zeus/miniconda3/envs/cloudspace/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:492: Your `test_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.


Testing: |          | 0/? [00:00<?, ?it/s]

Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

Restoring states from the checkpoint path at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from the checkpoint at logs/my-model/version_0/checkpoints/epoch=0-step=2917.ckpt


Testing: |          | 0/? [00:00<?, ?it/s]

In [79]:
# Report each split's accuracy as a percentage; labels are padded so the
# numbers line up.
for label, results in (
    ("Train acc: ", train_acc),
    ("Val acc:   ", val_acc),
    ("Test acc:  ", test_acc),
):
    print(f"{label}{results[0]['accuracy']*100:2.2f}%")

Train acc: 89.96%
Val acc:   89.18%
Test acc:  87.73%


In [80]:
import shutil

# Cleanup checkpoint files as we don't need them later.
# Plain string literal: the original f-string had no placeholders.
log_dir = "logs/my-model"
if os.path.exists(log_dir):
    shutil.rmtree(log_dir)