In [1]:
import sys
import os

main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
import lightning.pytorch as pl

device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_small= PretrainedConfig(
    num_experts_per_token=2,
    hidden_size=256,
    num_attention_heads = 8,
    num_MH_MOE_heads = 4,
    num_experts=8,
    batch_size = 4,
    seq_len = 256,
    capacity_factor = 8,
    device = device,
    intermediate_size = 512,
    forward_layer_class = MH_Lori,
    vocab_size = 30522,
    n_layers = 8,
    no_lori_segments = 32,
    py_lightning_loging = False,
    loss_fn = torch.nn.CrossEntropyLoss(),
    lr = 0.0002,
    betas = (0.9, 0.95),
)

config = config_small

In [3]:
# example of usage:
l = give_dataloaders(batch_size = config.batch_size, seq_len = config.seq_len + 1, development=True)
train_dataloader = l["train_dataloader"]
val_dataloader = l["val_dataloader"]
test_dataloader = l["test_dataloader"]
sample = next(iter(train_dataloader))
# print(sample)
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (3520 > 512). Running this sequence through the model will result in indexing errors


torch.Size([4, 257])

In [4]:
mh_lori = Transformer(config_small).to(config_small.device)
estimate_model_size(mh_lori)
estimate_model_size(mh_lori.layers)
estimate_model_size(mh_lori.layers[0].forward_layer)
get_gpu_memory()

Estimated Model Size: 87.80 MB, total number of parameters: 92,065,000
Estimated Model Size: 28.07 MB, total number of parameters: 29,433,856
Estimated Model Size: 2.50 MB, total number of parameters: 2,625,536
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 0.09437184 GB
Allocated GPU memory: 0.092470784 GB
Free GPU memory: 0.001901056 GB


In [5]:
expert_1_before_training = copy.deepcopy(mh_lori.layers[0].forward_layer.first_linear)
merge_1_before_training = copy.deepcopy(mh_lori.layers[0].forward_layer.merge_layer)
print(type(expert_1_before_training), expert_1_before_training.shape)
print(type(merge_1_before_training))

<class 'torch.nn.parameter.Parameter'> torch.Size([8, 512, 64])
<class 'torch.nn.modules.linear.Linear'>


In [6]:
test_input = sample[:, :-1].to(config.device)

with torch.no_grad():
    output = mh_lori(test_input)

# output = mh_lori(test_input)
# output = output.detach()
get_gpu_memory()
output = output.detach()
print(output.shape)

Total GPU memory: 12.8843776 GB
Reserved GPU memory: 0.81788928 GB
Allocated GPU memory: 0.227332608 GB
Free GPU memory: 0.590556672 GB
torch.Size([4, 256, 30522])


In [7]:
torch.set_float32_matmul_precision("medium")
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [8]:
trainer = pl.Trainer(max_epochs=3)

trainer.fit(model = mh_lori, train_dataloaders=train_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 7.4 M 
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
23.0 M    Trainable params
0         Non-trainable params
23.0 M    Total params
92.065    Total estimated model params size (MB)
c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.48367977142334
Epoch 0: |          | 1/? [00:00<00:00,  1.87it/s, v_num=20]   TRRAINING: Batch 1, loss 10.403736114501953
Epoch 0: |          | 2/? [00:00<00:00,  2.82it/s, v_num=20]

RuntimeError: stack expects each tensor to be equal size, but got [257] at entry 0 and [178] at entry 1

In [None]:
mh_lori.no_of_skiped_baches/mh_lori.no_of_total_batches

0.25222755311857437

In [None]:
expert_1_after_training = mh_lori.layers[0].forward_layer.first_linear
merge_1_after_training = mh_lori.layers[0].forward_layer.merge_layer


In [None]:
torch.equal(merge_1_before_training.weight, merge_1_after_training.weight)

False

In [None]:
torch.equal(expert_1_before_training, expert_1_after_training)
torch.max(abs(expert_1_after_training - expert_1_before_training))

tensor(0.0020, device='cuda:0', grad_fn=<MaxBackward1>)