In [1]:
import sys
import os

# Make the parent of the current working directory importable
# (presumably where model_classes, dataloader, helper_functions, ... live — verify).
main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so re-running this cell does not keep stacking duplicate
# entries onto sys.path.
if main_dir not in sys.path:
    sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
import lightning.pytorch as pl
# from pytorch_lightning.callbacks import ModelCheckpoint
from lightning.pytorch.callbacks import ModelCheckpoint

# Use the GPU whenever PyTorch reports one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hyper-parameters for the small model variant, carried in a PretrainedConfig.
# The meaning of the project-specific keys is defined by the project modules
# that read them; the grouping below is only for readability.
config_small = PretrainedConfig(
    # --- network sizes ---
    vocab_size=30522,  # presumably matches the tokenizer used by the dataloader — TODO confirm
    hidden_size=128,
    intermediate_size=256,
    n_layers=8,
    num_attention_heads=8,

    # --- forward-layer selection and its expert / segment settings ---
    # NOTE(review): the forward layer is MH_MoE while the model variable and the
    # checkpoint names elsewhere in this notebook say "MH_Lori" — confirm intent.
    forward_layer_class=MH_MoE,
    num_MH_MOE_heads=4,
    num_experts=12,
    num_experts_per_token=2,
    capacity_factor=3,
    load_balancing_coefficient=0,
    no_lori_segments=32,
    treat_mh_lori_as_regular_lori=False,

    # --- data ---
    batch_size=16,
    seq_len=512,
    proportions=[0.4, 0.4, 0.2],  # forwarded to give_dataloaders

    # --- optimisation ---
    loss_fn=torch.nn.CrossEntropyLoss(),
    lr=0.0006,  # TODO: the original note says "SET TO 0.0002" — confirm the intended value
    betas=(0.9, 0.95),

    # --- runtime ---
    device=device,
    py_lightning_loging=False,
)

# Alias read by the cells below; rebind it here to switch configurations.
config = config_small

In [3]:
# Example usage: build the loaders from the shared config and peek at one batch.
# The requested sequence length is one token longer than config.seq_len
# (presumably so inputs and shifted targets can each be config.seq_len long — TODO confirm).
dataloaders = give_dataloaders(
    batch_size=config.batch_size,
    seq_len=config.seq_len + 1,
    proportions=config.proportions,
    development=True,
)

# NOTE(review): the "_b" training loader is the split returned under "val_dataloader".
train_dataloader_a = dataloaders["train_dataloader"]
train_dataloader_b = dataloaders["val_dataloader"]
test_dataloader = dataloaders["test_dataloader"]

# Draw a single batch and show its shape as the cell result.
sample = next(iter(train_dataloader_a))
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (766 > 512). Running this sequence through the model will result in indexing errors


torch.Size([16, 513])

In [4]:
# Instantiate the model from the active config and move it to the configured device.
# (`config` is the same object as `config_small`.)
mh_lori = Transformer(config).to(config.device)

# Report the size of the whole model, of its layer stack, and of the first
# layer's forward layer.
# NOTE(review): the recorded output prints "total number of parameters: 40,854,760"
# for the whole model, about 4x the 10.2 M parameters Lightning reports — the helper
# may be printing bytes; verify in helper_functions.
for _part in (mh_lori, mh_lori.layers, mh_lori.layers[0].forward_layer):
    estimate_model_size(_part)

get_gpu_memory()

Estimated Model Size: 38.96 MB, total number of parameters: 40,854,760
Estimated Model Size: 9.04 MB, total number of parameters: 9,478,144
Estimated Model Size: 0.88 MB, total number of parameters: 920,064
No GPU available.


In [5]:
# "medium" lets PyTorch use reduced-precision internals for float32 matmuls.
torch.set_float32_matmul_precision("medium")
# Request synchronous CUDA kernel launches (a debugging aid).
# NOTE(review): the model was already moved to its device in an earlier cell —
# confirm this variable is still honoured when it is set this late.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Write a checkpoint into saved_models/ every 500 training steps.
# save_top_k=-1 keeps every checkpoint; save_weights_only=False stores the full
# checkpoint rather than the weights alone.
checkpoint_callback = ModelCheckpoint(
    dirpath='saved_models',
    filename='MH_Lori_dataloader_a-{epoch}-{step}',
    every_n_train_steps=500,
    save_top_k=-1,
    save_weights_only=False,
)

# Trainer capped at 30 epochs, with the checkpoint callback attached.
trainer = pl.Trainer(
    max_epochs=30,
    callbacks=[checkpoint_callback],
)

# Fit on loader "a", validating on loader "b".
trainer.fit(
    mh_lori,
    train_dataloaders=train_dataloader_a,
    val_dataloaders=train_dataloader_b,
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name       | Type             | Params | Mode 
--------------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0      | train
1 | embedding  | Embedding        | 3.9 M  | train
2 | layers     | ModuleList       | 2.4 M  | train
3 | final_proj | Linear           | 3.9 M  | train
--------------------------------------------------------
10.2 M    Trainable params
0         Non-trainable params
10.2 M    Total params
40.855    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.493735313415527
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:11<00:11,  0.09it/s]   VALIDATION: Batch 1, loss 10.540153503417969
                                                                           

c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\lightning\pytorch\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.507697105407715
Epoch 0: |          | 1/? [00:25<00:00,  0.04it/s, v_num=5]   TRRAINING: Batch 1, loss 10.439556121826172


c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
# "medium" lets PyTorch use reduced-precision internals for float32 matmuls.
torch.set_float32_matmul_precision("medium")
# Request synchronous CUDA kernel launches (a debugging aid).
# NOTE(review): the model was already moved to its device in an earlier cell —
# confirm this variable is still honoured when it is set this late.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Write a checkpoint into saved_models/ every 500 training steps.
# save_top_k=-1 keeps every checkpoint; save_weights_only=False stores the full
# checkpoint rather than the weights alone.
checkpoint_callback = ModelCheckpoint(
    dirpath='saved_models',
    filename='MH_Lori_dataloader_b-{epoch}-{step}',
    every_n_train_steps=500,
    save_top_k=-1,
    save_weights_only=False,
)

# Fresh Trainer capped at 30 epochs, with the checkpoint callback attached.
trainer = pl.Trainer(
    max_epochs=30,
    callbacks=[checkpoint_callback],
)

# Fit on loader "b" only; no validation loader is passed.
# NOTE(review): `mh_lori` is not re-created in this cell, so on a top-to-bottom run
# it starts from whatever weights the previous fit left behind — confirm intended.
# NOTE(review): train_dataloader_b is the split returned under "val_dataloader".
trainer.fit(
    mh_lori,
    train_dataloaders=train_dataloader_b,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 3.9 M 
2 | layers     | ModuleList       | 2.4 M 
3 | final_proj | Linear           | 3.9 M 
------------------------------------------------
10.2 M    Trainable params
0         Non-trainable params
10.2 M    Total params
40.855    Total estimated model params size (MB)


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 7.754574775695801
Epoch 0: |          | 1/? [00:00<00:00,  1.49it/s, v_num=53]   TRRAINING: Batch 1, loss 7.893613815307617
Epoch 0: |          | 2/? [00:01<00:00,  1.41it/s, v_num=53]   TRRAINING: Batch 2, loss 7.7666120529174805
Epoch 0: |          | 3/? [00:02<00:00,  1.39it/s, v_num=53]   TRRAINING: Batch 3, loss 7.839859485626221
Epoch 0: |          | 4/? [00:02<00:00,  1.37it/s, v_num=53]   TRRAINING: Batch 4, loss 7.731955528259277
Epoch 0: |          | 5/? [00:03<00:00,  1.36it/s, v_num=53]   TRRAINING: Batch 5, loss 7.096835136413574
Epoch 0: |          | 6/? [00:04<00:00,  1.36it/s, v_num=53]   TRRAINING: Batch 6, loss 7.770885467529297
Epoch 0: |          | 7/? [00:05<00:00,  1.35it/s, v_num=53]   TRRAINING: Batch 7, loss 7.613646507263184
Epoch 0: |          | 8/? [00:05<00:00,  1.35it/s, v_num=53]   TRRAINING: Batch 8, loss 7.551409721374512
Epoch 0: |          | 9/? [00:06<00:00,  1.35it/s, v_num=53]  

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\lightning\pytorch\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...
