In [13]:
import sys
import os

main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
#import lightning.pytorch as pl
from pytorch_lightning import LightningModule

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [14]:
# Configuration for the first run: Transformer with the VectorizedMoE feed-forward layer.
config_small = PretrainedConfig(
    num_experts_per_token=4,
    hidden_size=256,
    num_attention_heads=4,
    num_MH_MOE_heads=4,
    num_experts=8,
    batch_size=20,
    seq_len=512,
    capacity_factor=3,
    device=device,
    intermediate_size=512,
    forward_layer_class=VectorizedMoE,
    vocab_size=30522,
    n_layers=12,
    no_lori_segments=32,
    py_lightning_loging=False,
    loss_fn=torch.nn.CrossEntropyLoss(),
    lr=0.001,  # TODO (original author's note): set to 0.0002
    betas=(0.9, 0.98),
    treat_mh_lori_as_regular_lori=True,
    load_balancing_coefficient=0.01,
    proportions=[0.97, 0.01, 0.01, 0.01],  # null, train, validation, test
)

# Both names are used by later cells: `config` when building the dataloaders,
# `config_small` when building the model.
config = config_small

# Training hyperparameters
save_every_n_baches = 500  # how often (in training steps) a checkpoint is written
epochs = 10

# Folders for model checkpoints and for the train/validation loss logs
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe'

model_name = 'Normal_moe_model'  # model name used in the saved log file names
saving_filename = 'Vectorised_MOE_175M-{epoch}-{step}'  # checkpoint filename pattern

In [15]:
# Build the train / validation / test dataloaders from the shared config.
# seq_len is config.seq_len + 1, so each batch row holds 513 token ids
# (presumably one extra token for the shifted next-token targets — TODO confirm).
loaders = give_dataloaders(
    batch_size=config.batch_size,
    seq_len=config.seq_len + 1,
    proportions=config.proportions,
)
train_dataloader, val_dataloader, test_dataloader = (
    loaders[key]
    for key in ("train_dataloader", "val_dataloader", "test_dataloader")
)

# Pull one training batch and display its shape as a quick sanity check.
sample = next(iter(train_dataloader))
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (1201 > 512). Running this sequence through the model will result in indexing errors


torch.Size([20, 513])

In [16]:
# Build the Transformer described by config_small and move it to the configured device.
model = Transformer(config_small)
model = model.to(config_small.device)

# Report the size estimate for the whole model, the layer stack, and one feed-forward block.
for module in (model, model.layers, model.layers[0].forward_layer):
    estimate_model_size(module)

get_gpu_memory()

# The Trainer used later needs a LightningModule; print whether the model is one.
print(isinstance(model, LightningModule))


Estimated Model Size: 167.79 MB, total number of parameters: 175,938,792
Estimated Model Size: 108.06 MB, total number of parameters: 113,307,648
Estimated Model Size: 8.00 MB, total number of parameters: 8,388,608
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 10.036969472 GB
Allocated GPU memory: 0.744733184 GB
Free GPU memory: 9.292236288 GB
True


In [17]:
# Uncomment and run the loop below to make sure all parameters are registered properly

# for name, param in model.named_parameters():
#     print(f"Parameter name: {name}, Parameter shape: {param.size()}")

In [18]:
import pytorch_lightning as pl

# Trade some float32 matmul precision for speed (see the PyTorch docs for "medium").
torch.set_float32_matmul_precision("medium")
# Debugging aid: asks CUDA to run kernel launches synchronously.
# NOTE(review): the model was already moved to the GPU in an earlier cell, so this
# may be set too late to take effect — confirm, or move it to the first cell.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# TensorBoard logger for this run.
### CHANGE NAME FOR DIFFERENT RUN (different model)
logger = TensorBoardLogger("tb_logs", name="moe_plain_model")

# Checkpoint callback: writes a checkpoint every `save_every_n_baches` training steps.
checkpoint_callback = ModelCheckpoint(
    dirpath=model_saving_path,                # directory the checkpoints are written to
    filename=saving_filename,                 # filename pattern
    save_top_k=-1,                            # keep every checkpoint
    save_weights_only=False,                  # full checkpoint, not weights only
    every_n_train_steps=save_every_n_baches,  # checkpoint interval in training steps
)

# Trainer wired to the checkpoint callback and the logger.
trainer = pl.Trainer(
    max_epochs=epochs,
    logger=logger,
    callbacks=[checkpoint_callback],
)

trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 28.3 M
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
44.0 M    Trainable params
0         Non-trainable params
44.0 M    Total params
175.939   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.512404441833496
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.82it/s]   VALIDATION: Batch 1, loss 10.517236709594727
                                                                           

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.528090476989746
Epoch 0: |          | 1/? [00:01<00:00,  0.98it/s, v_num=26]   TRRAINING: Batch 1, loss 10.408585548400879
Epoch 0: |          | 2/? [00:01<00:00,  1.12it/s, v_num=26]   TRRAINING: Batch 2, loss 9.805338859558105
Epoch 0: |          | 3/? [00:02<00:00,  1.16it/s, v_num=26]   TRRAINING: Batch 3, loss 9.181699752807617
Epoch 0: |          | 4/? [00:03<00:00,  1.19it/s, v_num=26]   TRRAINING: Batch 4, loss 10.045382499694824
Epoch 0: |          | 5/? [00:04<00:00,  1.21it/s, v_num=26]   TRRAINING: Batch 5, loss 8.66895580291748
Epoch 0: |          | 6/? [00:05<00:00,  1.20it/s, v_num=26]   TRRAINING: Batch 6, loss 8.998995780944824
Epoch 0: |          | 7/? [00:05<00:00,  1.19it/s, v_num=26]   TRRAINING: Batch 7, loss 8.890582084655762
Epoch 0: |          | 8/? [00:06<00:00,  1.20it/s, v_num=26]   TRRAINING: Batch 8, loss 8.392278671264648
Epoch 0: |          | 9/? [00:07<00:00,  1.20it/s, v_num=26] 

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
### Saving logs to csv
# Per-batch loss records collected on the model during validation and training.
# NOTE(review): the validation log appears to also contain the batches from
# Lightning's pre-training sanity check (batch_idx restarts at 0) — confirm and
# filter them out if that is not wanted.
val_data = model.val_losses_list
train_data = model.train_losses_list

# Convert the loss records to DataFrames (columns: batch_idx and the loss value)
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory for the CSV files, taken from the configuration cell; create it if missing
save_dir = log_saving_path
os.makedirs(save_dir, exist_ok=True)

# File names are derived from `model_name` as set in the configuration cell.
# It is deliberately NOT reassigned here: a hard-coded copy in this cell would
# silently override the configuration and mislabel the files when the cell is
# run after another model has been configured.
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file paths
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Write the DataFrames to CSV (without the index column)
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")


    batch_idx   val_loss
0           0  10.509090
1           1  10.533659
2           0   7.635991
3           1   7.299952
4           2   7.781688
5           3   7.924422
6           4   7.558314
7           5   7.898718
8           6   7.490449
9           7   7.570178
10          8   7.404633
11          9   7.353227
12         10   7.346442
13         11   7.593083
14         12   7.275128
15         13   7.496394
16         14   7.480696
17         15   7.463328
18         16   7.805147
19         17   7.610564
    batch_idx  train_loss
0           0   10.503899
1           1   10.457583
2           2   10.151628
3           3   10.159952
4           4    9.520422
5           5    9.292139
6           6    9.466715
7           7    8.826136
8           8    9.118588
9           9    9.344730
10         10    9.162207
11         11    8.577986
12         12    8.826115
13         13    8.224370
14         14    8.281962
15         15    7.891154
16         16    8.150372
17     

Training the second model (MH_MoE)

In [None]:
# Hand cached-but-unused GPU memory back to the driver before building the next model.
# NOTE(review): tensors that are still referenced (e.g. by the previous `model` /
# `trainer` objects) are presumably not released by this call — confirm whether
# those references should be dropped first.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [24]:
# Configuration for the second run: same settings as the first run, but with the
# MH_MoE feed-forward layer and its own log folder / file names.
config_small = PretrainedConfig(
    num_experts_per_token=4,
    hidden_size=256,
    num_attention_heads=4,
    num_MH_MOE_heads=4,
    num_experts=8,
    batch_size=20,
    seq_len=512,
    capacity_factor=3,
    device=device,
    intermediate_size=512,
    forward_layer_class=MH_MoE,
    vocab_size=30522,
    n_layers=12,
    no_lori_segments=32,
    py_lightning_loging=False,
    loss_fn=torch.nn.CrossEntropyLoss(),
    lr=0.001,  # TODO (original author's note): set to 0.0002
    betas=(0.9, 0.98),
    treat_mh_lori_as_regular_lori=True,
    load_balancing_coefficient=0.01,
    proportions=[0.97, 0.01, 0.01, 0.01],  # null, train, validation, test
)

# `config` and `config_small` refer to the same object; both names are used in this notebook.
config = config_small

# Training hyperparameters
save_every_n_baches = 500  # how often (in training steps) a checkpoint is written
epochs = 10

# Folders for model checkpoints and for the train/validation loss logs
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/mh_moe'

model_name = 'MH_MoE_model'  # model name used in the saved log file names
# NOTE(review): the "175M" tag matches the first run's file name; the size estimate
# printed for this model is different — confirm the intended checkpoint name.
saving_filename = 'MH_MOE_175M-{epoch}-{step}'  # checkpoint filename pattern

In [25]:
# Build the MH_MoE variant of the Transformer and move it to the configured device.
model = Transformer(config_small)
model = model.to(config_small.device)

# Report the size estimate for the whole model, the layer stack, and one feed-forward block.
for module in (model, model.layers, model.layers[0].forward_layer):
    estimate_model_size(module)

get_gpu_memory()

# The Trainer used later needs a LightningModule; print whether the model is one.
print(isinstance(model, LightningModule))

Epoch 0: |          | 23/? [05:55<00:00,  0.06it/s, v_num=25]
Estimated Model Size: 101.81 MB, total number of parameters: 106,757,352
Estimated Model Size: 42.08 MB, total number of parameters: 44,126,208
Estimated Model Size: 2.50 MB, total number of parameters: 2,623,488
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 5.869928448 GB
Allocated GPU memory: 0.833367552 GB
Free GPU memory: 5.036560896 GB
True


In [26]:
# Trade some float32 matmul precision for speed (see the PyTorch docs for "medium").
torch.set_float32_matmul_precision("medium")
# Debugging aid: asks CUDA to run kernel launches synchronously.
# NOTE(review): CUDA has already been used by earlier cells, so this may be set
# too late to take effect — confirm, or move it to the first cell.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import pytorch_lightning as pl


# Checkpoint callback: writes a checkpoint every `save_every_n_baches` training steps.
checkpoint_callback = ModelCheckpoint(
    dirpath=model_saving_path,  # Directory to save the checkpoints
    filename=saving_filename,  # Filename pattern
    save_top_k=-1,  # Keep every checkpoint
    save_weights_only=False,  # Full checkpoint, not weights only
    every_n_train_steps=save_every_n_baches  # Checkpoint interval in training steps
)

# TensorBoard logger for this run. The run name comes from `model_name` in the
# configuration cell, so this model's curves are not written into the
# "moe_plain_model" folder used by the first model.
logger = TensorBoardLogger("tb_logs", name=model_name)

# Initialize the trainer with the checkpoint callback and the logger
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=epochs, # Set the number of epochs
    logger=logger
)

trainer.fit(model = model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 11.0 M
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
26.7 M    Trainable params
0         Non-trainable params
26.7 M    Total params
106.757   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.576143264770508
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.26it/s]   VALIDATION: Batch 1, loss 10.658533096313477
                                                                           

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.509136199951172
Epoch 0: |          | 1/? [00:02<00:00,  0.43it/s, v_num=27]   TRRAINING: Batch 1, loss 10.423609733581543
Epoch 0: |          | 2/? [00:04<00:00,  0.48it/s, v_num=27]   TRRAINING: Batch 2, loss 9.882196426391602
Epoch 0: |          | 3/? [00:06<00:00,  0.49it/s, v_num=27]   TRRAINING: Batch 3, loss 9.192214965820312
Epoch 0: |          | 4/? [00:07<00:00,  0.50it/s, v_num=27]   TRRAINING: Batch 4, loss 9.686577796936035
Epoch 0: |          | 5/? [00:09<00:00,  0.51it/s, v_num=27]   TRRAINING: Batch 5, loss 8.671618461608887
Epoch 0: |          | 6/? [00:11<00:00,  0.51it/s, v_num=27]   TRRAINING: Batch 6, loss 8.795808792114258
Epoch 0: |          | 7/? [00:14<00:00,  0.49it/s, v_num=27]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [None]:
### Saving logs to csv
# Per-batch loss records collected on the model during validation and training.
val_data = model.val_losses_list
train_data = model.train_losses_list

# Convert the loss records to DataFrames
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory for the CSV files, taken from the configuration cell; create it if missing
save_dir = log_saving_path
os.makedirs(save_dir, exist_ok=True)

# File names are derived from `model_name` as set in this model's configuration
# cell. It must NOT be reassigned here: the previous hard-coded
# 'Normal_moe_model' caused the MH_MoE logs to be saved under the first
# model's name.
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file paths
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Write the DataFrames to CSV (without the index column)
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")