In [1]:
import sys
import os

main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
#import lightning.pytorch as pl
from pytorch_lightning import LightningModule

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

device = 'cuda' if torch.cuda.is_available() else 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_small= PretrainedConfig(
    num_experts_per_token=4,
    hidden_size=256,
    num_attention_heads = 4,
    num_MH_MOE_heads = 4,
    num_experts=8,
    batch_size = 20,
    seq_len = 512,
    capacity_factor = 3,
    device = device,
    intermediate_size = 512,
    forward_layer_class = VectorizedMoE,
    vocab_size = 30522,
    n_layers = 12,
    no_lori_segments = 32,
    py_lightning_loging = False,
    loss_fn = torch.nn.CrossEntropyLoss(),
    lr = 0.001, #SET TO 0.0002
    betas = (0.9, 0.98),
    treat_mh_lori_as_regular_lori = True,
    load_balancing_coefficient=0.01,
    proportions = [0, 0.8, 0.1, 0.1] # null, train, validation, test
)

config = config_small

#training hiperparams
save_every_n_baches = 500 #how often do you wish to save the model
epochs = 10

#path to folders where you want to save model checkpoints and val and train logs
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe'

model_name='Normal_moe_model' #name of the model in saving logs
saving_filename = 'Vectorised_MOE_175M-{epoch}-{step}'

In [3]:
# example of usage:
l = give_dataloaders(batch_size = config.batch_size, seq_len = config.seq_len + 1, proportions = config.proportions)
train_dataloader = l["train_dataloader"]
val_dataloader = l["val_dataloader"]
test_dataloader = l["test_dataloader"]
sample = next(iter(train_dataloader))
# print(sample)
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (766 > 512). Running this sequence through the model will result in indexing errors


torch.Size([20, 513])

In [4]:
model = Transformer(config_small).to(config_small.device)
estimate_model_size(model)
estimate_model_size(model.layers)
estimate_model_size(model.layers[0].forward_layer)
get_gpu_memory()
print(isinstance(model, LightningModule))


Estimated Model Size: 167.79 MB, total number of parameters: 175,938,792
Estimated Model Size: 108.06 MB, total number of parameters: 113,307,648
Estimated Model Size: 8.00 MB, total number of parameters: 8,388,608
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 0.182452224 GB
Allocated GPU memory: 0.17644288 GB
Free GPU memory: 0.006009344 GB
True


In [5]:
#Execute this to make shure all parameters are registerd properly

# for name, param in model.named_parameters():
#     print(f"Parameter name: {name}, Parameter shape: {param.size()}")

In [6]:
torch.set_float32_matmul_precision("medium")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import pytorch_lightning as pl


# Define the checkpoint callback to save the model every 1000 batches
checkpoint_callback = ModelCheckpoint(
    dirpath=model_saving_path,  # Directory to save the checkpoints
    filename=saving_filename,  # Filename pattern
    save_top_k=-1,  # Save all models
    save_weights_only=False,  # Save only the weights (or set to False to save the full model)
    every_n_train_steps=save_every_n_baches  # Save the model every 1000 batches
)
#new tensorboard for displaying logs

# Define the logger
logger = TensorBoardLogger("tb_logs", name="moe_plain_model") ### CHANGE NAME FOR DIFFERENT RUN (different model)

# Initialize the trainer with the checkpoint callback
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=epochs, # Set the number of epochs
    logger=logger
)

trainer.fit(model = model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 28.3 M
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
44.0 M    Trainable params
0         Non-trainable params
44.0 M    Total params
175.939   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.483537673950195
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  2.11it/s]   VALIDATION: Batch 1, loss 10.502656936645508
                                                                           

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.508197784423828
Epoch 0: |          | 1/? [00:01<00:00,  0.99it/s, v_num=30]   TRRAINING: Batch 1, loss 10.082255363464355
Epoch 0: |          | 2/? [00:01<00:00,  1.04it/s, v_num=30]   TRRAINING: Batch 2, loss 9.035494804382324
Epoch 0: |          | 3/? [00:02<00:00,  1.06it/s, v_num=30]   TRRAINING: Batch 3, loss 9.248161315917969
Epoch 0: |          | 4/? [00:03<00:00,  1.08it/s, v_num=30]   TRRAINING: Batch 4, loss 8.656417846679688
Epoch 0: |          | 5/? [00:04<00:00,  1.15it/s, v_num=30]   TRRAINING: Batch 5, loss 8.846236228942871
Epoch 0: |          | 6/? [00:05<00:00,  1.15it/s, v_num=30]   TRRAINING: Batch 6, loss 8.788739204406738
Epoch 0: |          | 7/? [00:06<00:00,  1.17it/s, v_num=30]   TRRAINING: Batch 7, loss 8.476215362548828
Epoch 0: |          | 8/? [00:06<00:00,  1.16it/s, v_num=30]   TRRAINING: Batch 8, loss 8.15264892578125
Epoch 0: |          | 9/? [00:07<00:00,  1.16it/s, v_num=30]  



   VALIDATION: Batch 0, loss 5.40457010269165
   VALIDATION: Batch 1, loss 4.248114585876465
   VALIDATION: Batch 2, loss 5.5528974533081055
   VALIDATION: Batch 3, loss 5.249224662780762
   VALIDATION: Batch 4, loss 4.815767765045166
   VALIDATION: Batch 5, loss 4.330968379974365
   VALIDATION: Batch 6, loss 4.641824245452881
   VALIDATION: Batch 7, loss 5.290759086608887
   VALIDATION: Batch 8, loss 5.2831878662109375
   VALIDATION: Batch 9, loss 5.276829242706299
   VALIDATION: Batch 10, loss 5.089653015136719
   VALIDATION: Batch 11, loss 4.591521263122559
   VALIDATION: Batch 12, loss 5.114561557769775
   VALIDATION: Batch 13, loss 5.373461723327637
   VALIDATION: Batch 14, loss 5.020101547241211
   VALIDATION: Batch 15, loss 4.8066606521606445
   VALIDATION: Batch 16, loss 5.305370330810547
   VALIDATION: Batch 17, loss 4.9691901206970215
   VALIDATION: Batch 18, loss 4.105074405670166
   VALIDATION: Batch 19, loss 5.231184005737305
   VALIDATION: Batch 20, loss 5.386232376098633

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: |          | 1529/? [23:25<00:00,  1.09it/s, v_num=30]


In [7]:
### Saving logs to csv
val_data=model.val_losses_list
train_data=model.train_losses_list
# Convert list of dictionaries to DataFrame
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory to save the CSV file
save_dir = log_saving_path #'/path/to/your/directory'  # Replace with your desired directory path
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist
# save_dir = 'D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe'#'/path/to/your/directory'  # Replace with your desired directory path
# os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

model_name='Normal_moe_model'
# Define the filename for your CSV file
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file path
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Save DataFrame to CSV file
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")


      batch_idx   val_loss
0             0  10.483538
1             1  10.502657
2             0   5.404570
3             1   4.248115
4             2   5.552897
...         ...        ...
1927        188   4.582532
1928        189   3.896645
1929        190   3.980797
1930        191   4.471370
1931        192   4.872915

[1932 rows x 2 columns]
       batch_idx  train_loss
0              0   10.508198
1              1   10.082255
2              2    9.035495
3              3    9.248161
4              4    8.656418
...          ...         ...
15275       1523    3.665086
15276       1524    3.515073
15277       1525    3.201782
15278       1526    3.790570
15279       1527    3.798639

[15280 rows x 2 columns]
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe\logs_train_Normal_moe_model.csv
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe\logs_val_Normal_moe_model.csv


Training second MODEL

In [8]:
torch.cuda.empty_cache()

In [9]:
config_small= PretrainedConfig(
    num_experts_per_token=4,
    hidden_size=256,
    num_attention_heads = 4,
    num_MH_MOE_heads = 4,
    num_experts=8,
    batch_size = 20,
    seq_len = 512,
    capacity_factor = 3,
    device = device,
    intermediate_size = 512,
    forward_layer_class = MH_MoE,
    vocab_size = 30522,
    n_layers = 12,
    no_lori_segments = 32,
    py_lightning_loging = False,
    loss_fn = torch.nn.CrossEntropyLoss(),
    lr = 0.001, #SET TO 0.0002
    betas = (0.9, 0.98),
    treat_mh_lori_as_regular_lori = True,
    load_balancing_coefficient=0.01,
    proportions = [0, 0.8, 0.1, 0.1] # null, train, validation, test
)

config = config_small

#training hiperparams
save_every_n_baches = 500 #how often do you wish to save the model
epochs = 10

#path to folders where you want to save model checkpoints and val and train logs
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/mh_moe'

model_name='MH_MoE_model' #name of the model in saving logs
saving_filename = 'MH_MOE_175M-{epoch}-{step}'

In [10]:
model = Transformer(config_small).to(config_small.device)
estimate_model_size(model)
estimate_model_size(model.layers)
estimate_model_size(model.layers[0].forward_layer)
get_gpu_memory()
print(isinstance(model, LightningModule))

Estimated Model Size: 101.81 MB, total number of parameters: 106,757,352
Estimated Model Size: 42.08 MB, total number of parameters: 44,126,208
Estimated Model Size: 2.50 MB, total number of parameters: 2,623,488
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 0.109051904 GB
Allocated GPU memory: 0.10729216 GB
Free GPU memory: 0.001759744 GB
True


In [11]:
torch.set_float32_matmul_precision("medium")
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import pytorch_lightning as pl


# Define the checkpoint callback to save the model every 1000 batches
checkpoint_callback = ModelCheckpoint(
    dirpath=model_saving_path,  # Directory to save the checkpoints
    filename=saving_filename,  # Filename pattern
    save_top_k=-1,  # Save all models
    save_weights_only=False,  # Save only the weights (or set to False to save the full model)
    every_n_train_steps=save_every_n_baches  # Save the model every 1000 batches
)
#new tensorboard for displaying logs

# Define the logger
logger = TensorBoardLogger("tb_logs", name="moe_plain_model") ### CHANGE NAME FOR DIFFERENT RUN (different model)

# Initialize the trainer with the checkpoint callback
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=epochs, # Set the number of epochs
    logger=logger
)

trainer.fit(model = model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:653: Checkpoint directory D:\Projekt_NLP\Saved_stuff\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 11.0 M
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
26.7 M    Trainable params
0         Non-trainable params
26.7 M    Total params
106.757   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.493760108947754
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  1.67it/s]   VALIDATION: Batch 1, loss 10.453999519348145
                                                                           

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.485569953918457
Epoch 0: |          | 1/? [00:01<00:00,  0.85it/s, v_num=31]   TRRAINING: Batch 1, loss 10.003713607788086
Epoch 0: |          | 2/? [00:02<00:00,  0.71it/s, v_num=31]   TRRAINING: Batch 2, loss 8.94566822052002
Epoch 0: |          | 3/? [00:04<00:00,  0.69it/s, v_num=31]   TRRAINING: Batch 3, loss 9.716654777526855
Epoch 0: |          | 4/? [00:05<00:00,  0.69it/s, v_num=31]   TRRAINING: Batch 4, loss 8.552960395812988
Epoch 0: |          | 5/? [00:07<00:00,  0.71it/s, v_num=31]   TRRAINING: Batch 5, loss 8.86923599243164
Epoch 0: |          | 6/? [00:08<00:00,  0.70it/s, v_num=31]   TRRAINING: Batch 6, loss 8.804082870483398
Epoch 0: |          | 7/? [00:09<00:00,  0.71it/s, v_num=31]   TRRAINING: Batch 7, loss 8.479351043701172
Epoch 0: |          | 8/? [00:11<00:00,  0.71it/s, v_num=31]   TRRAINING: Batch 8, loss 8.204797744750977
Epoch 0: |          | 9/? [00:12<00:00,  0.70it/s, v_num=31]   

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: |          | 1529/? [38:45<00:00,  0.66it/s, v_num=31]


In [12]:
### Saving logs to csv
val_data=model.val_losses_list
train_data=model.train_losses_list
# Convert list of dictionaries to DataFrame
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory to save the CSV file
save_dir = log_saving_path #'/path/to/your/directory'  # Replace with your desired directory path
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist
# save_dir = 'D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe'#'/path/to/your/directory'  # Replace with your desired directory path
# os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

model_name='Normal_moe_model'
# Define the filename for your CSV file
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file path
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Save DataFrame to CSV file
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")

      batch_idx   val_loss
0             0  10.493760
1             1  10.454000
2             0   5.458968
3             1   4.351250
4             2   5.631401
...         ...        ...
1927        188   4.500703
1928        189   3.832399
1929        190   3.925724
1930        191   4.408593
1931        192   4.708729

[1932 rows x 2 columns]
       batch_idx  train_loss
0              0   10.485570
1              1   10.003714
2              2    8.945668
3              3    9.716655
4              4    8.552960
...          ...         ...
15275       1523    3.873087
15276       1524    3.761661
15277       1525    3.421404
15278       1526    4.007259
15279       1527    4.016477

[15280 rows x 2 columns]
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/mh_moe\logs_train_Normal_moe_model.csv
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/mh_moe\logs_val_Normal_moe_model.csv
