In [18]:
import sys
import os

# Make the parent of the current working directory importable, so the
# project modules imported below can be found.
main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so re-running this cell does not keep adding the same
# entry to sys.path.
if main_dir not in sys.path:
    sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
#import lightning.pytorch as pl
from pytorch_lightning import LightningModule

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [19]:
# Hyperparameters of the "small" model variant, bundled in a PretrainedConfig
# so they can be handed to the model as one object.
config_small = PretrainedConfig(
    # --- sizes ---
    vocab_size=30522,
    hidden_size=128,
    intermediate_size=256,
    n_layers=8,
    num_attention_heads=8,
    # --- forward-layer options (consumed by the project's layer classes) ---
    forward_layer_class=VectorizedMoE,
    num_experts=12,
    num_experts_per_token=2,
    num_MH_MOE_heads=4,
    capacity_factor=3,
    load_balancing_coefficient=0.01,
    no_lori_segments=32,
    treat_mh_lori_as_regular_lori=False,
    # --- data ---
    batch_size=1,
    seq_len=512,
    proportions=[0.997, 0.001, 0.001, 0.001],  # null, train, validation, test
    # --- optimisation ---
    loss_fn=torch.nn.CrossEntropyLoss(),
    lr=0.0006,  # TODO (author's note): set to 0.0002
    betas=(0.9, 0.95),
    # --- runtime ---
    device=device,
    py_lightning_loging=False,
)

# Configuration used by the cells that read `config`.
config = config_small

# Training hyperparameters
save_every_n_baches = 500  # checkpoint interval, passed as `every_n_train_steps`
epochs = 10

# Folders that receive the model checkpoints and the train/validation logs
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe'

model_name = 'Normal_moe_model'  # model name used in the saved log filenames
saving_filename = '_dataloader_a-{epoch}-{step}'  # checkpoint filename pattern

In [20]:
# example of usage:
# Build the train/validation/test dataloaders from the active config.
# NOTE(review): seq_len is requested one token longer than config.seq_len,
# presumably so inputs and shifted targets can be cut from one sample - confirm.
l = give_dataloaders(
    batch_size=config.batch_size,
    seq_len=config.seq_len + 1,
    proportions=config.proportions,
)
train_dataloader, val_dataloader, test_dataloader = (
    l[split] for split in ("train_dataloader", "val_dataloader", "test_dataloader")
)

# Pull a single batch and show its shape as the cell output.
sample = next(iter(train_dataloader))
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (569 > 512). Running this sequence through the model will result in indexing errors


torch.Size([1, 513])

In [21]:
# Build the model from the active `config` (the same object the dataloader
# cell reads), so model and data stay in sync if `config` is repointed.
model = Transformer(config).to(config.device)

# Report size estimates for the whole model, the layer stack, and the forward
# layer of the first block (helper defined in the project's helper module).
for module in (model, model.layers, model.layers[0].forward_layer):
    estimate_model_size(module)
get_gpu_memory()

# Confirm the model can be passed straight to a Lightning Trainer.
print(isinstance(model, LightningModule))


Estimated Model Size: 55.94 MB, total number of parameters: 58,660,072
Estimated Model Size: 26.02 MB, total number of parameters: 27,283,456
Estimated Model Size: 3.00 MB, total number of parameters: 3,145,728
Total GPU memory: 12.8843776 GB
Reserved GPU memory: 0.637534208 GB
Allocated GPU memory: 0.200487936 GB
Free GPU memory: 0.437046272 GB
True


In [26]:
# Execute this to make sure all parameters are registered properly

# for name, param in model.named_parameters():
#     print(f"Parameter name: {name}, Parameter shape: {param.size()}")

In [24]:
# Allow reduced-precision float32 matmuls.
torch.set_float32_matmul_precision("medium")
# NOTE(review): the model was already moved to the device in an earlier cell;
# confirm this variable still takes effect when it is set this late.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import pytorch_lightning as pl


# Checkpointing: write a full checkpoint (not weights only) every
# `save_every_n_baches` training steps and keep all of them.
checkpoint_callback = ModelCheckpoint(
    every_n_train_steps=save_every_n_baches,
    dirpath=model_saving_path,
    filename=saving_filename,
    save_weights_only=False,
    save_top_k=-1,
)

# TensorBoard logger; change `name` for a different run (different model).
logger = TensorBoardLogger(save_dir="tb_logs", name="moe_plain_model")

# Trainer wired up with the checkpoint callback and the logger.
trainer = pl.Trainer(
    logger=logger,
    max_epochs=epochs,
    callbacks=[checkpoint_callback],
)

trainer.fit(
    model=model,
    train_dataloaders=train_dataloader,
    val_dataloaders=val_dataloader,
)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:653: Checkpoint directory D:\Projekt_NLP\Saved_stuff\saved_models exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 3.9 M 
2 | layers     | ModuleList       | 6.8 M 
3 | final_proj | Linear           | 3.9 M 
------------------------------------------------
14.7 M    Trainable params
0         Non-trainable params
14.7 M    Total params
58.660    Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


   VALIDATION: Batch 0, loss 10.468964576721191
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00,  3.31it/s]   VALIDATION: Batch 1, loss 10.485994338989258
                                                                           

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.45272445678711
Epoch 0: |          | 1/? [00:00<00:00,  9.12it/s, v_num=7]   TRRAINING: Batch 1, loss 10.495039939880371
Epoch 0: |          | 2/? [00:00<00:00, 10.21it/s, v_num=7]   TRRAINING: Batch 2, loss 10.525457382202148
Epoch 0: |          | 3/? [00:00<00:00,  9.07it/s, v_num=7]   TRRAINING: Batch 3, loss 10.31803035736084
Epoch 0: |          | 4/? [00:00<00:00,  9.45it/s, v_num=7]   TRRAINING: Batch 4, loss 10.150997161865234
Epoch 0: |          | 5/? [00:00<00:00,  9.71it/s, v_num=7]   TRRAINING: Batch 5, loss 10.278472900390625
Epoch 0: |          | 6/? [00:00<00:00, 10.10it/s, v_num=7]   TRRAINING: Batch 6, loss 10.345307350158691
Epoch 0: |          | 7/? [00:00<00:00, 10.06it/s, v_num=7]   TRRAINING: Batch 7, loss 10.173484802246094
Epoch 0: |          | 8/? [00:00<00:00, 10.17it/s, v_num=7]   TRRAINING: Batch 8, loss 10.126062393188477
Epoch 0: |          | 9/? [00:00<00:00,  9.78it/s, v_num=7]   TR

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: |          | 20/? [00:03<00:00,  5.13it/s, v_num=7]


In [25]:
### Saving logs to csv
# Per-batch loss records collected on the model during training/validation.
# NOTE(review): in the recorded output `batch_idx` restarts every epoch and the
# first validation rows come from Lightning's sanity check, so rows are not
# uniquely keyed - consider logging the epoch as well.
val_data = model.val_losses_list
train_data = model.train_losses_list

# Convert the record lists to DataFrames
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory for the CSV files; created if it does not exist yet
save_dir = log_saving_path
os.makedirs(save_dir, exist_ok=True)

# `model_name` is taken from the configuration cell rather than re-hard-coded
# here, so renaming the run there also renames the CSV files.
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file paths
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Save the DataFrames without the index column
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")


     batch_idx   val_loss
0            0  10.468965
1            1  10.485994
2            0  10.671075
3            1   8.824972
4            2   8.998638
..         ...        ...
307         26   8.645702
308         27   9.591789
309         28  10.349781
310         29   9.070243
311         30   8.842028

[312 rows x 2 columns]
     batch_idx  train_loss
0            0   10.452724
1            1   10.495040
2            2   10.525457
3            3   10.318030
4            4   10.150997
..         ...         ...
195         15    4.002635
196         16    1.998070
197         17    2.209420
198         18    4.320018
199         19    4.349841

[200 rows x 2 columns]
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe\logs_train_Normal_moe_model.csv
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/vectorized_moe\logs_val_Normal_moe_model.csv
