In [1]:
import sys
import os

main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
#import lightning.pytorch as pl
from pytorch_lightning import LightningModule

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# Pick the compute device once: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [25]:
# Settings of the small model, collected in one mapping and forwarded to
# PretrainedConfig as keyword arguments.
small_model_settings = dict(
    num_experts_per_token=4,
    hidden_size=256,
    num_attention_heads=4,
    num_MH_MOE_heads=1,
    num_experts=3,
    batch_size=20,
    seq_len=512,
    capacity_factor=3,
    device=device,
    intermediate_size=512,
    forward_layer_class=MH_Lori,
    vocab_size=30522,
    n_layers=12,
    no_lori_segments=4,
    py_lightning_loging=False,
    loss_fn=torch.nn.CrossEntropyLoss(),
    lr=0.001,  # author's note: SET TO 0.0002
    betas=(0.9, 0.98),
    treat_mh_lori_as_regular_lori=True,
    load_balancing_coefficient=0.01,
    proportions=[0.79, 0.01, 0.1, 0.1],  # null, train, validation, test
)
config_small = PretrainedConfig(**small_model_settings)

# `config` is the alias the other cells read their settings from.
config = config_small

# --- training hyper-parameters ---
save_every_n_baches = 500  # checkpoint interval, counted in training batches
epochs = 30

# --- output locations: model checkpoints and train/val loss logs ---
model_saving_path = 'D:/Projekt_NLP/Saved_stuff/saved_models_Lori'
log_saving_path = 'D:/Projekt_NLP/Saved_stuff/logs/Lori'

# --- naming ---
model_name = 'Lori_model'  # model name embedded in the saved log filenames
saving_filename = 'Lori_100M-{epoch}-{step}'  # checkpoint filename pattern

In [26]:
# Build the train / validation / test dataloaders from the active config.
# Sequences are requested one token longer than config.seq_len
# (presumably so inputs and shifted targets can be cut from the same
# sample -- TODO confirm against the model's training step).
l = give_dataloaders(
    batch_size=config.batch_size,
    seq_len=config.seq_len + 1,
    proportions=config.proportions,
)
train_dataloader, val_dataloader, test_dataloader = (
    l[split_key]
    for split_key in ("train_dataloader", "val_dataloader", "test_dataloader")
)

# Fetch a single training batch and display its shape as a sanity check.
sample = next(iter(train_dataloader))
sample.shape

torch.Size([20, 513])

In [27]:
# Build the model from the active `config` alias -- the same object the
# dataloaders are created from -- instead of naming `config_small` directly.
# That way pointing `config` at a different configuration cannot leave the
# model and the data pipeline built from two different configs.
model = Transformer(config).to(config.device)

# Report the size of the whole model ...
print('Total parameter count:')
estimate_model_size(model)
# ... and of the transformer layers alone (model.layers).
print('Layers parameter count (so no embedings or clasification head):')
estimate_model_size(model.layers)

Total parameter count:
Estimated Model Size: 107.82 MB, total number of parameters: 113,061,096
Layers parameter count (so no embedings or clasification head):
Estimated Model Size: 48.09 MB, total number of parameters: 50,429,952
True


In [6]:
# Execute this to make sure all parameters are registered properly

# for name, param in model.named_parameters():
#     print(f"Parameter name: {name}, Parameter shape: {param.size()}")

In [11]:
import pytorch_lightning as pl

# Allow reduced-precision float32 matmuls.
torch.set_float32_matmul_precision("medium")
# NOTE(review): debugging aid -- asks CUDA to launch kernels synchronously,
# which slows training. It likely only takes effect when set before CUDA is
# initialised; consider removing it once debugging is done.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

# Checkpoint callback: writes a checkpoint every `save_every_n_baches`
# training steps and keeps all of them.
checkpoint_callback = ModelCheckpoint(
    dirpath=model_saving_path,  # Directory to save the checkpoints
    filename=saving_filename,  # Filename pattern
    save_top_k=-1,  # Keep every checkpoint instead of only the best k
    save_weights_only=False,  # False: store the full training state, not just the weights
    every_n_train_steps=save_every_n_baches  # Number of training steps between checkpoints
)

# TensorBoard logger. The run name follows `model_name` from the config cell
# so that each model's logs land in their own sub-directory of "tb_logs"
# (previously hard-coded to "moe_plain_model", a leftover from another run).
logger = TensorBoardLogger("tb_logs", name=model_name)

# Initialize the trainer with the checkpoint callback and the logger
trainer = pl.Trainer(
    callbacks=[checkpoint_callback],
    max_epochs=epochs,  # Number of epochs to train for
    logger=logger
)

trainer.fit(model=model, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:653: Checkpoint directory D:\Projekt_NLP\Saved_stuff\saved_models_Lori exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 7.8 M 
2 | layers     | ModuleList       | 11.0 M
3 | final_proj | Linear           | 7.8 M 
------------------------------------------------
26.7 M    Trainable params
0         Non-trainable params
26.7 M    Total params
106.757   Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]ERROR: Input has inproper shape
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:00<00:00, 334.18it/s]ERROR: Input has inproper shape
Epoch 0: |          | 0/? [00:00<?, ?it/s]                                  ERROR: Input has inproper shape
Epoch 0: |          | 1/? [00:00<00:00, 200.48it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 2/? [00:00<00:00,  9.37it/s, v_num=47] ERROR: Input has inproper shape
Epoch 0: |          | 3/? [00:00<00:00,  9.46it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 4/? [00:00<00:00, 12.49it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 5/? [00:00<00:00, 15.43it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 6/? [00:00<00:00, 18.29it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 7/? [00:00<00:00, 21.08it/s, v_num=47]ERROR: Input has inproper shape
Epoch 0: |          | 8/? [00:00<00:00,

c:\Users\Komputer\AppData\Local\Programs\Python\Python310\lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [8]:
### Saving logs to csv
# Loss records the model collected while validating / training.
val_data = model.val_losses_list
train_data = model.train_losses_list

# Convert the collected records to DataFrames and print both of them.
log_val_df, log_train_df = pd.DataFrame(val_data), pd.DataFrame(train_data)
print(log_val_df, log_train_df, sep="\n")

# Target directory for the CSV files; created when it does not exist yet.
save_dir = log_saving_path
os.makedirs(save_dir, exist_ok=True)

# One CSV per split; the model name is embedded in the filename.
csv_filename_train, csv_filename_val = (
    f'logs_train_{model_name}.csv',
    f'logs_val_{model_name}.csv',
)

# Full destination paths.
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Write both frames without the index column, then report where they went.
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(
    f"DataFrame saved to {csv_file_path_train}",
    f"DataFrame saved to {csv_file_path_val}",
    sep="\n",
)


     batch_idx   val_loss
0            0  10.502468
1            1  10.506956
2            0   5.306411
3            1   5.185642
4            2   5.019919
..         ...        ...
469        242   5.023704
470        245   4.617348
471        247   4.333498
472        248   4.813462
473        249   5.764741

[474 rows x 2 columns]
      batch_idx  train_loss
0             0   10.505500
1             1   10.080068
2             2    9.593594
3             3   11.128009
4             4    8.940075
...         ...         ...
5295       1483    4.919559
5296       1484    4.832150
5297       1485    4.358718
5298       1486    4.856835
5299       1487    3.753910

[5300 rows x 2 columns]
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/Lori\logs_train_Normal_moe_model.csv
DataFrame saved to D:/Projekt_NLP/Saved_stuff/logs/Lori\logs_val_Normal_moe_model.csv
