In [1]:
import sys
import os

# Absolute path of the parent of the current working directory; it is added to the import
# search path (presumably this is where the project-local modules imported below live).
main_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
# Guard the append so that re-running this cell does not pile up duplicate sys.path entries.
if main_dir not in sys.path:
    sys.path.append(main_dir)

import model_classes
from model_classes import *
from MH_Lori import *
from MH_MoE import *
from dataloader import *
import dataloader
from helper_functions import *
import torch
from transformers import PretrainedConfig
import torch.nn as nn
import math
import copy
#import lightning.pytorch as pl
from pytorch_lightning import LightningModule

from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
import pandas as pd
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger

# Pick the compute device once: use CUDA when torch reports it as available, else the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Small-model hyper-parameters, kept on a PretrainedConfig so that later cells can read them
# as attributes (e.g. `config.batch_size`, `config.seq_len`, `config.proportions`).
# `config` is the name the rest of the notebook should read from; it is bound to the same
# object as `config_small`.
# NOTE(review): `forward_layer_class` (a class) and `loss_fn` (a module instance) are not plain
# JSON values; presumably this config is never serialised or printed — TODO confirm.
config = config_small = PretrainedConfig(
    num_experts_per_token=2,
    hidden_size=128,
    num_attention_heads=8,
    num_MH_MOE_heads=4,
    num_experts=12,
    batch_size=16,
    seq_len=512,
    capacity_factor=3,
    device=device,  # the 'cuda' / 'cpu' string chosen in the first cell
    intermediate_size=256,
    forward_layer_class=MH_MoE,
    vocab_size=30522,
    n_layers=8,
    no_lori_segments=32,
    py_lightning_loging=False,  # key spelling (sic) kept: it is read by name elsewhere
    loss_fn=torch.nn.CrossEntropyLoss(),
    # NOTE(review): the original note here said "SET TO 0.0002" while the value in use is
    # 0.0006 — confirm which learning rate is intended.
    lr=0.0006,
    betas=(0.9, 0.95),
    treat_mh_lori_as_regular_lori=False,
    load_balancing_coefficient=0.01,
    # Split fractions, in order: null, train, validation, test.
    proportions=[0.5, 0.4, 0.05, 0.05],
)

In [3]:
# Build the data loaders from the active config. One extra token per sequence is requested
# (seq_len + 1), so a batch has shape (batch_size, seq_len + 1) — presumably so inputs and
# next-token targets can be obtained by shifting; TODO confirm against the model code.
l = give_dataloaders(
    batch_size=config.batch_size,
    seq_len=config.seq_len + 1,
    proportions=config.proportions,
)
# NOTE: despite its name, `train_dataloader_b` holds the *validation* loader; the name is
# kept because the training cell refers to it.
train_dataloader_a, train_dataloader_b, test_dataloader = (
    l[loader_key]
    for loader_key in ("train_dataloader", "val_dataloader", "test_dataloader")
)
# Pull a single training batch as a smoke test and show its shape as the cell output.
sample = next(iter(train_dataloader_a))
sample.shape

Token indices sequence length is longer than the specified maximum sequence length for this model (766 > 512). Running this sequence through the model will result in indexing errors


torch.Size([16, 513])

In [4]:
# Build the model from the active `config` — the same object the data loaders are sized
# from — so that pointing `config` at a different preset changes model and data together
# (previously this cell read `config_small` directly and could drift out of sync).
# NOTE(review): the variable is named `mh_lori`, but the config sets
# forward_layer_class=MH_MoE; the name is kept because later cells reference it.
mh_lori = Transformer(config).to(config.device)

# Report size estimates for the whole model, for its layer stack, and for the forward
# block of the first layer, followed by the current GPU memory status.
estimate_model_size(mh_lori)
estimate_model_size(mh_lori.layers)
estimate_model_size(mh_lori.layers[0].forward_layer)
get_gpu_memory()

# Show whether the model is a LightningModule (it is handed to a Lightning Trainer later).
print(isinstance(mh_lori, LightningModule))


Estimated Model Size: 38.96 MB, total number of parameters: 40,854,760
Estimated Model Size: 9.04 MB, total number of parameters: 9,478,144
Estimated Model Size: 0.88 MB, total number of parameters: 920,064
No GPU available.
True


In [5]:
# Select PyTorch's "medium" float32 matmul precision setting.
torch.set_float32_matmul_precision("medium")
# Debugging aid: request synchronous CUDA kernel launches.
# NOTE(review): this is set after the model was already moved to its device in an earlier
# cell; if CUDA was initialised there, the variable may be read too late to take effect —
# TODO confirm, and consider setting it in the first cell instead.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import pytorch_lightning as pl

# Tunables for this run.
checkpoint_interval_steps = 500  # a checkpoint is written every 500 training steps
num_epochs = 30

# TensorBoard logger for displaying the training logs.
### CHANGE NAME FOR DIFFERENT RUN (different model)
logger = TensorBoardLogger("tb_logs", name="moe_plain_model")

# Periodic checkpointing into ./saved_models.
# NOTE(review): the filename prefix says "MH_Lori" while the logger name says "moe" —
# confirm which label is right for this run.
checkpoint_callback = ModelCheckpoint(
    dirpath='saved_models',
    filename='MH_Lori_dataloader_a-{epoch}-{step}',
    save_top_k=-1,  # keep every checkpoint instead of only the best k
    save_weights_only=False,  # False = full checkpoint, not just the model weights
    every_n_train_steps=checkpoint_interval_steps,
)

# Trainer wired up with the logger and the checkpoint callback.
trainer = pl.Trainer(
    max_epochs=num_epochs,
    logger=logger,
    callbacks=[checkpoint_callback],
)

# `train_dataloader_b` is the validation loader (see the data-loading cell).
trainer.fit(
    model=mh_lori,
    train_dataloaders=train_dataloader_a,
    val_dataloaders=train_dataloader_b,
)



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name       | Type             | Params
------------------------------------------------
0 | loss_fn    | CrossEntropyLoss | 0     
1 | embedding  | Embedding        | 3.9 M 
2 | layers     | ModuleList       | 2.4 M 
3 | final_proj | Linear           | 3.9 M 
------------------------------------------------
10.2 M    Trainable params
0         Non-trainable params
10.2 M    Total params
40.855    Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]   VALIDATION: Batch 0, loss 10.526432991027832
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:09<00:09,  0.11it/s]   VALIDATION: Batch 1, loss 10.495028495788574
                                                                           

c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=7` in the `DataLoader` to improve performance.


Epoch 0: |          | 0/? [00:00<?, ?it/s]    TRRAINING: Batch 0, loss 10.515584945678711
Epoch 0: |          | 1/? [00:28<00:00,  0.04it/s, v_num=3]   TRRAINING: Batch 1, loss 10.469141006469727
Epoch 0: |          | 2/? [00:51<00:00,  0.04it/s, v_num=3]   TRRAINING: Batch 2, loss 10.25931453704834


c:\Users\miria\anaconda3\envs\integrated_env\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [6]:
### Saving logs to csv
# Loss records that the model accumulated during validation and training.
val_data = mh_lori.val_losses_list
train_data = mh_lori.train_losses_list
# Convert list of dictionaries to DataFrame (one row per logged batch).
log_val_df = pd.DataFrame(val_data)
log_train_df = pd.DataFrame(train_data)

print(log_val_df)
print(log_train_df)

# Directory to save the CSV files. It can be overridden with the NLP_LOG_DIR environment
# variable so the notebook also runs on other machines; when the variable is unset the
# original hard-coded location is used. (The duplicated assignment/makedirs was removed.)
save_dir = os.environ.get(
    'NLP_LOG_DIR',
    'C:/Users/miria/OneDrive/Documents/ML/nlp/NLP_final_project',
)
os.makedirs(save_dir, exist_ok=True)  # Create directory if it doesn't exist

# Label embedded in the CSV file names.
# NOTE(review): the TensorBoard run in the training cell is named "moe_plain_model" —
# confirm whether these labels are meant to match.
model_name = 'moe_model'
# Define the filenames for the CSV files
csv_filename_train = f'logs_train_{model_name}.csv'
csv_filename_val = f'logs_val_{model_name}.csv'

# Construct the full file paths
csv_file_path_train = os.path.join(save_dir, csv_filename_train)
csv_file_path_val = os.path.join(save_dir, csv_filename_val)

# Save DataFrames to CSV files (without the index column)
log_train_df.to_csv(csv_file_path_train, index=False)
log_val_df.to_csv(csv_file_path_val, index=False)

print(f"DataFrame saved to {csv_file_path_train}")
print(f"DataFrame saved to {csv_file_path_val}")


   batch_idx   val_loss
0          0  10.526433
1          1  10.495028
   batch_idx  train_loss
0          0   10.515585
1          1   10.469141
2          2   10.259315
DataFrame saved to C:/Users/miria/OneDrive/Documents/ML/nlp/NLP_final_project\logs_train_moe_model.csv
DataFrame saved to C:/Users/miria/OneDrive/Documents/ML/nlp/NLP_final_project\logs_val_moe_model.csv
