In [None]:
!pip install datasets transformers pytorch-lightning wandb sacrebleu



In [None]:
# Import Libraries and Load Checkpoints

In [None]:
from datasets import load_dataset, DatasetDict
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from torch.utils.data import DataLoader
import torch
from torch import nn
import wandb
from pprint import pprint
import os
from sacrebleu.metrics import BLEU
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset cache and checkpoint paths used
# later in this notebook all live under that mount point.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ready else "cpu")

# Report the choice (and the GPU model name when one is in use).
if cuda_ready:
    print("GPU is available. Using GPU:", torch.cuda.get_device_name(device))
else:
    print("GPU is not available. Using CPU.")

GPU is available. Using GPU: Tesla T4


In [None]:
# Locations of the cached, already-tokenized train/validation splits on Google Drive.
train_path = '/content/drive/My Drive/Colab Notebooks/NL2SQL/Datasets/HuggingFace_SQL_Context_Dataset/preprocessed_data_train.pt'
val_path = '/content/drive/My Drive/Colab Notebooks/NL2SQL/Datasets/HuggingFace_SQL_Context_Dataset/preprocessed_data_val.pt'

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-base")

if os.path.exists(train_path) and os.path.exists(val_path):
  # Cache hit: reuse the splits written by a previous run of this cell.
  # NOTE: these files are pickled Python objects, and unpickling can execute arbitrary
  # code -- only load files this notebook wrote itself. weights_only=False is passed
  # explicitly because newer PyTorch releases default to weights_only=True, which
  # refuses to unpickle non-tensor objects such as a datasets.Dataset.
  preprocessed_data_train = torch.load(train_path, weights_only=False)
  preprocessed_data_val = torch.load(val_path, weights_only=False)

else:
  print('Creating Dataset')
  dataset = load_dataset("b-mc2/sql-create-context")
  # Fixed seed so a rebuilt cache yields the same 80/20 train/validation split.
  train_test_split = dataset["train"].train_test_split(test_size=0.2, seed=42)

  # Now, create a new DatasetDict with the split
  split_dataset = DatasetDict({
      'train': train_test_split['train'],
      'validation': train_test_split['test']})

  def preprocess_examples(data,context_prefix = "tables:\n", question_prefix = "\n" + "query for:" , format='pt'):
    """Tokenize one batch of examples into model inputs plus loss-ready labels.

    Every prompt is built as context_prefix + context + question_prefix + question,
    i.e. the CREATE TABLE statements followed by the natural-language question.

    Args:
      data: batch dict with parallel 'context', 'question' and 'answer' lists.
      context_prefix: text placed in front of each table context.
      question_prefix: text placed between the context and the question.
      format: value forwarded to the tokenizer's `return_tensors` argument.

    Returns:
      The tokenizer output for the prompts (padded/truncated to 512 tokens) with an
      added "labels" entry: the tokenized answers, where padding positions are set
      to -100 so the cross-entropy loss ignores them.
    """
    assert len(data['context']) == len(data['question'])
    input_prompt = [context_prefix + context + question_prefix + question
                    for context, question in zip(data['context'], data['question'])]

    # TODO (original author's note, paraphrased -- confirm intent): drop the few outlier
    # prompts longer than 460 and lower max_length to 460, unless another dataset is added.

    model_inputs = tokenizer(input_prompt, max_length=512, padding="max_length", truncation=True, return_tensors=format)

    labels = tokenizer(data['answer'], max_length=512, padding="max_length", truncation=True, return_tensors=format).input_ids

    # Replace padding ids with -100 so they are not taken into account by the
    # CrossEntropyLoss. The pad id is read from the tokenizer rather than hard-coded.
    pad_token_id = tokenizer.pad_token_id
    if torch.is_tensor(labels):
      labels = labels.masked_fill(labels == pad_token_id, -100)
    else:
      labels = [[-100 if token == pad_token_id else token for token in example]
                for example in labels]

    model_inputs["labels"] = labels
    return model_inputs

  # Tokenize first: the preprocessed datasets must exist before they can be
  # formatted and saved.
  preprocessed_data_train = split_dataset['train'].map(preprocess_examples,batched=True)
  preprocessed_data_val = split_dataset['validation'].map(preprocess_examples,batched=True)

  # Expose only the model-facing columns, as torch tensors.
  preprocessed_data_train.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])
  preprocessed_data_val.set_format(type="torch", columns=['input_ids', 'attention_mask', 'labels'])

  save_dir = '/content/drive/My Drive/Colab Notebooks/NL2SQL/Datasets/HuggingFace_SQL_Context_Dataset/'
  os.makedirs(save_dir, exist_ok=True)

  # Persist both splits so the next run takes the cache-hit branch above
  # (these are the same files as train_path / val_path).
  torch.save(preprocessed_data_train, os.path.join(save_dir, 'preprocessed_data_train.pt'))
  torch.save(preprocessed_data_val, os.path.join(save_dir, 'preprocessed_data_val.pt'))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def calculate_bleu_score(predictions, references):
    """Compute the corpus-level BLEU score of `predictions` against `references`.

    Args:
        predictions: list of generated strings.
        references: list of target strings, one per prediction (same order).

    Returns:
        The `.score` attribute of sacrebleu's corpus-level result.
    """
    # corpus_score takes a list of reference lists; here there is a single
    # reference per prediction, so the references are wrapped in one outer list.
    reference_streams = [references]
    return BLEU().corpus_score(predictions, reference_streams).score



In [None]:


# define the LightningModule
class CodeT5(pl.LightningModule):
    """LightningModule that fine-tunes a CodeT5 checkpoint on prompt -> SQL pairs.

    Training and validation data are read from the notebook-level variables
    `preprocessed_data_train` and `preprocessed_data_val`. Validation loss is logged
    every validation run; BLEU is additionally computed on every
    `bleu_every_n_epochs`-th epoch, because generating text is slow.

    Args:
        lr: learning rate for AdamW.
        num_train_epochs: number of epochs used to size the linear LR schedule.
        warmup_steps: number of linear warm-up steps of the LR schedule.
        model_name: Hugging Face checkpoint used for both the model and the tokenizer.
        batch_size: batch size of the train and validation dataloaders.
        num_workers: worker processes per dataloader.
        bleu_every_n_epochs: compute BLEU when (epoch + 1) is a multiple of this value;
            a value <= 0 disables BLEU.
        max_generation_length: `max_length` passed to `generate` when scoring BLEU.
    """

    def __init__(self, lr=5e-5, num_train_epochs=15, warmup_steps=1000,
                 model_name="Salesforce/codet5-small", batch_size=16, num_workers=2,
                 bleu_every_n_epochs=5, max_generation_length=200):
        super().__init__()
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        # Records every __init__ argument under self.hparams.
        self.save_hyperparameters()
        # Per-batch predictions/targets collected during BLEU epochs.
        self.validation_outputs = []

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped seq2seq model and return its output object."""
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

    def _is_bleu_epoch(self):
        """Return True when the current (0-indexed) epoch is one on which BLEU is computed."""
        every_n = self.hparams.bleu_every_n_epochs
        return every_n > 0 and (self.current_epoch + 1) % every_n == 0

    def training_step(self, batch, batch_idx):
        """Compute, log and return the training loss for one batch."""
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log the validation loss; on BLEU epochs also collect decoded predictions and targets.

        Per the original author's note, this also runs before epoch 0 as a sanity check.
        """
        outputs = self(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        val_loss = outputs.loss
        self.log('validation_loss', val_loss, on_epoch=True, prog_bar=True, logger=True)

        # Generation is only performed on BLEU epochs.
        if not self._is_bleu_epoch():
            return

        # max_length is passed explicitly; the original author notes that generate
        # otherwise defaults to a length of 20.
        preds = self.model.generate(input_ids=batch['input_ids'],
                                    attention_mask=batch['attention_mask'],
                                    max_length=self.hparams.max_generation_length)
        pred_texts = self.tokenizer.batch_decode(preds, skip_special_tokens=True)

        # Labels carry -100 at ignored positions; swap the pad id back in so they can be
        # decoded (skip_special_tokens is set on the decode call).
        label_ids = batch['labels'].masked_fill(batch['labels'] == -100, self.tokenizer.pad_token_id)
        target_texts = self.tokenizer.batch_decode(label_ids, skip_special_tokens=True)

        # Save outputs for use in on_validation_epoch_end
        self.validation_outputs.append({"validation_loss": val_loss.detach(), "preds": pred_texts, "targets": target_texts})

    def on_validation_epoch_end(self):
        """On BLEU epochs, score the collected outputs and log `val_bleu`; always reset the buffer."""
        # Skip scoring when nothing was collected (e.g. no validation batches were run).
        if self._is_bleu_epoch() and self.validation_outputs:
            flat_preds = [text for output in self.validation_outputs for text in output["preds"]]
            flat_targets = [text for output in self.validation_outputs for text in output["targets"]]
            bleu_score = calculate_bleu_score(flat_preds, flat_targets)
            print(f'BLEU score at epoch {self.current_epoch + 1}: {bleu_score}')

            # Log the BLEU score
            self.log('val_bleu', bleu_score, on_epoch=True, prog_bar=True, logger=True)

        # Start every validation run with an empty buffer.
        self.validation_outputs = []

    def configure_optimizers(self):
        """Create AdamW and a per-step linear warm-up/decay learning-rate schedule."""
        # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
        # weight_decay are set explicitly to mirror the transformers defaults
        # (1e-6 and 0.0), which differ from the torch defaults.
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.hparams.lr, eps=1e-6, weight_decay=0.0)

        # NOTE(review): the schedule length comes from hparams.num_train_epochs, not from
        # the Trainer's max_epochs. If the two differ, the learning rate finishes decaying
        # at a different point than training ends -- confirm this is intended.
        num_train_optimization_steps = self.hparams.num_train_epochs * len(self.train_dataloader())
        lr_scheduler = {'scheduler': get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=self.hparams.warmup_steps,
                                                    num_training_steps=num_train_optimization_steps),
                        'name': 'learning_rate',
                        'interval': 'step',
                        'frequency': 1}

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler}

    # Original author's note: max suggested workers is 2 in this case; more can be
    # detrimental to the data loader.
    def train_dataloader(self):
        """Shuffled DataLoader over the notebook-level `preprocessed_data_train`."""
        return DataLoader(preprocessed_data_train, shuffle=True, num_workers=self.hparams.num_workers,
                          batch_size=self.hparams.batch_size, pin_memory=True)

    def val_dataloader(self):
        """Unshuffled DataLoader over the notebook-level `preprocessed_data_val`."""
        return DataLoader(preprocessed_data_val, shuffle=False, num_workers=self.hparams.num_workers,
                          batch_size=self.hparams.batch_size, pin_memory=True)



# Instantiate the CodeT5 LightningModule with its default hyperparameters
# (this is a seq2seq fine-tuning module, not an autoencoder).
model = CodeT5()





In [None]:
# Log in to Weights & Biases before the WandbLogger is created in the training cell.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mvivektreddy[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Print the GPU status table (model, memory usage, utilisation) before training.
!nvidia-smi

Sun Mar 31 22:19:15 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   77C    P0              45W /  70W |  12067MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
# Ask PyTorch to empty its CUDA cache before the training run starts.
torch.cuda.empty_cache()

In [None]:
# TODO (original author's note): human eval / sql eval; drop outlier really long input
# prompts to shorten input sequence length

# Metrics are sent to the 'CodeT5' Weights & Biases project.
wandb_logger = WandbLogger(project='CodeT5', name='codet5-finetune-code-nl2sql')

# Early stopping watches validation_loss (lower is better) with patience=3.
# for early stopping, see https://pytorch-lightning.readthedocs.io/en/1.0.0/early_stopping.html?highlight=early%20stopping
early_stop_callback = EarlyStopping(
    monitor='validation_loss',
    mode='min',
    patience=3,
    strict=False,
    verbose=False,
)

# Records the learning rate at every optimizer step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Checkpoints are written to Google Drive once per epoch.
# NOTE(review): save_top_k=-1 presumably keeps every checkpoint -- confirm the Drive
# quota can hold one file per epoch.
checkpoint_callback = ModelCheckpoint(
    dirpath='/content/drive/My Drive/Colab Checkpoints//NL2SQL/checkpoints/',
    filename='codeT5-{epoch}',
    monitor='validation_loss',
    mode='min',
    save_top_k=-1,
    every_n_epochs=1,
)

# Mixed precision ('16-mixed') is used for increased speed and lower memory use.
trainer = Trainer(
    max_epochs=50,
    precision='16-mixed',
    logger=wandb_logger,
    callbacks=[early_stop_callback, lr_monitor, checkpoint_callback],
)

# The module supplies its own train/validation dataloaders.
trainer.fit(model)

INFO:pytorch_lightning.utilities.rank_zero:Using 16bit Automatic Mixed Precision (AMP)
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loggers/wandb.py:391: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
---------------------------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]



Validation: |          | 0/? [00:00<?, ?it/s]