# Part-1 : Section A
Training a multi-exit ElasticBERT model on the SST-2 dataset

In [None]:
# The code closely follows the original ElasticBERT repository
# Feature to train models with a given exit configuration is added
!git clone https://github.com/MLiONS/MutiExitDNNs.git

%cd /content/MutiExitDNNs/ElasticBERT
!pip install -r requirements.txt

In [None]:
# All hyper-parameters and the location of the training dataset are set in
# MutiExitDNNs/ElasticBERT/finetune-dynamic/finetune_elue_entropy.sh

# 1) Set the correct location of the SST-2 dataset.
#    All models are trained on the SST-2 "train" split and evaluated on the "dev" split.
#    "train.tsv" and "dev.tsv" are expected to be in ELUE_DIR/TASK_NAME.
#    You can set both ELUE_DIR and TASK_NAME in finetune_elue_entropy.sh,
#    or change the dataset directory using the "data_dir" option.

# 2) Change the "num_output_layers" option to match the desired exit configuration.

# 3) Model checkpoints will be saved at "output_dir" and
#    logs will be available at "log_dir".

!bash /content/MutiExitDNNs/ElasticBERT/finetune-dynamic/finetune_elue_entropy.sh

# Part-1 : Section B
Generating the prediction matrix

In [33]:
# Evaluate the SST-2-trained model on other datasets (IMDb or Yelp)

In [None]:
from transformers import BertTokenizer as ElasticBertTokenizer

#Set the current directory location inside "finetune-dynamic" folder
%cd /content/MutiExitDNNs/ElasticBERT/finetune-dynamic

from models.configuration_elasticbert import ElasticBertConfig
from models.modeling_elasticbert_entropy import ElasticBertForSequenceClassification

In [4]:
# Path of the best-performing model checkpoint to evaluate.
# Checkpoints are saved under "output_dir" by the training run in Part-1: Section A.

checkpoint = '/content/MutiExitDNNs/ElasticBERT/ckpts/elue/entropy/SST-2/checkpoint-50'

In [5]:
# Load the configuration, tokenizer and sequence-classification model from the
# checkpoint directory selected above.
# NOTE(review): `config` is loaded here but is not passed to the model's
# from_pretrained call below.
config = ElasticBertConfig.from_pretrained(checkpoint)
tokenizer = ElasticBertTokenizer.from_pretrained(checkpoint)
model = ElasticBertForSequenceClassification.from_pretrained(checkpoint)
# Uncomment the next line to display the loaded model.
#model

In [6]:
def get_args(arg_vec):
    """Parse a command-line style argument vector into a namespace.

    Args:
        arg_vec: List of argument strings, e.g. ``['--task_name', 'SST-2', ...]``.
            It is forwarded unchanged to ``ArgumentParser.parse_args``.
            The options ``--num_hidden_layers``, ``--num_output_layers``
            (exactly 12 integers), ``--data_dir``, ``--model_name_or_path``,
            ``--task_name``, ``--output_dir`` and ``--log_dir`` are required.

    Returns:
        argparse.Namespace holding one attribute per option defined below.

    Raises:
        SystemExit: raised by argparse when a required option is missing or a
            value cannot be converted to the declared type.
    """
    # Imported locally so the function does not depend on a later notebook
    # cell having imported argparse before it is called.
    import argparse

    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument(
        "--num_hidden_layers",
        default=None,
        type=int,
        required=True,
        help='The number of layers to import.',
    )
    parser.add_argument(
        "--num_output_layers",
        nargs=12,  # exactly twelve integer values must follow this option
        default=None,
        type=int,
        required=True,
        help='The number of layers to output.',
    )
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help="The input data dir. Should contain the .tsv files (or other data files) for the task.",
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name.",
    )
    parser.add_argument(
        "--task_name",
        default=None,
        type=str,
        required=True,
        help="The name of the task to train selected in the list.",
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument(
        "--log_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the logs will be written.",
    )
    parser.add_argument(
        "--spec_eval",
        default=None,
        type=str,
        required=False,
        help="'Set as train or test based on specific split on which to evaluate'"
    )
    # NOTE: --patience and --early_exit_entropy are kept as strings, not numbers.
    parser.add_argument(
        "--patience",
        default='0',
        type=str,
        required=False,
    )
    parser.add_argument(
        "--regression_threshold",
        default=0,
        type=float,
        required=False,
    )
    parser.add_argument(
        "--early_exit_entropy",
        default='0.1',
        type=str,
        required=False,
    )

    # Other parameters
    parser.add_argument(
        "--load",
        default=None,
        type=str,
        help="The path of ckpts used to continue training."
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name",
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from huggingface.co",
    )
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help="The maximum total input sequence length after tokenization. Sequences longer "
             "than this will be truncated, sequences shorter will be padded.",
    )
    parser.add_argument("--debug", action="store_true", help="Whether to use debug mode.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        action="store_true",
        help="Run evaluation during training at each logging step.",
    )
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.",
    )
    parser.add_argument(
        "--per_gpu_train_batch_size",
        default=8,
        type=int,
        help="Batch size per GPU/CPU for training.",
    )
    parser.add_argument(
        "--per_gpu_eval_batch_size",
        default=1,
        type=int,
        help="Batch size per GPU/CPU for evaluation.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument(
        "--learning_rate",
        default=5e-5,
        type=float,
        help="The initial learning rate for Adam.",
    )
    parser.add_argument("--weight_decay", default=0.01, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs",
        default=3.0,
        type=float,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--warmup_rate", default=0, type=float, help="Linear warmup over warmup_rate.")

    parser.add_argument("--logging_steps", type=int, default=500, help="Log every X updates steps.")
    parser.add_argument(
        "--save_steps",
        type=int,
        default=500,
        help="Save checkpoint every X updates steps.",
    )
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir",
        action="store_true",
        help="Overwrite the content of the output directory",
    )
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets",
    )
    parser.add_argument(
        "--not_save_model",
        action="store_true",
        help="Do not save model checkpoints"
    )
    parser.add_argument("--seed", type=int, default=6, help="random seed for initialization")

    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="For distributed training: local_rank",
    )
    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
    args = parser.parse_args(arg_vec)

    return args

In [7]:
from evaluations import evaluate_elue_entropy

In [8]:
  # argparse must be importable before get_args() is called below.
  import argparse

  # Root folder that holds one sub-directory per dataset, and the task used
  # to build the argument vector.
  ELUE_DIR = '/content/drive/MyDrive/elue_data'
  TASK_NAME = 'SST-2'

  # Command-line style arguments handed to get_args(). Paths are derived from
  # ELUE_DIR / TASK_NAME so the values cannot drift apart.
  arg_vec = [
      '--model_name_or_path', 'fnlp/elasticbert-base',
      '--task_name', TASK_NAME,
      '--do_train',
      '--do_lower_case',
      '--data_dir', ELUE_DIR + '/' + TASK_NAME,
      '--log_dir', '/content/ElasticBERT/logs/elue/entropy/' + TASK_NAME + 'TestCheck',
      '--output_dir', '/content/ElasticBERT/ckpts/elue/entropy/' + TASK_NAME + 'TestCheck',
      '--num_hidden_layers', '12',
      # One flag per layer (12 values, as required by get_args).
      '--num_output_layers', '1', '1', '1', '1', '1', '0', '0', '1', '0', '1', '0', '1',
      '--max_seq_length', '128',
      '--per_gpu_train_batch_size', '32',
      '--per_gpu_eval_batch_size', '32',
      '--learning_rate', '2e-5',
      '--weight_decay', '0.1',
      '--save_steps', '50',
      '--logging_steps', '50',
      '--num_train_epochs', '5',
      '--warmup_rate', '0.06',
      '--evaluate_during_training',
      '--overwrite_output_dir',
  ]

  args = get_args(arg_vec)

In [None]:
import torch

# Select the device to run on and record it, together with the GPU count, on `args`.
if args.local_rank == -1 or args.no_cuda:
    # Single-process run: use CUDA when it is available unless --no_cuda was given.
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    # Report zero GPUs when CUDA is disabled, so code that consults args.n_gpu
    # does not treat a CPU-only run as a (multi-)GPU run.
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    torch.distributed.init_process_group(backend="nccl")
    args.n_gpu = 1
args.device = device

args.output_mode = 'classification'

print(args.device)
# The trailing semicolon keeps the notebook from printing the whole module tree.
model.to(args.device);

In [10]:
# Custom selection: name of the dataset for which exit predictions are generated.
dataset = 'IMDb' # or 'Yelp'

# To check model performance on the SST-2 dev split instead,
# set dataset = 'SST-2' and call get_preds with data_split='dev'.

In [12]:
import numpy as np
import pandas as pd

def get_preds(eval_dataset='IMDb', data_split='train'):
    """Run the loaded model on one split of a dataset and tabulate the exit predictions.

    The notebook-level `args` namespace is updated in place: `spec_eval` is set
    to `data_split`, `task_name` to the lower-cased dataset name, and `data_dir`
    to `ELUE_DIR/<lower-cased dataset name>`.

    NOTE(review): because the name is lower-cased, the data is looked up in e.g.
    `ELUE_DIR/imdb`; confirm the folders on disk use lower-case names.

    Args:
        eval_dataset: Dataset name, e.g. 'IMDb'.
        data_split: Split to evaluate; stored in `args.spec_eval`.

    Returns:
        pandas.DataFrame built from `exit_preds` stacked along axis 1 (one column
        per entry of `exit_preds`), plus an `op_labels` column.
    """
    args.spec_eval = data_split
    args.task_name = eval_dataset.lower()
    args.data_dir = f"{ELUE_DIR}/{args.task_name}"

    # The first value returned by the evaluation helper is not needed here.
    _, per_exit_preds, labels = evaluate_elue_entropy(args, model, tokenizer)

    preds_df = pd.DataFrame(np.stack(per_exit_preds, axis=1))
    preds_df['op_labels'] = labels
    return preds_df

In [None]:
# Generate predictions for both splits of the selected dataset and stack them
# into a single table with a fresh 0..N-1 index.
df_train = get_preds(eval_dataset=dataset, data_split='train')
df_test = get_preds(eval_dataset=dataset, data_split='test')

df_tot = pd.concat([df_train, df_test])
df_tot = df_tot.reset_index(drop=True)
print(df_tot.head())

# Name the output after the dataset actually evaluated, so that e.g. Yelp
# predictions are not written to a file labelled "IMDb".
# Every column except 'op_labels' is taken to hold one exit's predictions
# (assumption based on how get_preds builds the frame — confirm).
num_exits = df_tot.shape[1] - 1
out_path = f'/content/Exit_Predictions_TrainTest_{dataset}_{num_exits}exits.csv'
df_tot.to_csv(out_path, sep='\t', index=False)