In [1]:
import os

os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2,3,4,6"

os.environ['CUDA_PATH']='/usr/local/cuda-11'

In [2]:
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
import re

In [3]:
def prep_gorilla(addr):
    # manually preparing the dataset
    df = pd.read_json(addr, lines=True)
    df_instructions_list = df.code.apply(lambda x: re.findall(r'###.?Instruction: (.*?)\n', x))
    df_outputs_list = df.code.apply(lambda x: re.findall(r'###.?Output: (.+)', x, flags=re.DOTALL))
    
    both_ok = (df_outputs_list.apply(len) == 1) & (df_instructions_list.apply(len) == 1)
    
    df = df[both_ok]
    
    df['instruction'] = df_instructions_list[both_ok].apply(lambda x: x[0])
    df['output'] = df_outputs_list[both_ok].apply(lambda x: x[0])
    
    return Dataset.from_pandas(df.astype(str))

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

In [5]:
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")
tokenizer.add_special_tokens({'pad_token': '<?>'})

def tokenize_function(examples):
    return tokenizer(examples['instruction'], text_target=examples['output'], truncation=True, 
                     padding='max_length', max_length=640)

In [6]:
# prepare the dataset

hf_train_dset = prep_gorilla('/mnt/data/mart/gorilla/data/apibench/huggingface_train.json')
hf_eval_dset = prep_gorilla('/mnt/data/mart/gorilla/data/apibench/huggingface_eval.json')

tok_hf_train_dset = hf_train_dset.map(tokenize_function, batched=True)
tok_hf_eval_dset = hf_eval_dset.map(tokenize_function, batched=True)

tok_hf_train_dset = tok_hf_train_dset.remove_columns(['code', 
                                                      'api_call', 
                                                      'provider',
                                                      'api_data',
                                                      'instruction',
                                                      'output',
                                                      '__index_level_0__'])

tok_hf_eval_dset = tok_hf_eval_dset.remove_columns(['code', 
                                                    'api_call', 
                                                    'provider',
                                                    'api_data',
                                                    'instruction',
                                                    'output',
                                                    '__index_level_0__'])

Map:   0%|          | 0/8081 [00:00<?, ? examples/s]

Map:   0%|          | 0/899 [00:00<?, ? examples/s]

In [7]:
small_tok_hf_train_dset = hf_train_dset.shuffle(seed=42).select(range(500)) # for quick tests

In [8]:
from accelerate import init_empty_weights, infer_auto_device_map, load_checkpoint_and_dispatch
from transformers import AutoConfig, AutoModelForCausalLM

In [9]:
config = AutoConfig.from_pretrained('tiiuae/falcon-7b', trust_remote_code=True)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
    
model.tie_weights()

device_map = infer_auto_device_map(model, max_memory={0: "5GiB",1: "5GiB",2: "5GiB",3: "5GiB",4: "5GiB",5: "5GiB"}, 
                                   no_split_module_classes=["DecoderLayer"])

model = load_checkpoint_and_dispatch(model, 
                                     "/mnt/data/mart/falcon-7b-sharded-bf16", 
                                     device_map=device_map)

model.hf_device_map

{'transformer.word_embeddings': 0,
 'lm_head': 0,
 'transformer.h.0': 0,
 'transformer.h.1': 0,
 'transformer.h.2': 0,
 'transformer.h.3': 0,
 'transformer.h.4': 1,
 'transformer.h.5': 1,
 'transformer.h.6': 1,
 'transformer.h.7': 1,
 'transformer.h.8': 1,
 'transformer.h.9': 1,
 'transformer.h.10': 2,
 'transformer.h.11': 2,
 'transformer.h.12': 2,
 'transformer.h.13': 2,
 'transformer.h.14': 2,
 'transformer.h.15': 2,
 'transformer.h.16': 3,
 'transformer.h.17': 3,
 'transformer.h.18': 3,
 'transformer.h.19': 3,
 'transformer.h.20': 3,
 'transformer.h.21': 3,
 'transformer.h.22': 4,
 'transformer.h.23': 4,
 'transformer.h.24': 4,
 'transformer.h.25': 4,
 'transformer.h.26': 4,
 'transformer.h.27': 4,
 'transformer.h.28': 5,
 'transformer.h.29': 5,
 'transformer.h.30': 5,
 'transformer.h.31': 5,
 'transformer.ln_f': 5}

In [9]:
config = AutoConfig.from_pretrained('tiiuae/falcon-7b', trust_remote_code=True)

with init_empty_weights():
    model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)
    
model.tie_weights()

device_map = infer_auto_device_map(model, max_memory={0: "5GiB",1: "5GiB",2: "5GiB",3: "5GiB",4: "5GiB",5: "5GiB"}, 
                                   no_split_module_classes=["DecoderLayer"])

model = load_checkpoint_and_dispatch(model, 
                                     "/mnt/data/mart/test_trainer/checkpoint-126/", 
                                     device_map=device_map)

model.hf_device_map

{'transformer.word_embeddings': 0,
 'lm_head': 0,
 'transformer.h.0': 0,
 'transformer.h.1': 0,
 'transformer.h.2': 0,
 'transformer.h.3': 0,
 'transformer.h.4': 1,
 'transformer.h.5': 1,
 'transformer.h.6': 1,
 'transformer.h.7': 1,
 'transformer.h.8': 1,
 'transformer.h.9': 1,
 'transformer.h.10': 2,
 'transformer.h.11': 2,
 'transformer.h.12': 2,
 'transformer.h.13': 2,
 'transformer.h.14': 2,
 'transformer.h.15': 2,
 'transformer.h.16': 3,
 'transformer.h.17': 3,
 'transformer.h.18': 3,
 'transformer.h.19': 3,
 'transformer.h.20': 3,
 'transformer.h.21': 3,
 'transformer.h.22': 4,
 'transformer.h.23': 4,
 'transformer.h.24': 4,
 'transformer.h.25': 4,
 'transformer.h.26': 4,
 'transformer.h.27': 4,
 'transformer.h.28': 5,
 'transformer.h.29': 5,
 'transformer.h.30': 5,
 'transformer.h.31': 5,
 'transformer.ln_f': 5}

In [10]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

In [11]:
training_args = Seq2SeqTrainingArguments(output_dir="/mnt/data/mart/test_trainer", learning_rate=2e-5, 
                                         num_train_epochs=5, warmup_ratio=0.03, 
                                         gradient_accumulation_steps=64, save_strategy='epoch',
                                         load_best_model_at_end=True, fp16=True,
                                         per_device_train_batch_size=1, evaluation_strategy='epoch')

In [12]:
trainer = Seq2SeqTrainer(model=model, args=training_args, 
                         train_dataset=tok_hf_train_dset, 
                         eval_dataset=tok_hf_eval_dset, )

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss
0,No log,3.129288


In [10]:
model = model.eval()

In [11]:
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
The model 'RWForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusFor

In [12]:
sequences = pipeline(
   "The user is interested in a tool to find relationships between medical terms.",
    max_length=1024,
    do_sample=True,
    top_k=1,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")

Setting `pad_token_id` to `eos_token_id`:11 for open-end generation.


Result: The user is interested in a tool to find relationships between medical terms.<___:::('('-('-('----------------
















:::::ersenceenceersenceersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersersers.---------.-.-----.-.-.......................................................
.....
.....
..




























































  - - - - -  - -  - -    - - - -      
