In [1]:
from transformers import set_seed

seed = 42
set_seed(seed)

2024-10-09 13:58:32.240141: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-09 13:58:32.356055: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2024-10-09 13:58:32.825861: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-10-09 13:58:32.825918: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r = 64,
    lora_alpha = 16,
    init_lora_weights = False,
    lora_dropout = 0.1,
    bias = 'none',
    task_type="CAUSAL_LM"
)

In [3]:
from transformers import AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model

model_name = "TinyLlama/TinyLlama_v1.1"
def init_tokenizer(model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    print(tokenizer)
    return tokenizer

In [4]:
def init_model(model_name, tokenizer, bnb_config):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        revision="main",
        quantization_config=bnb_config
    )

    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)
    model.config.pad_token_id = tokenizer.pad_token_id

    return model


#model = init_model(model_name, tokenizer, bnb_config, lora_config)

In [5]:
from trl import DataCollatorForCompletionOnlyLM

def init_collator(tokenizer):
    return DataCollatorForCompletionOnlyLM(
        tokenizer=tokenizer,
        mlm=False,
        response_template="<|body|>"
    )


In [6]:
def print_trainable_parameters(model):
  """
  Prints the number of trainable parameters in the model.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()
  print(
    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
  )

In [7]:
from trl import SFTConfig

def init_args():
    training_args = SFTConfig(
        output_dir=f"./results/chunked/supervised",
        #label_names=['input_ids'],
        weight_decay=0.01,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        dataloader_num_workers=4,
        logging_dir="./logs",
        logging_strategy="steps",
        learning_rate=1e-5,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant':False},
        fp16=True,
        no_cuda=False,
        #tf32=True,
        optim="paged_adamw_8bit",
        logging_steps=250,
        save_strategy="no",
        #save_steps = 800,
        eval_strategy="steps",
        eval_steps=3200,
        #remove_unused_columns=False
    )
    return training_args

In [8]:
from trl import SFTTrainer

def init_trainer(model, args, train, valid, tokenizer, data_collator):
    return SFTTrainer(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer
    )


In [None]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
import numpy as np

def tokenize_fn(example):
    comment = "#" if example["language"] in ["Shell","Ruby","Python"] else "//"
    example["input"] = comment + " <func>\n" + example["head"] 
    example["output"] = example["input"] + example["body"] + "\n" + comment + " </func>"
    return example
    
def tokenize_fn_2(example):
    tokenized_data = tokenizer(
        example["output"], 
        padding="max_length",
        truncation=True,
        return_tensors="np"
    )
    labels = np.copy(tokenized_data["input_ids"])
    
    input_length = len(tokenizer(example["input"])["input_ids"]) + 1 
    labels[:input_length] = -100
    
    tokenized_data["labels"] = labels
    return tokenized_data


def load_dataset(seed):
    df = pd.read_parquet(f"data/chunks/chunk_{seed+1}.parquet")
    
    train, valid = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)
    
    ds = DatasetDict({
        'train': Dataset.from_pandas(train),
        'valid': Dataset.from_pandas(valid)}
    )
    
   # ds = ds.map(special_tokens)
    tokenized_ds = ds.map(tokenize_fn)
    tokenized_ds = tokenized_ds.map(tokenize_fn_2, batched=True)
    print(tokenized_ds)
    
    
    del df
    del train
    del valid
    del ds
    gc.collect()
    
    return tokenized_ds


In [10]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [11]:
%%time

import numpy as np

model_name = "Microsoft/Phi-1"

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = init_tokenizer(model_name)
tokenizer.pad_token = "<|pad|>"
data_collator = init_collator(tokenizer)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(model, peft_config=lora_config)
print_trainable_parameters(model)

training_args = init_args()
 
tokenized_ds = load_dataset(0)
trainer = init_trainer(
    model,
    training_args,
    tokenized_ds["train"],
    tokenized_ds["valid"],
    tokenizer, 
    data_collator
)

# skipped = [3,4,9,11,13,19,20,22,23,25,27,28]
# 22 failed at [5907/6401 6:59:08 < 35:03, 0.23 it/s, Epoch 0.92/1]
# 23 failed at [6095/6401 7:10:49 < 21:38, 0.24 it/s, Epoch 0.95/1] (RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "../c10/cuda/CUDACachingAllocator.cpp":813, please report a bug to PyTorch.)

# skipped 21
num_epochs = 100
for i in [1,2,5,6,7,8,10,12,14,15,16,17,18,24,26,29,30,31,32,33,34,35,37]:
    print(i)
    if i:
        tokenized_ds = load_dataset(i)
        
        trainer.train_dataset = tokenized_ds["train"]
        trainer.eval_dataset = tokenized_ds["valid"]
        
    print(trainer.train_dataset[0])
        
        #trainer.args.num_train_epochs = i+1

    trainer.train()
    trainer.save_model(f"./results/tagged/checkpoint-{i}")



CodeGenTokenizerFast(name_or_path='Microsoft/Phi-1', vocab_size=50257, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50257: AddedToken("                               ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50258: AddedToken("                              ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50259: AddedToken("                             ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50260: AddedToken("                            ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50261: AddedToken("            

trainable params: 44040192 || all params: 858331136 || trainable%: 5.1309092904675895


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2322 > 2048). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12804
    })
})
1


  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12804
    })
})


{'language': 'Python', 'head': '# Python\n# Returns a function that converts an SQL value to a Python value.\n\nresult_processor(self, dialect, coltype):\n', 'body': 'def process(value):\n    """\n      Converts a SQL Geometry string to a corresponding Python value depending\n      on the actual Geometry type stored in the db.\n      """\n    if value is None:\n        return None\n    raise ValueError(\'NEED TO TODO THIS PART\')\nreturn process', 'file_id': '4f848ad88dae074294bcfcc6c3dda3c9437a4386', 'split': 1, '__index_level_0__': 449734, 'input': '# <func>\n# Python\n# Returns a function that converts an SQL value to a Python value.\n\nresult_processor(self, dialect, coltype):\n\ndef process(value):\n    """\n      Converts a SQL Geometry string to a corresponding Python value depending\n      on the actual Geometry type stored in the db.\n      """\n    if value is None:\n        return None\n    raise ValueError(\'NEED TO TODO THIS PART\')\nreturn process\n# </func>', 'input_ids'

/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status






  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss
3200,1.5534,1.559397
6400,1.5491,1.540126


2


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12804 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'head', 'body', 'file_id', 'split', '__index_level_0__', 'input', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 12804
    })
})
{'language': 'Python', 'head': '# Python\n# Get the edos array.\n\nedos(self):\n', 'body': 'return self._edos', 'file_id': 'a38f3edbd06be08ea2f66040e4b572980bf6f17a', 'split': 2, '__index_level_0__': 439856, 'input': '# <func>\n# Python\n# Get the edos array.\n\nedos(self):\n\nreturn self._edos\n# </func>', 'input_ids': [2, 1279, 20786, 29, 198, 2, 11361, 198, 2, 3497, 262, 1225, 418, 7177, 13, 198, 198, 276, 418, 7, 944, 2599, 198, 198, 7783, 2116, 13557, 276, 418, 198, 2, 7359, 20786, 29, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 

Step,Training Loss,Validation Loss
3200,1.4852,1.51173


In [None]:
print(trainer.train_dataset[1]["body"])