In [1]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig
from transformers import set_seed
from transformers import AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model
from transformers import DataCollatorForLanguageModeling
from transformers import Trainer
import os
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModelForCausalLM
import torch
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc
import json


2024-10-28 11:46:06.185024: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-10-28 11:46:06.302549: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


2024-10-28 11:46:06.737160: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-10-28 11:46:06.737221: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [2]:
seed = 42
set_seed(seed)
model_name = "TinyLlama/TinyLlama_v1.1"

In [3]:
def train_model(base_config, lora_config, bnb_config, data, tokenizer, collator):
    global model_name

    if bnb_config:
        print("QLORA")
        bnb = BitsAndBytesConfig(
            load_in_4bit=bnb_config["load_in_4bit"],
            bnb_4bit_use_double_quant=bnb_config["bnb_4bit_use_double_quant"],
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=bnb_config["bnb_4bit_compute_dtype"]
        )
        
        model = AutoModelForCausalLM.from_pretrained(
                    model_name,
                    device_map="auto",
                    revision="main",
                    quantization_config = bnb
                )
    else:
        model = AutoModelForCausalLM.from_pretrained(
                model_name,
                device_map="auto",
                revision="main"
            )
    
    if lora_config:
        print("LORA")
        lora = LoraConfig(
            r = lora_config["r"],
            lora_alpha = lora_config["lora_alpha"],
            init_lora_weights = True,
            lora_dropout = lora_config["lora_dropout"],
            bias = 'none',
            task_type="CAUSAL_LM"
        )
        
        model = prepare_model_for_kbit_training(model)
        model = get_peft_model(model,lora)
    
    args = TrainingArguments(
        output_dir=".",
        fp16=base_config["fp16"],
        weight_decay=base_config["weight_decay"],
        learning_rate=base_config["learning_rate"],
        label_names=['input_ids'],
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        gradient_accumulation_steps=8,
        no_cuda=False,
        optim="paged_adamw_8bit"
    )
    
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=data,
        tokenizer=tokenizer,
        data_collator=collator
    )
    
    trainer.train()
    save_path = "tmp_trainer_smol"
    trainer.save_model(save_path)
    model.save_pretrained(save_path+"_peft")
    
    return model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

def reformat_func(example):
    example["full"] = "# <func>\n" + example["head"] + example["body"] + "\n</func>"
    return example

def tokenize_func(example):
    return tokenizer(example["full"], return_tensors="np",padding="max_length",max_length=1000)

data = Dataset.from_parquet("../data/chunks/chunk_1.parquet")

df = pd.DataFrame(data)

filtered_df = df[df["language"] == "Python"]

sampled_df = filtered_df.sample(frac=.1, random_state=42).reset_index(drop=True)

data = Dataset.from_pandas(sampled_df)

print(f"Original size after filtering: {len(filtered_df)}")
print(f"Sampled size (1%): {len(sampled_df)}")
print(sampled_df.head())


data = data.map(reformat_func)
tokenized_ds = data.map(tokenize_func, batched=True)



Original size after filtering: 34080
Sampled size (1%): 3408
  language                                               head  \
0   Python  # Python\n# ADD ME\n\ntest_build_epg_title(sel...   
1   Python  # Python\n# :param sceneName: (Optional) 场景名字，...   
2   Python  # Python\n# Save frames in animation \n\nsave(...   
3   Python  # Python\n# Sets the coordinate_y of this UIPr...   
4   Python  # Python\n# Command to value (in percent)\n\np...   

                                                body  \
0  item_helper = ItemHelper(constants=Constants()...   
1       self.sceneName = sceneName\nself.rate = rate   
2  global counter\nfilename = f'frame{counter:04d...   
3                  self._coordinate_y = coordinate_y   
4  assert 0 <= value <= 100, value\nsend_command(...   

                                    file_id  split  __index_level_0__  
0  1d09b47b21a7b9333e73343761e428640501363f      0            4567132  
1  d4d1d9b094838079345f9c23e1f12390cca425d0      0             5943

Map:   0%|          | 0/3408 [00:00<?, ? examples/s]

Map:   0%|          | 0/3408 [00:00<?, ? examples/s]

In [5]:
model = train_model({"weight_decay":0.1,"learning_rate":1e-4,"fp16":False},
            {"r":4096,"lora_alpha":4096,"lora_dropout":0.1},
            {"load_in_4bit":True,"bnb_4bit_use_double_quant":True,"bnb_4bit_compute_dtype":"bfloat16"},
            #False,
            tokenized_ds,tokenizer,collator)

prompt = """from typing import List\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n#>>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n# False\n# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n# True\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:"""

gen = pipeline(model=model, tokenizer=tokenizer, task="text-generation", device_map="auto",max_new_tokens=512)
print(gen(prompt))


gen = pipeline(model=model.merge_and_unload(), tokenizer=tokenizer, task="text-generation", device_map="auto",max_new_tokens=512)
print(gen(prompt))


gen = pipeline(model=AutoModelForCausalLM.from_pretrained(model_name), tokenizer=tokenizer, task="text-generation", device_map="auto",max_new_tokens=512)
print(gen(prompt))

QLORA




LORA


[2024-10-28 11:46:21,167] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status




  def forward(ctx, input, weight, bias=None):
  def backward(ctx, grad_output):


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


  return fn(*args, **kwargs)


  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss


The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausal

  return fn(*args, **kwargs)


[{'generated_text': "from typing import List\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n#>>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n# False\n# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n# True\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for number in numbers:\n        if number < threshold:\n            return False\n        else:\n            return True\n\nhas_close_elements(numbers: List[float], threshold: float) -> bool:\n    return has_close_elements(numbers, threshold)\n\n</func>\n\n\nA: I'm not sure if this is the best way to do it, but I'm trying to find the closest number to another number.\nI'm using the following function:\ndef closest_number(numbers: List[float], threshold: float):\n    closest_number = 0\n    for number in numbers:\n        if number < threshold:\n            closest_number = number\n    return closest_number\n\nI'm trying t



[{'generated_text': 'from typing import List\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n#>>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n# False\n# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n# True\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for number in numbers:\n        if number < threshold:\n            return False\n    return True\n\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n#>>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n# False\n# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n# True\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    for number in numbers:\n        if number < threshold:\n            return False\n    return True\n\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'generated_text': 'from typing import List\n# <func>\n# Python\n# Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n#>>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n# False\n# >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n# True\ndef has_close_elements(numbers: List[float], threshold: float) -> bool:\n    """\n    Check if in given list of numbers, are any two numbers closer to each other than given threshold.\n    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)\n    True\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    False\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)\n    True\n    >>> ha