In [1]:
#from huggingface_hub import login
#login()

In [2]:
from transformers import set_seed

seed = 42
set_seed(seed)

In [3]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r = 64,
    lora_alpha = 16,
    init_lora_weights = False,
    lora_dropout = 0.1,
    bias = 'none',
    task_type="CAUSAL_LM"
)

In [4]:
from transformers import AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model
from transformers import DataCollatorForLanguageModeling

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "microsoft/phi-2"

def init_tokenizer(model_name):
    print(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.bos_token = "<func>"
    tokenizer.eos_token = "</func>"
    print(tokenizer)
    #if model_name == "microsoft/phi-2": 
    tokenizer.pad_token = "</s>"
    return tokenizer

In [5]:
def init_model(model_name, tokenizer, bnb_config, lora_config):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        revision="main",
        quantization_config=bnb_config
    )

    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)
    model.config.pad_token_id = tokenizer.pad_token_id

    return model


#model = init_model(model_name, tokenizer, bnb_config, lora_config)

In [6]:
def init_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )


In [7]:
def print_trainable_parameters(model):
  """
  Prints the number of trainable parameters in the model.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()
  print(
    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
  )

In [8]:
def init_args():
    training_args = TrainingArguments(
        output_dir=f"./results/TEST",
        label_names=['input_ids'],
        weight_decay=0.01,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        dataloader_num_workers=4,
        logging_dir="./logs",
        logging_strategy="steps",
        learning_rate=1e-4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant':False},
        fp16=True,
        no_cuda=False,
        #tf32=True,
        optim="paged_adamw_8bit",
        logging_steps=250,
        save_strategy="epoch",
        #save_steps = 800,
        eval_strategy="steps",
        eval_steps=1500
    )
    return training_args

# TODO custom data loader?

In [9]:
from transformers import Trainer

def init_trainer(model, args, train, valid, tokenizer, data_collator):
    return Trainer(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )


In [10]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc

def add_special_tokens(example):
    example['body'] = f"{tokenizer.bos_token} {example['body']} {tokenizer.eos_token}"
    return example


def tokenize_function(examples):
    return tokenizer(
    examples['body'], 
    return_tensors="np", 
    padding="max_length",
)


# Function to sample fixed number of rows from each group
def sample_fixed_per_group(df, n_samples, random_state=None):
    return df.groupby("language").apply(lambda x: x.sample(n=n_samples, random_state=random_state)).reset_index(drop=True)


def stratified_sample(df, frac, random_state=None):
    grouped = df.groupby('language')
    sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)
    return sampled_df


def load_dataset(seed, data_split_type):
    df = pd.read_parquet("./data/filtered_funcs.parquet")
    if data_split_type == "fixed":
        samples_per_group = round(len(df)/1000)
        df = sample_fixed_per_group(df, n_samples=samples_per_group,random_state=seed)
    elif data_split_type == "stratified":
        df = stratified_sample(df, frac=0.01, random_state=seed)
    
    #df = pd.read_parquet(f"data/1percent_fixed_{seed}.parquet")

    train, valid = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)
    
    ds = DatasetDict({
        'train': Dataset.from_pandas(train),
        'valid': Dataset.from_pandas(valid)}
    )
    
    ds = ds.map(add_special_tokens)
    tokenized_ds = ds.map(tokenize_function, batched=True)
    print(tokenized_ds)
    
    del df
    del train
    del valid
    del ds
    gc.collect()
    
    return tokenized_ds


In [11]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [12]:
%%time
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = init_tokenizer(model_name)
data_collator = init_collator(tokenizer)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(model, peft_config=lora_config)
print_trainable_parameters(model)

data_split_type = "stratified"
training_args = init_args()
 
tokenized_ds = load_dataset(0, data_split_type)
trainer = init_trainer(
    model,
    training_args,
    tokenized_ds["train"],
    tokenized_ds["valid"],
    tokenizer, 
    data_collator
)

num_epochs = 100
for i in range(num_epochs):
    print(i)
    if i:
        tokenized_ds = load_dataset(i, data_split_type)
        
        trainer.train_dataset = tokenized_ds["train"]
        trainer.eval_dataset = tokenized_ds["valid"]
        
    print(trainer.train_dataset[0])
        
        #trainer.args.num_train_epochs = i+1

    trainer.train()
    trainer.save_model(f"./results/{data_split_type}/checkpoint-{i}-fixed")


TinyLlama/TinyLlama-1.1B-Chat-v1.0
LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<func>', 'eos_token': '</func>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


trainable params: 9011200 || all params: 624617472 || trainable%: 1.4426749817206521


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
0
{'language': 'Python', 'body': '<func> # Python\n# EnvProps - a model defined in Swagger\n__init__(self, analytics_props, cluster_info, custom_theme, user_info, version_info):\nself._analytics_props = None\nself._cluster_info = None\nself._custom_theme = None\nself._user_info = None\nself._version_info = None\nself.discriminator = None\nif analytics_props is not None:\n    self.analytics_props = analytics_props\nif cluster_info is not None:\n    self.cluster_info = cluster_info\nif custom_theme is not None:\n    self.custom_theme = custom_theme\nif user_info is not None:\n    self.user_info = user_info\nif version_info is not None:\n    self.version_info = version_info </func>', '__in

[2024-08-25 18:41:44,502] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


Step,Training Loss,Validation Loss
1500,1.3936,1.40407
3000,1.4028,1.38848
4500,1.3897,1.380932
6000,1.3564,1.377175


1


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': "<func> # Python\n# @deprecated replaced by separate widgets for relf and hifi\nilp_treeliker(input_dict):\ntemplate = input_dict['template']\ndataset = input_dict['dataset']\nsettings = {'algorithm': input_dict.get('algorithm'), 'minimum_frequency': input_dict.get('minimum_frequency'), 'covered_class': input_dict.get('covered_class'), 'maximum_size': input_dict.get('maximum_size'), 'use_sampling': input_dict.get('use_sampling'), 'sample_size': input_dict.get('sample_size'), 'max_degree': input_dict.get('max_degree')}\ntreeliker = TreeLiker(dataset, template, settings=settings)\narff_train, arff_test = treeliker.run()\nreturn {'arff': arff_train, 'treelike

Step,Training Loss,Validation Loss
1500,1.3823,1.370984
3000,1.3654,1.364724
4500,1.3664,1.361109
6000,1.3685,1.358295


2


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': '<func> # Python\n# Initialize the Axis event.\n__init__(self, device):\nself.device = device\nself._attr_device_info = DeviceInfo(identifiers={(AXIS_DOMAIN, device.unique_id)}) </func>', '__index_level_0__': 30748, 'input_ids': [1, 529, 9891, 29958, 396, 5132, 13, 29937, 25455, 278, 319, 11497, 1741, 29889, 13, 1649, 2344, 12035, 1311, 29892, 4742, 1125, 13, 1311, 29889, 10141, 353, 4742, 13, 1311, 3032, 5552, 29918, 10141, 29918, 3888, 353, 21830, 3401, 29898, 1693, 14903, 3790, 29898, 6604, 3235, 29918, 3970, 29032, 29892, 4742, 29889, 13092, 29918, 333, 26972, 1533, 9891, 29958, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

Step,Training Loss,Validation Loss
1500,1.3681,1.361935
3000,1.3761,1.357388
4500,1.3465,1.354135
6000,1.3688,1.35193


3


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': "<func> # Python\n# Returns the field's value on the given object instance, or the\n# field's default value if no value for the field is available.\n# \n# Note the field's value will be decoded from API data if necessary,\n# raising any exceptions that the field's `decode()` method may raise.\n__get__(self, obj, cls):\nif obj is None:\n    return self\nif self.attrname not in obj.__dict__:\n    try:\n        value = obj.api_data[self.api_name]\n    except KeyError:\n        if callable(self.default):\n            value = self.default(obj)\n        else:\n            value = self.default\n    else:\n        value = self.decode(value)\n    obj.__dict__[self.

Step,Training Loss,Validation Loss
1500,1.338,1.352023
3000,1.3612,1.349211
4500,1.3427,1.346728
6000,1.3573,1.344594


4


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})
{'language': 'Python', 'body': '<func> # Python\n# Builds given targets and returns paths to their important outputs.\n# \n# Args:\n#   labels: Labels of the targets to build.\n#   tail_args: Arguments to append to the Bazel command.\n# \n# Returns:\n#   For each label returns a list of its important outputs.\nbuild_targets(self, labels, tail_args):\nraise NotImplementedError </func>', '__index_level_0__': 30748, 'input_ids': [1, 529, 9891, 29958, 396, 5132, 13, 29937, 8878, 29879, 2183, 22525, 322, 3639, 10898, 304, 1009, 4100, 14391, 29889, 13, 29937, 29871, 13, 29937, 826, 3174, 29901, 13, 29937, 259, 11073, 29901, 15796, 29879, 310, 278, 22525, 304, 2048, 29889, 13, 29937, 259, 1246

Step,Training Loss,Validation Loss
1500,1.3507,1.350961
