In [1]:
#from huggingface_hub import login
#login()

In [2]:
from transformers import set_seed

seed = 42
set_seed(seed)

In [3]:
import torch
from transformers import AutoModelForCausalLM, TrainingArguments, BitsAndBytesConfig
from peft import LoraConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

lora_config = LoraConfig(
    r = 64,
    lora_alpha = 16,
    init_lora_weights = False,
    lora_dropout = 0.1,
    bias = 'none',
    task_type="CAUSAL_LM"
)

In [4]:
from transformers import AutoTokenizer
from peft import prepare_model_for_kbit_training, get_peft_model
from transformers import DataCollatorForLanguageModeling

model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
#model_name = "microsoft/phi-2"

def init_tokenizer(model_name):
    print(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.bos_token = "<func>"
    tokenizer.eos_token = "</func>"
    print(tokenizer)
    #if model_name == "microsoft/phi-2": 
    tokenizer.pad_token = "</s>"
    return tokenizer

In [5]:
def init_model(model_name, tokenizer, bnb_config, lora_config):
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        revision="main",
        quantization_config=bnb_config
    )

    model.config.use_cache = False
    model = prepare_model_for_kbit_training(model)
    model.config.pad_token_id = tokenizer.pad_token_id

    return model


#model = init_model(model_name, tokenizer, bnb_config, lora_config)

In [6]:
def init_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )


In [7]:
def print_trainable_parameters(model):
  """
  Prints the number of trainable parameters in the model.
  """
  trainable_params = 0
  all_param = 0
  for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
      trainable_params += param.numel()
  print(
    f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
  )

In [8]:
def init_args():
    training_args = TrainingArguments(
        output_dir=f"./results/",
        label_names=['input_ids'],
        weight_decay=0.01,
        num_train_epochs=1,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        gradient_accumulation_steps=8,
        dataloader_num_workers=4,
        logging_dir="./logs",
        logging_strategy="steps",
        learning_rate=1e-4,
        gradient_checkpointing=True,
        gradient_checkpointing_kwargs={'use_reentrant':False},
        fp16=True,
        no_cuda=False,
        #tf32=True,
        optim="paged_adamw_8bit",
        logging_steps=250,
        save_strategy="epoch",
        #save_steps = 800,
        eval_strategy="steps",
        eval_steps=1500
    )
    return training_args

# TODO custom data loader?

In [9]:
from transformers import Trainer

def init_trainer(model, args, train, valid, tokenizer, data_collator):
    return Trainer(
        model=model,
        args=args,
        train_dataset=train,
        eval_dataset=valid,
        tokenizer=tokenizer,
        data_collator=data_collator
    )


In [10]:
from datasets import DatasetDict, Dataset
from sklearn.model_selection import train_test_split
import pandas as pd
import gc

def add_special_tokens(example):
    example['body'] = f"{tokenizer.bos_token} {example['body']} {tokenizer.eos_token}"
    return example


def tokenize_function(examples):
    return tokenizer(
    examples['body'], 
    return_tensors="np", 
    padding="max_length",
)


# Function to sample fixed number of rows from each group
def sample_fixed_per_group(df, n_samples, random_state=None):
    return df.groupby("language").apply(lambda x: x.sample(n=n_samples, random_state=random_state)).reset_index(drop=True)


def stratified_sample(df, frac, random_state=None):
    grouped = df.groupby('language')
    sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)
    return sampled_df


def load_dataset(seed, data_split_type):
    df = pd.read_parquet("./data/filtered_funcs.parquet")
    if data_split_type == "fixed":
        samples_per_group = round(len(df)/1000)
        df = sample_fixed_per_group(df, n_samples=samples_per_group,random_state=seed)
    elif data_split_type == "stratified":
        df = stratified_sample(df, frac=0.01, random_state=seed)
    
    #df = pd.read_parquet(f"data/1percent_fixed_{seed}.parquet")

    train, valid = train_test_split(df, train_size=0.8, test_size=0.2, random_state=42)
    
    ds = DatasetDict({
        'train': Dataset.from_pandas(train),
        'valid': Dataset.from_pandas(valid)}
    )
    
    ds = ds.map(add_special_tokens)
    tokenized_ds = ds.map(tokenize_function, batched=True)
    print(tokenized_ds)
    
    del df
    del train
    del valid
    del ds
    gc.collect()
    
    return tokenized_ds


In [11]:
import os

os.environ["TOKENIZERS_PARALLELISM"] = "true"

In [12]:
%%time
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
    device_map="auto",
    quantization_config=bnb_config
)

tokenizer = init_tokenizer(model_name)
data_collator = init_collator(tokenizer)

model.config.use_cache = False
model = prepare_model_for_kbit_training(model)
model.config.pad_token_id = tokenizer.pad_token_id
model = get_peft_model(model, peft_config=lora_config)
print_trainable_parameters(model)

data_split_type = "stratified"
training_args = init_args()

num_epochs = 100
for i in range(num_epochs):
    tokenized_ds = load_dataset(i, data_split_type)
    
    trainer = init_trainer(
        model,
        training_args,
        tokenized_ds["train"],
        tokenized_ds["valid"],
        tokenizer, 
        data_collator
    )
    
    trainer.train()
    trainer.save_model(f"./results/{data_split_type}/checkpoint-{i}")


TinyLlama/TinyLlama-1.1B-Chat-v1.0
LlamaTokenizerFast(name_or_path='TinyLlama/TinyLlama-1.1B-Chat-v1.0', vocab_size=32000, model_max_length=2048, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<func>', 'eos_token': '</func>', 'unk_token': '<unk>', 'pad_token': '</s>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}


trainable params: 9011200 || all params: 624617472 || trainable%: 1.4426749817206521


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})


[2024-08-24 00:16:00,061] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)




/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


Step,Training Loss,Validation Loss
1500,1.3936,1.404054
3000,1.4028,1.38849
4500,1.3897,1.380924
6000,1.3564,1.377169


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})


Step,Training Loss,Validation Loss
1500,1.3825,1.371243
3000,1.3656,1.364858
4500,1.3667,1.361206
6000,1.3688,1.358397


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})


Step,Training Loss,Validation Loss
1500,1.368,1.362067
3000,1.3762,1.357525
4500,1.3463,1.354237
6000,1.3685,1.351982


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})


Step,Training Loss,Validation Loss
1500,1.3381,1.352139
3000,1.3614,1.349274
4500,1.3426,1.346777
6000,1.3572,1.344632


  sampled_df = grouped.apply(lambda x: x.sample(frac=frac, random_state=random_state)).reset_index(drop=True)


Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

Map:   0%|          | 0/51212 [00:00<?, ? examples/s]

Map:   0%|          | 0/12803 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 51212
    })
    valid: Dataset({
        features: ['language', 'body', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 12803
    })
})


Step,Training Loss,Validation Loss
1500,1.3511,1.351206
3000,1.3346,1.348564
