# CodeMind fine tuning
## Methods:
* model: gemma-2b-it

In [1]:
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [2]:
import wandb

wandb.login()

wandb.init(
    project='google gemma 2b it',
    name='peft-qlora-1epoch',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [3]:
import os

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google/gemma-2b-it'
token = os.getenv('HF_READ')

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0},
                                             token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, token=token)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [5]:
# Find the modules in the model for qlora target modules.
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
modules

['down_proj', 'up_proj', 'v_proj', 'q_proj', 'o_proj', 'k_proj', 'gate_proj']

In [6]:
lora_config = LoraConfig(
    r=8,  #64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable / total * 100:.4f}%")

Trainable: 9805824 | total: 2515978240 | Percentage: 0.3897%


In [7]:
# dataset = datasets.load_dataset('kreimben/leetcode_user_submissions', split='train')
dataset = datasets.load_dataset('csv', data_files='../user_submission_only_python.csv', split='train')
dataset

Dataset({
    features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content'],
    num_rows: 17861
})

In [8]:
def generate_prompt(data_point):
    prefix_text = """Below is an coding test problem. Write a response that appropriately completes the request.\n\n"""
    return f"""<start_of_turn>user {prefix_text}I don't know {data_point['title_slug']} problem. give me the insight or appoach. here are some content of question.\n
{data_point["question_content"]}<end_of_turn>\n<start_of_turn>model {data_point["content"]}<end_of_turn>"""


# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=42)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

train_dataset, test_dataset

Map:   0%|          | 0/17861 [00:00<?, ? examples/s]

(Dataset({
     features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 14288
 }),
 Dataset({
     features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 3573
 }))

In [11]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0.03,
        num_train_epochs=3,
        logging_steps=10,
        output_dir="out",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

Map:   0%|          | 0/3573 [00:00<?, ? examples/s]

In [12]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

Step,Training Loss
10,1.153
20,1.214
30,1.104
40,0.9962
50,1.1293
60,1.0055
70,0.993
80,0.9683
90,0.8955
100,1.062


Step,Training Loss
10,1.153
20,1.214
30,1.104
40,0.9962
50,1.1293
60,1.0055
70,0.993
80,0.9683
90,0.8955
100,1.062


Step,Training Loss
10,1.153
20,1.214
30,1.104
40,0.9962
50,1.1293
60,1.0055
70,0.993
80,0.9683
90,0.8955
100,1.062


TrainOutput(global_step=10716, training_loss=0.6192928239363884, metrics={'train_runtime': 26823.9846, 'train_samples_per_second': 1.598, 'train_steps_per_second': 0.399, 'total_flos': 2.974800094267392e+17, 'train_loss': 0.6192928239363884, 'epoch': 3.0})

In [13]:
from dotenv import load_dotenv

load_dotenv()

# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind-gemma'
write_token = os.getenv('HF_WRITE')

In [14]:
%cd out/

!echo %cd%

  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out
C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out


In [15]:
from peft import PeftModel

trainer.model.save_pretrained(peft_model_id + 'Peft')

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model = PeftModel.from_pretrained(base_model, peft_model_id + 'Peft')
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(peft_model_id + 'Merged', safe_serialization=True)
tokenizer.save_pretrained(peft_model_id + 'Merged')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [16]:
from huggingface_hub import notebook_login

notebook_login()

merged_model.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/697 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/354f552da315934844cd6f59ea149a256da3c57a', commit_message='Upload GemmaForCausalLM', commit_description='', oid='354f552da315934844cd6f59ea149a256da3c57a', pr_url=None, pr_revision=None, pr_num=None)

In [17]:
tokenizer.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

README.md:   0%|          | 0.00/685 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind-gemma/commit/430280ac5f2c42d01953f73b96164db3fec72195', commit_message='Upload tokenizer', commit_description='', oid='430280ac5f2c42d01953f73b96164db3fec72195', pr_url=None, pr_revision=None, pr_num=None)