# CodeMind fine tuning
## Methods:
* model: gemma-2b-it

In [None]:
from google.colab import drive
from google.colab import userdata

drive.mount('/content/drive')
%cd /content/drive/MyDrive/CodeMind
!pip install -r requirements.txt --upgrade

In [1]:
import datasets
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from trl import SFTTrainer

In [2]:
import wandb

wandb.login()

wandb.init(
    project='google gemma 2b it',
    name='4 bit qlora',
)

wandb: Currently logged in as: jehwan-kim (codemind). Use `wandb login --relogin` to force relogin


In [5]:
import os

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_id = 'google/gemma-2b-it'
token = os.getenv('HF_READ') # userdata.get('HF_READ')

model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map={"": 0},
                                             token=token)
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, token=token)

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
model

GemmaForCausalLM(
  (model): GemmaModel(
    (embed_tokens): Embedding(256000, 2048, padding_idx=0)
    (layers): ModuleList(
      (0-17): 18 x GemmaDecoderLayer(
        (self_attn): GemmaSdpaAttention(
          (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear4bit(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear4bit(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): GemmaRotaryEmbedding()
        )
        (mlp): GemmaMLP(
          (gate_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (up_proj): Linear4bit(in_features=2048, out_features=16384, bias=False)
          (down_proj): Linear4bit(in_features=16384, out_features=2048, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): GemmaRMSNorm()
        (post_attention_layernorm): GemmaRMSNorm()
    

In [7]:
import bitsandbytes as bnb


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit  #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
        if 'lm_head' in lora_module_names:  # needed for 16-bit
            lora_module_names.remove('lm_head')
    return list(lora_module_names)


modules = find_all_linear_names(model)
print(modules)

['q_proj', 'gate_proj', 'down_proj', 'o_proj', 'v_proj', 'k_proj', 'up_proj']


In [8]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=32,
    target_modules=modules,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

trainable, total = model.get_nb_trainable_parameters()
print(f"Trainable: {trainable} | total: {total} | Percentage: {trainable / total * 100:.4f}%")

Trainable: 78446592 | total: 2584619008 | Percentage: 3.0351%


In [9]:
df = datasets.load_dataset('kreimben/leetcode_user_submissions', split='train')
dataset = df
dataset

Dataset({
    features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content'],
    num_rows: 109309
})

In [10]:
def generate_prompt(data_point):
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
                  'appropriately completes the request.\n\n'
    return f"""<start_of_turn>user {prefix_text} {data_point["question_content"]} <end_of_turn>\n<start_of_turn>model{data_point["content"]} <end_of_turn>"""


# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

dataset = dataset.train_test_split(test_size=0.2)
train_dataset = dataset["train"]
test_dataset = dataset["test"]

train_dataset, test_dataset

Map:   0%|          | 0/109309 [00:00<?, ? examples/s]

(Dataset({
     features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 87447
 }),
 Dataset({
     features: ['title_slug', 'question_content', 'tag', 'level', 'question_hints', 'view_count', 'vote_count', 'content', 'prompt', 'input_ids', 'attention_mask'],
     num_rows: 21862
 }))

In [11]:
import transformers

tokenizer.pad_token = tokenizer.eos_token
torch.cuda.empty_cache()

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    dataset_text_field="prompt",
    peft_config=lora_config,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=0.03,
        # max_steps=5000,
        logging_steps=10,
        output_dir="out",
        optim="paged_adamw_8bit",
        save_strategy="epoch",
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)



Map:   0%|          | 0/87447 [00:00<?, ? examples/s]

Map:   0%|          | 0/21862 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [12]:
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

  attn_output = torch.nn.functional.scaled_dot_product_attention(


Step,Training Loss
10,2.4686
20,1.7769
30,1.4424
40,1.2351
50,1.1595
60,1.3084
70,1.1737
80,1.1236
90,1.0852
100,1.0212


config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

Step,Training Loss
10,2.4686
20,1.7769
30,1.4424
40,1.2351
50,1.1595
60,1.3084
70,1.1737
80,1.1236
90,1.0852
100,1.0212


Step,Training Loss
10,2.4686
20,1.7769
30,1.4424
40,1.2351
50,1.1595
60,1.3084
70,1.1737
80,1.1236
90,1.0852
100,1.0212


Step,Training Loss
10,2.4686
20,1.7769
30,1.4424
40,1.2351
50,1.1595
60,1.3084
70,1.1737
80,1.1236
90,1.0852
100,1.0212


TrainOutput(global_step=65583, training_loss=0.5229590458030381, metrics={'train_runtime': 168674.6561, 'train_samples_per_second': 1.555, 'train_steps_per_second': 0.389, 'total_flos': 1.7257921077163745e+18, 'train_loss': 0.5229590458030381, 'epoch': 3.0})

In [45]:
from dotenv import load_dotenv

load_dotenv()

# upload the trained model to huggingface.
peft_model_id = 'kreimben/CodeMind'
write_token = os.getenv('HF_WRITE') # userdata.get
revision_id = 'gemma-2b-it-20240412'

In [39]:
%cd out/

!echo %cd%

C:\Users\aksid\PycharmProjects\CodeMind\fine-tuning\out


In [40]:
from peft import PeftModel

trainer.model.save_pretrained(peft_model_id+'Peft')

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map={"": 0},
)
merged_model = PeftModel.from_pretrained(base_model, peft_model_id+'Peft', revision=revision_id)
merged_model = merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained(peft_model_id+'Merged', safe_serialization=True)
tokenizer.save_pretrained(peft_model_id+'Merged')
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [49]:
from huggingface_hub import notebook_login

notebook_login()

merged_model.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)
tokenizer.push_to_hub(peft_model_id, token=write_token, use_temp_dir=True)

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

README.md:   0%|          | 0.00/432 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/kreimben/CodeMind/commit/807dd43dd5bdfbfa8522530fb8763fa6716bfc8e', commit_message='Upload tokenizer', commit_description='', oid='807dd43dd5bdfbfa8522530fb8763fa6716bfc8e', pr_url=None, pr_revision=None, pr_num=None)

In [47]:
def get_completion(query: str, model, tokenizer) -> str:
    device = "cuda:0"

    prompt_template = """
    <start_of_turn>user
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
    {query}
    <end_of_turn>\n<start_of_turn>model


    """
    prompt = prompt_template.format(query=query)

    encodeds = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    model_inputs = encodeds.to(device)

    generated_ids = model.generate(**model_inputs, max_new_tokens=1000, do_sample=True,
                                   pad_token_id=tokenizer.eos_token_id)
    # decoded = tokenizer.batch_decode(generated_ids)
    decoded = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
    return (decoded)

In [48]:
result = get_completion(query="leetcode 1 add sum. i don't know the approach.", model=merged_model, tokenizer=tokenizer)
print(result)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.



    user
    Below is an instruction that describes a task. Write a response that appropriately completes the request.
    leetcode 1 add sum. i don't know the approach.
    
model


     
    class Solution {\n    public:\n    int addSum(int a, int b) {                     \n        return a + b;    \n    }\n    }; \n\t\n          ```\n    ```     Paglinawan o  ```    Paglinawan o  ```  Paglinawan o  ```   Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ```  Paglinawan o  ``` 