# Install & Google Mount

In [None]:
# Mount Google Drive at /content/drive; checkpoints and logs are written
# under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the training stack.
# NOTE(review): nothing here is version-pinned, and transformers / peft /
# accelerate are installed from the GitHub default branch, so a re-run may pull
# different (possibly incompatible) versions. Consider pinning known-good
# releases for reproducibility.
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [None]:
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
import numpy as np
import pandas as pd
import wandb

# Checking the GPU (A100)

In [None]:
# Show the attached GPU, driver/CUDA versions and current memory usage
!nvidia-smi

Sun Apr 14 14:18:16 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA A100-SXM4-40GB          Off | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0              41W / 400W |      2MiB / 40960MiB |      0%      Default |
|                                         |                      |             Disabled |
+-----------------------------------------+----------------------+----------------------+
                                                                    

# HF & Wandb login

In [None]:
# Hugging Face Hub login: used for downloading the model/dataset and for
# pushing the fine-tuned model at the end of the notebook.
# NOTE(review): this install and import sit mid-notebook; consider moving them
# to the setup cells at the top.
!pip install huggingface_hub
from huggingface_hub import notebook_login

# Opens the interactive token prompt in the notebook
notebook_login()

In [None]:
# Authenticate with Weights & Biases
wandb.login()
# Start the W&B run used to track this training job.
# The name / project / entity strings below are placeholders: replace them
# with real values before running.
run = wandb.init(name="name", project="your-project name", job_type="training", anonymous="allow", entity="your-entity")

# Loading the Model & Dataset

In [None]:
# Base checkpoint: instruction-tuned Gemma 7B
model_id = "google/gemma-7b-it"

# Quantization settings: 4-bit NF4 weights with double quantization,
# computing in bfloat16
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

# Load the quantized model, placing every module on GPU 0
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

Gemma's activation function should be approximate GeLU and not exact GeLU.
Changing the activation function to `gelu_pytorch_tanh`.if you want to use the legacy `gelu`, edit the `model.config` to set `hidden_activation=gelu`   instead of `hidden_act`. See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [None]:
# Load the LeetCode problems-with-solutions dataset from the Hugging Face Hub
dataset_name = "LimYeri/LeetCode_with_Solutions"
dataset = load_dataset(dataset_name)
# Bare expression so the notebook displays the DatasetDict summary
dataset

Downloading readme:   0%|          | 0.00/955 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/40.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/34903 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question_content', 'title_slug', 'tag', 'level', 'question_hints', 'content'],
        num_rows: 34903
    })
})

In [None]:
# Convert the train split to a pandas DataFrame; it is the input to the
# prompt-formatting step further down.
df = pd.DataFrame(dataset['train'])

In [None]:
# The generation KV cache is not needed for training; switch it off before
# enabling gradient checkpointing to trade compute for activation memory.
model.config.use_cache = False
model.gradient_checkpointing_enable()

# Load tokenizer.
# `trust_remote_code` is intentionally not passed: no custom Hub code is needed
# for this checkpoint, and the flag would allow arbitrary repository code to run.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.padding_side = 'right'
# Keep the tokenizer's own pad token when it has one. Aliasing pad to EOS means
# that any collator which masks padding positions in the labels would also mask
# the genuine end-of-sequence token, so the model is never trained to stop.
# Fall back to EOS only when no pad token is defined at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Append EOS to every tokenized training example
tokenizer.add_eos_token = True
# Bare expression: display the BOS/EOS flags as a sanity check
tokenizer.add_bos_token, tokenizer.add_eos_token

tokenizer_config.json:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/888 [00:00<?, ?B/s]

(True, True)

# Calculating the number of trainable parameters

In [None]:
# Get the quantized model ready for k-bit (here 4-bit) training.
model = prepare_model_for_kbit_training(model)

# (Q)LoRA adapter definition: rank-64 adapters on the attention and MLP
# projection layers of a causal LM, with no bias terms trained.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=['o_proj', 'q_proj', 'up_proj', 'v_proj', 'k_proj', 'down_proj', 'gate_proj'],
)

# Wrap the base model with the LoRA adapters described above.
model = get_peft_model(model, peft_config)

# Report how many parameters will actually receive gradients.
trainable, total = model.get_nb_trainable_parameters()
percentage = trainable / total * 100
print(f"Trainable: {trainable} | total: {total} | Percentage: {percentage:.4f}%")

Trainable: 200015872 | total: 8737696768 | Percentage: 2.2891%


In [None]:
# import os
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = "expandable_segments:True"

In [None]:
# Release cached CUDA memory before the trainer allocates its own buffers
torch.cuda.empty_cache()

# Hyperparameters
training_arguments = TrainingArguments(
    # Where checkpoints and logs are written (Google Drive)
    output_dir="/content/drive/MyDrive/Colab Notebooks/CodeMind/checkpoint",
    logging_dir="/content/drive/MyDrive/Colab Notebooks/CodeMind/logs",
    # Schedule: 3 epochs; max_steps=-1 leaves the step count to the epoch setting
    num_train_epochs=3,
    max_steps=-1,
    warmup_ratio=0.03,
    learning_rate=2e-4,
    # Batch of 1 per device, accumulated over 4 steps
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    group_by_length=True,
    optim="paged_adamw_8bit",
    # Logging / checkpointing
    logging_steps=20,
    save_strategy="epoch",
    report_to="wandb",
)

In [None]:
# Record the key hyperparameters on the active W&B run.
# `wandb.config = {...}` would only rebind the module attribute to a plain dict
# and never reach the run, so update the existing config object instead.
wandb.config.update({
    "learning_rate": 2e-4,
    "epochs":3,
    "per_device_train_batch_size":1,
    "gradient_accumulation_steps":4,
    "optim":"paged_adamw_8bit",
    "logging_steps":20,
    "save_strategy":"epoch"
})
# Let W&B hook into the model for tracking during training
wandb.watch(model)

[]

# Formatting the Dataset

In [None]:
def generate_coding_test_prompt(data_point):
    """Format every row of ``data_point`` as a two-turn Gemma chat example.

    Args:
        data_point: Column-oriented mapping (a dict of equal-length lists or a
            pandas DataFrame) with the columns ``question_content``, ``tag``,
            ``question_hints`` and ``content``.

    Returns:
        list[str]: One training string per row, laid out as
        ``<start_of_turn>user\\n{prompt}<end_of_turn>\\n<start_of_turn>model\\n{solution}<end_of_turn>``.
        The role name is followed by a newline and the turn text is stripped of
        surrounding whitespace, matching Gemma's chat turn format.

    Notes:
        The "Hints" section is omitted when the hint is ``None``, NaN or empty.
        Rows are paired positionally, so a DataFrame with a non-default index
        is handled as well.
    """
    # Loop-invariant instruction placed at the start of every user turn.
    prefix_text = 'Solve the coding problem described below, considering the suggested data structures and techniques provided.\n\n'

    def _has_hints(value):
        # None, float NaN (NaN != NaN) and empty values all mean "no hints".
        if value is None:
            return False
        if isinstance(value, float) and value != value:
            return False
        return bool(value)

    rows = zip(
        data_point['question_content'],
        data_point['tag'],
        data_point['question_hints'],
        data_point['content'],
    )

    output_texts = []
    for question, tags, hint, solution in rows:
        problem_description = f"Problem Description:\n{question}\n\n"
        used_data_structures_techniques = f"Suggested Data Structures and Techniques: {tags}\n"
        hints = f"Hints:\n{hint}\n\n" if _has_hints(hint) else ""

        user_turn = f"{prefix_text}{problem_description}{used_data_structures_techniques}{hints}".strip()
        model_turn = f"{solution}".strip()

        output_texts.append(
            f"<start_of_turn>user\n{user_turn}<end_of_turn>\n"
            f"<start_of_turn>model\n{model_turn}<end_of_turn>"
        )

    return output_texts

In [None]:
# Build one formatted prompt per DataFrame row and report how many were produced
output_texts = generate_coding_test_prompt(df)
print(len(output_texts))

34903


In [None]:
# Inspect the first formatted prompt as a sanity check
output_texts[0]

'<start_of_turn>user Solve the coding problem described below, considering the suggested data structures and techniques provided.\n\nProblem Description:\nGiven two strings `s` and `t` of lengths `m` and `n` respectively, return _the **minimum window**_ **_substring_** _of_ `s` _such that every character in_ `t` _(**including duplicates**) is included in the window_. If there is no such substring, return _the empty string_ `" "`.\n\nThe testcases will be generated such that the answer is **unique**.\n\n**Example 1:**\n\n**Input:** s =  "ADOBECODEBANC ", t =  "ABC "\n**Output:**  "BANC "\n**Explanation:** The minimum window substring  "BANC " includes \'A\', \'B\', and \'C\' from string t.\n\n**Example 2:**\n\n**Input:** s =  "a ", t =  "a "\n**Output:**  "a "\n**Explanation:** The entire string s is the minimum window.\n\n**Example 3:**\n\n**Input:** s =  "a ", t =  "aa "\n**Output:**  " "\n**Explanation:** Both \'a\'s from t must be included in the window.\nSince the largest window of

In [None]:
# Wrap the prompt strings in a single-column ("text") Dataset for the trainer.
# NOTE(review): this rebinds `dataset`, which until now held the DatasetDict
# returned by load_dataset; a distinct name would keep the raw data reachable.
data_dict = {"text": output_texts}
dataset = Dataset.from_dict(data_dict)

# Fine-Tuning with QLoRA and Supervised Fine-Tuning

In [None]:
# Supervised fine-tuning trainer over the "text" column of the prompt dataset.
# max_seq_length=None leaves the sequence-length limit to the trainer's default.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
)



Map:   0%|          | 0/34903 [00:00<?, ? examples/s]

In [None]:
# Start fine-tuning (no checkpoint is passed, so this is a fresh run)
trainer.train()

# Resuming training from a checkpoint with Trainer

In [None]:
# Checkpoint directory to resume from.
# NOTE(review): this path is under .../CodeMind/result/, whereas the
# TrainingArguments output_dir is .../CodeMind/checkpoint. Confirm which run
# this checkpoint belongs to.
checkpoint_path = '/content/drive/MyDrive/Colab Notebooks/CodeMind/result/checkpoint-24000'

In [None]:
# Attach the checkpoint's LoRA adapter in trainable mode, but only when `model`
# is still the bare base model. Once the LoRA setup cell has run in this
# session, `model` is already a PeftModel and wrapping it a second time would
# nest one adapter wrapper inside another.
# NOTE(review): in that case the adapter weights are expected to be restored by
# trainer.train(resume_from_checkpoint=...) in the cell below; confirm.
if not isinstance(model, PeftModel):
    model = PeftModel.from_pretrained(model, checkpoint_path, is_trainable=True)

In [None]:
# Rebuild the trainer around the reloaded model, with the same settings as the
# initial run.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
)

Map:   0%|          | 0/34903 [00:00<?, ? examples/s]

In [None]:
# Continue training, passing the saved checkpoint directory so the trainer
# resumes from it instead of starting over
trainer.train(resume_from_checkpoint=checkpoint_path)

	save_steps: 500 (from args) != 1000 (from trainer_state.json)


Step,Training Loss
24020,0.4313
24040,0.2695
24060,0.3835
24080,0.3011
24100,0.2185
24120,0.4934
24140,0.275
24160,0.2922
24180,0.3859
24200,0.2414


TrainOutput(global_step=26175, training_loss=0.026595723198614696, metrics={'train_runtime': 26164.3296, 'train_samples_per_second': 4.002, 'train_steps_per_second': 1.0, 'total_flos': 4.139963542437157e+18, 'train_loss': 0.026595723198614696, 'epoch': 3.0})

In [None]:
# Close the W&B run opened with wandb.init
wandb.finish()

# Merge and Share the model

In [None]:
# Name under which the fine-tuned model is saved; here it is passed to
# save_pretrained as the output location.
new_model = "LimYeri/CodeMind-Gemma-7B-4bit" #Name of the model you will be pushing to huggingface model hub
# Save the fine-tuned model
# NOTE(review): trainer.model is a PEFT-wrapped model, so this presumably writes
# only the adapter weights rather than the full model; confirm.
trainer.model.save_pretrained(new_model)

In [None]:
# Reload the base model without a quantization config, in float16 on GPU 0,
# so the LoRA weights can be merged into it.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map={"": 0},
    return_dict=True,
    low_cpu_mem_usage=True,
)
# Attach the saved adapter, then fold it into the base weights and drop the
# PEFT wrapper.
merged_model = PeftModel.from_pretrained(base_model, new_model).merge_and_unload()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Save the merged model (safetensors format) and the tokenizer side by side in
# the local "merged_model" directory
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
# NOTE(review): these two settings are applied after save_pretrained, so they
# do not change the files just written; they only affect the in-memory
# tokenizer object that is pushed to the Hub later.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Hub repository id for the merged model. This rebinds `new_model`, which
# earlier named the local adapter save location.
new_model = "LimYeri/CodeMind-Gemma-7B-QLoRA-4bit"

In [None]:
# Push the merged model and the tokenizer to the Hugging Face Model Hub.
# `token=True` uses the credentials stored by notebook_login(); it replaces the
# deprecated `use_auth_token` keyword.
merged_model.push_to_hub(new_model, token=True)
tokenizer.push_to_hub(new_model, token=True)

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]



README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LimYeri/CodeMind-Gemma-7B-QLoRA-4bit/commit/59aa4fe8a219bc8725cd6adc99a641dc5307be7c', commit_message='Upload tokenizer', commit_description='', oid='59aa4fe8a219bc8725cd6adc99a641dc5307be7c', pr_url=None, pr_revision=None, pr_num=None)