# Install Dependencies & Mount Google Drive

In [21]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# The only Drive paths in this notebook are in the commented-out save cell near the end.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q -U bitsandbytes
!pip install -q -U git+https://github.com/huggingface/transformers.git
!pip install -q -U git+https://github.com/huggingface/peft.git
!pip install -q -U git+https://github.com/huggingface/accelerate.git
!pip install -q trl xformers wandb datasets einops gradio sentencepiece

In [2]:
from datasets import load_dataset, Dataset
from trl import SFTTrainer
from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import torch
import numpy as np
import pandas as pd
import wandb

# HF & Wandb login

In [None]:
# HuggingFace login -- training model/data & Pushing model
!pip install huggingface_hub
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Authenticate with Weights & Biases.
wandb.login()

# Open the run that will receive the training logs.
# NOTE(review): name / project / entity are placeholder values — replace them
# with real ones before running.
run = wandb.init(
    entity="your-entity",
    project="your-project name",
    name="name",
    job_type="training",
    anonymous="allow",
)

# Loading the Model & Dataset

In [6]:
# Base checkpoint to fine-tune: Mistral 7B
model_id = "mistralai/Mistral-7B-v0.1"

# Quantization settings: load the weights in 8-bit.
# NOTE(review): headings/comments elsewhere in this notebook say "QLoRA", but
# this configuration is 8-bit — confirm which one is intended.
bnb_config = BitsAndBytesConfig(
    llm_int8_threshold=200.0,
    load_in_8bit=True,
)

# Load the quantized causal-LM.
# device_map={"": 0} presumably places the whole model on device 0 — TODO confirm.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map={"": 0},
    torch_dtype=torch.bfloat16,
    quantization_config=bnb_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
# Load the dataset by its Hub name.
dataset_name = "kreimben/leetcode_with_youtube_captions"
dataset = load_dataset(dataset_name)
# Display the DatasetDict (splits, column names, row counts) as the cell output.
dataset

Downloading readme:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.1M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/75.2M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18136 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['cc_content', 'id', 'thumbnail', 'title_x', 'question_content', 'java', 'c++', 'python', 'javascript', 'title_y', 'tag', 'level', 'success_rate', 'total_submission', 'total_accepted', 'question_likes', 'question_dislikes', 'question_hints', 'similar_question_ids'],
        num_rows: 18136
    })
})

In [8]:
# Convert the train split to a pandas DataFrame using the dataset's own
# converter rather than passing the Dataset object to the pd.DataFrame constructor.
df = dataset['train'].to_pandas()

In [9]:
# Training-time model settings: disable `use_cache`, set `pretraining_tp` to 1,
# and turn on gradient checkpointing.
model.config.use_cache = False
model.config.pretraining_tp = 1
model.gradient_checkpointing_enable()

# Load the tokenizer that belongs to the same checkpoint as the model.
# NOTE(review): trust_remote_code=True permits running code shipped in the Hub
# repo; presumably not required for this tokenizer — confirm and drop if so.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
# Reuse the EOS token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding out of the loss
# may also mask the appended EOS — verify the model still learns to stop.
tokenizer.pad_token = tokenizer.eos_token
# Append EOS to every tokenized example.
tokenizer.add_eos_token = True
# Display both flags to confirm BOS and EOS will be added.
tokenizer.add_bos_token, tokenizer.add_eos_token

(True, True)

In [10]:
# Get the quantized model ready for k-bit training.
model = prepare_model_for_kbit_training(model)

# LoRA adapter settings for causal language modelling.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"],
)

# Wrap the model with the adapters described by `peft_config`.
# Note: this rebinds `model`, so re-running the cell wraps the already-wrapped model.
model = get_peft_model(model, peft_config)

In [11]:
# Hyperparameters
training_arguments = TrainingArguments(
    # --- output, checkpoints, logging ---
    output_dir="./results",
    report_to="wandb",
    logging_steps=1,
    # NOTE(review): a checkpoint every 50 steps over the recorded 6801-step run,
    # with no limit on how many are kept — check disk usage.
    save_steps=50,
    # --- run length and batching ---
    num_train_epochs=3,
    max_steps=-1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # --- optimizer and schedule ---
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="constant",
    # NOTE(review): the logged learning rate stayed flat at 2e-4 for the whole
    # run, so with the "constant" scheduler this warmup ratio appears to have no
    # effect — use "constant_with_warmup" if warmup was intended.
    warmup_ratio=0.03,
    # --- mixed precision (both disabled) ---
    fp16=False,
    bf16=False,
)

In [12]:
# Record the key hyperparameters on the active W&B run.
# `wandb.config.update(...)` is used instead of `wandb.config = {...}`: plain
# assignment rebinds the module attribute to an ordinary dict, so the values
# never reach the run. The values are read from `training_arguments` so they
# cannot drift from what the trainer actually uses.
# allow_val_change=True keeps the cell re-runnable after a hyperparameter edit.
wandb.config.update(
    {
        "learning_rate": training_arguments.learning_rate,
        "epochs": training_arguments.num_train_epochs,
        "batch_size": training_arguments.per_device_train_batch_size,
    },
    allow_val_change=True,
)
# Register the model with W&B; the return value is shown as the cell output.
wandb.watch(model)

[]

# Formatting the Dataset

In [13]:
# Build the training text for each example
def formatting_func(example):
    """Build one training prompt per row from the question and caption columns.

    Args:
        example: A column-oriented batch (dict of lists, pandas DataFrame, or
            any object whose 'question_content' and 'cc_content' entries are
            equally long iterables).

    Returns:
        A list with one string per row: the text "problem: " followed by the
        question, a newline, then "Explain: " followed by the caption.
    """
    # Pair the two columns by position instead of looping over
    # range(len(example)): len() of a dict batch is the number of columns, not
    # rows, and integer indexing into a DataFrame column is a label lookup that
    # only works with the default 0..n-1 index.
    return [
        f"problem: {question}\nExplain: {caption}"
        for question, caption in zip(example['question_content'], example['cc_content'])
    ]

In [14]:
# Build the prompt strings for every row of the DataFrame.
output_texts = formatting_func(df)

In [15]:
# Sanity check: expect one prompt per dataset row.
len(output_texts)

18136

In [16]:
# Wrap the prompts in a single-column ("text") Dataset for the trainer.
# Note: this rebinds `dataset`, which previously held the DatasetDict loaded
# from the Hub.
data_dict = dict(text=output_texts)
dataset = Dataset.from_dict(data_dict)

# Fine-Tuning with QLoRA and Supervised Fine-Tuning (SFT)

In [17]:
# Configure the supervised fine-tuning trainer.
trainer = SFTTrainer(
    # what to train and how
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    # training data: the "text" column of the prompt dataset
    train_dataset=dataset,
    dataset_text_field="text",
    # sequence handling: no explicit length cap given, no packing
    max_seq_length=None,
    packing=False,
)



Map:   0%|          | 0/18136 [00:00<?, ? examples/s]

In [18]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()



Step,Training Loss
1,0.9681
2,0.8261
3,0.9065
4,0.911
5,0.9619
6,0.8842
7,0.7195
8,0.6887
9,0.7395
10,0.8136




Step,Training Loss
1,0.9681
2,0.8261
3,0.9065
4,0.911
5,0.9619
6,0.8842
7,0.7195
8,0.6887
9,0.7395
10,0.8136




TrainOutput(global_step=6801, training_loss=0.49115940565187216, metrics={'train_runtime': 69076.1772, 'train_samples_per_second': 0.788, 'train_steps_per_second': 0.098, 'total_flos': 2.3989299387590246e+18, 'train_loss': 0.49115940565187216, 'epoch': 3.0})

# Save the model

In [31]:
# model.save_pretrained("/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA")
# tokenizer.save_pretrained("/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA")

('/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA/tokenizer_config.json',
 '/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA/special_tokens_map.json',
 '/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA/tokenizer.model',
 '/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA/added_tokens.json',
 '/content/drive/MyDrive/Colab Notebooks/CodeMind/Mistral-7b-8bit-QLoRA/tokenizer.json')

In [27]:
# Name passed to save_pretrained() for the model and tokenizer in the next cell
new_model = "LimYeri/Mistral-7B-v0.1-CodeMind"

In [None]:
# Save the model and tokenizer under `new_model`, passing push_to_hub=True to
# both calls.
# NOTE(review): the following cell pushes to "CodeMind-Mistral-7B-v0.1", which
# is a different repo name from `new_model` — confirm which one is intended.
model.save_pretrained(new_model, push_to_hub=True, use_auth_token=True)
tokenizer.save_pretrained(new_model, push_to_hub=True)

In [30]:
# Upload the model and the tokenizer to the same Hub repository.
hub_repo_name = "CodeMind-Mistral-7B-v0.1"
model.push_to_hub(hub_repo_name)
# Last expression: the tokenizer upload's CommitInfo is shown as the cell output.
tokenizer.push_to_hub(hub_repo_name)

adapter_model.safetensors:   0%|          | 0.00/369M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LimYeri/CodeMind-Mistral-7B-v0.1/commit/108ee542ec45e1f36741f7c6fdbc7af058616217', commit_message='Upload tokenizer', commit_description='', oid='108ee542ec45e1f36741f7c6fdbc7af058616217', pr_url=None, pr_revision=None, pr_num=None)

In [35]:
# Close the W&B run; the run history and summary are printed below.
wandb.finish()

VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/grad_norm,▂▂▁▂▁▃▂▁▄▅▃▃▅▂▄▃▃▃▂▃▂▄▂▃▂▅▂▃▅▃█▄▅▅▃▄▃▃▆▄
train/learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/loss,▆▇▆█▅▆▆▅▅▇▄█▃▆▅▄▅█▅▆▄▂▄▄▇▄▄▃▄▄▄▂▃▃▃▄▄▁▄▃

0,1
total_flos,2.3989299387590246e+18
train/epoch,3.0
train/global_step,6801.0
train/grad_norm,0.25344
train/learning_rate,0.0002
train/loss,0.3652
train_loss,0.49116
train_runtime,69076.1772
train_samples_per_second,0.788
train_steps_per_second,0.098
