In [1]:
!pip install -q pandas
!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install -q transformers
!pip install -q datasets
!pip install -q git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes
!pip install -q trl
!pip install -q tensorboardX
!pip install -q wandb -U

In [2]:
!transformers-cli env


Copy-and-paste the text below in your GitHub issue and FILL OUT the two last points.

- `transformers` version: 4.39.1
- Platform: Windows-11-10.0.22621-SP0
- Python version: 3.12.2
- Huggingface_hub version: 0.21.4
- Safetensors version: 0.4.2
- Accelerate version: 0.28.0
- Accelerate config: 	not found
- PyTorch version (GPU?): 2.2.1+cu121 (True)
- Tensorflow version (GPU?): not installed (NA)
- Flax version (CPU?/GPU?/TPU?): not installed (NA)
- Jax version: not installed
- JaxLib version: not installed
- Using GPU in script?: <fill in>
- Using distributed or parallel set-up in script?: <fill in>



In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:09:35_Pacific_Daylight_Time_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


## Prueba del modelo base

In [4]:
import json
import pandas as pd
import torch
from datasets import Dataset, load_dataset
from huggingface_hub import notebook_login # Usaremos las herramientas de HuggingFace para el entrenamiento
from peft import LoraConfig, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from trl import SFTTrainer
import accelerate
import tensorboardX

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Base Model
model_id = "mistralai/Mistral-7B-v0.1"

# Load MitsralAi tokenizer for dataset formatting
#tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
#tokenizer.pad_token = tokenizer.eos_token
#tokenizer.padding_side = "left"

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    trust_remote_code=True,
    padding_side="right",
    add_eos_token=True,
    add_bos_token=True,
)
tokenizer.pad_token = tokenizer.eos_token

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [27]:
# Parámetros del modelo

# Final model name
tuned_model = "mistral7b_code"

######### QLORA Params #############
# (Para reducir el uso de memoria)

# Estos valores dependen del dataset
# The rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. 
# A higher rank will allow for more expressivity, but there is a compute tradeoff. (2^x)
lora_r = 32

# Scaling factor for the learned weights. The weight matrix is scaled by alpha/r, and thus a higher value for alpha assigns more weight to 
# the LoRA activations.
lora_alpha = 64

# NOTA: En el paper de QLoRA utiliza los valoes de r = 64 y alpha = 16, argumentando que estos valores generalizan bastante bien. Si queremos darle mas
# importancia a la data fine-tuneada aumentamos los valores de alpha y si queremos mejor rendimiento disminuimos R.

# Dropout probability
# Durante el entranmiento, en cada epoch hay un {lora_dropout}% de que las neuronas se desactiven (para que trabaje mas)
lora_dropout = 0.1

####### BitsAndBytes param ###########

#Activamos la reducción de precisión a 4-bit
use_4bit = True

# Parámetro para los modelos 4-bit
bnb_4bit_compute_dtype = "bfloat16" #torch.float16 != torch.bfloat16

# Tipo de cuantización (fp4 o nf4)
# nf4 utiliza una distribución normal
bnb_4bit_quant_type = "nf4"

# Nested quantization for 4-bit base models (double quantization)
# Nos proporciona una mayor eficiencia de memoria sin sacrificiar rendimiento. Lo que hace es
# realizar una segunda cuantización de los pesos ya cuantizados para ahorrar 0.4 bits/parametro.
use_nested_quant = True

####### Training Arguments param #########

#Aqui se guardarán las predicciones y los checkpoints
output_dir = "./resultados"

# Número de epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = False
bf16 = False

# Batch size per GPU for train
per_device_train_batch_size = 1

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2.5e-5

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
# The total number of training steps to perform.
# Uso: Inicialmente a muchos steps y comprobamos a partir de que steps el modelo empieza a degradarse. Para evitar hacer muchos entrenamientos
# en la proxima iteración empezamos desde un checkpoint.

max_steps = 500

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X updates steps
save_steps = 0

# Log every X updates steps
logging_steps = 1

###### Parámetros para SFT ########

# Max sequence length
max_seq_length = 512

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on the GPU 0
# (En el código de Accelerate esta todo explicado)
# device_map = {"": 0}
device_map = "auto"

In [7]:
#Load base model

# Load with QLoRA config
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

# https://huggingface.co/docs/transformers/main_classes/quantization

# Con la librería Transformers podemos usar los algoritmos AWQ y GPTQ de cuantización y soporta
# cuantizaciones de 4 y 8 bits. (Se pueden añadir más técnicas con la clase HfQuantizer)
# En este caso cuantizaremos a 4-bit con el tipo NF4
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [8]:
# https://huggingface.co/docs/transformers/model_doc/auto
# https://huggingface.co/transformers/v2.9.1/main_classes/model.html

# Utilizamos la arquitectura que viene ya incluida en el modelo
# Instantiate a pretrained pytorch model from a pre-trained model configuration.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    low_cpu_mem_usage = True,
    device_map = device_map
)

# https://huggingface.co/transformers/v2.9.1/main_classes/model.html#transformers.PreTrainedModel.generate
#Use past key values?
base_model.config.use_cache = False  # Nos interesa usar los parametros actualizados, no los viejos (cached)

# Mimic the behaviour of the original model at inference?
base_model.config.pretraining_tp = 1 #1 = disable

print(base_model)

Downloading shards: 100%|████████████████████████████████████████████████████████████████| 2/2 [02:38<00:00, 79.16s/it]
Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.82s/it]


MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32000, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): MistralRotaryEmbedding()
        )
        (mlp): MistralMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm()
        (post_attention_layernorm): MistralRMSNorm()
      )
    )

In [9]:
# Prueba del modelo base

#eval_prompt = """Print hello world in python java and c"""

eval_prompt = """You're a time traveler from the year 4055. Write a letter to your past self describing the future."""

# CUDA: Para programar directamente la GPU
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")

base_model.eval()
with torch.no_grad():
    print(tokenizer.decode(base_model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True))

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


You're a time traveler from the year 4055. Write a letter to your past self describing the future.м

Dear Me,

I know you're probably wondering why I'm writing to you from the future. Well, I'm here to tell you that you're about to make a huge mistake. You're about to go on a date with that guy you met at the bar last night. He seems nice enough, but he's actually a serial killer. I know you don't believe me, but trust me, he's going to kill you.

I know you're probably thinking, "But he seemed so nice! He bought me a drink and we had a great conversation." But trust me, he's not who he seems. He's going to take you back to his place and kill you. I know you're probably thinking, "But he said he was a doctor and he seemed so smart." But trust me, he's not who he seems. He's going to take you back to his place and kill you.

I know you're probably thinking, "But he said he was a doctor and he seemed so smart." But trust me, he's not who he seems. He's going to take you back to his place

## Fine-tunning

In [53]:
!pip install -q ipywidgets

Collecting ipywidgets
  Downloading ipywidgets-8.1.2-py3-none-any.whl.metadata (2.4 kB)
Collecting widgetsnbextension~=4.0.10 (from ipywidgets)
  Downloading widgetsnbextension-4.0.10-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab-widgets~=3.0.10 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl.metadata (4.1 kB)
Downloading ipywidgets-8.1.2-py3-none-any.whl (139 kB)
   ---------------------------------------- 0.0/139.4 kB ? eta -:--:--
   -------- ------------------------------- 30.7/139.4 kB 1.3 MB/s eta 0:00:01
   ---------------------------------------- 139.4/139.4 kB 1.7 MB/s eta 0:00:00
Downloading jupyterlab_widgets-3.0.10-py3-none-any.whl (215 kB)
   ---------------------------------------- 0.0/215.0 kB ? eta -:--:--
   ---------------------------------------- 215.0/215.0 kB ? eta 0:00:00
Downloading widgetsnbextension-4.0.10-py3-none-any.whl (2.3 MB)
   ---------------------------------------- 0.0/2.3 MB ? eta -:--:--
   ---------------------

In [47]:
#Log in to the HugginFace Model Hub
notebook_login()

ImportError: The `notebook_login` function can only be used in a notebook (Jupyter or Colab) and you need the `ipywidgets` module: `pip install ipywidgets`.

In [24]:
# Log in to WandDB
import wandb

!wandb login b0ee138ef7cb51349541df5f648e2172d699101c

run = wandb.init(
    project='mistral7b-instruct-code',
    job_type="training",
    anonymous="allow"
)

ModuleNotFoundError: No module named 'distutils'

#### Dataset

In [10]:
#Dataset Load
#https://huggingface.co/datasets/TokenBender/code_instructions_122k_alpaca_style

#Usaremos la librería datsets de HuggingFace
dataset = load_dataset("TokenBender/code_instructions_122k_alpaca_style", split="train")
dataset

Downloading readme: 100%|███████████████████████████████████████████████████████████████████| 28.0/28.0 [00:00<?, ?B/s]
Downloading data: 100%|█████████████████████████████████████████████████████████████| 169M/169M [00:05<00:00, 28.4MB/s]
Generating train split: 121959 examples [00:00, 127729.84 examples/s]


Dataset({
    features: ['text', 'output', 'instruction', 'input'],
    num_rows: 121959
})

In [11]:
df = dataset.to_pandas()
df.head(10)

Unnamed: 0,text,output,instruction,input
0,Below is an instruction that describes a task....,# Python code\ndef sum_sequence(sequence):\n ...,Create a function to calculate the sum of a se...,"[1, 2, 3, 4, 5]"
1,Below is an instruction that describes a task....,"def add_strings(str1, str2):\n """"""This func...",Develop a function that will add two strings,"str1 = ""Hello ""\nstr2 = ""world"""
2,Below is an instruction that describes a task....,#include <map>\n#include <string>\n\nclass Gro...,Design a data structure in C++ to store inform...,
3,Below is an instruction that describes a task....,def bubble_sort(arr):\n n = len(arr)\n \n ...,Implement a sorting algorithm to sort a given ...,"[3, 1, 4, 5, 9, 0]"
4,Below is an instruction that describes a task....,import UIKit\n\nclass ExpenseViewController: U...,Design a Swift application for tracking expens...,Not applicable
5,Below is an instruction that describes a task....,<?php\n$timestamp = $_GET['timestamp'];\n\nif(...,Create a REST API to convert a UNIX timestamp ...,Not Applicable
6,Below is an instruction that describes a task....,import requests\nimport re\n\ndef crawl_websit...,Generate a Python code for crawling a website ...,website: www.example.com \ndata to crawl: phon...
7,Below is an instruction that describes a task....,"[x*x for x in [1, 2, 3, 5, 8, 13]]",Create a Python list comprehension to get the ...,
8,Below is an instruction that describes a task....,SELECT * FROM products ORDER BY price DESC LIM...,Create a MySQL query to find the most expensiv...,
9,Below is an instruction that describes a task....,public class Library {\n \n // map of books in...,Create a data structure in Java for storing an...,Not applicable


In [12]:
#Mistral - Instruct requiere un formato específico para los prompts para que el modelo entienda mejor, en este caso [INST] [/INST]

# Mas info del formato aca: https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1

# Para entrenar mediante aprendizaje supervisado tenemos que meterle la instrucción y el output para que
# pueda realizar la predicción.
# Instruction: El texto introducido por el usuario
# Input: Por si el usuario introduce algo de código o data que el modelo deba tener en cuenta
# Output: Respuesta que debería dar

def generate_prompt(data_point):
    """Gen. input text based on a prompt, task instruction, (context info.), and answer

    :param data_point: dict: Data point
    :return: dict: tokenzed prompt
    """
    prefix_text = 'Below is an instruction that describes a task. Write a response that ' \
               'appropriately completes the request.\n\n'
    # Samples with additional context into.
    if data_point['input']:
        text = f"""<s>[INST]{prefix_text} {data_point["instruction"]} here are the inputs {data_point["input"]} [/INST]{data_point["output"]}</s>"""
    # Without
    else:
        text = f"""<s>[INST]{prefix_text} {data_point["instruction"]} [/INST]{data_point["output"]} </s>"""
    return text

# add the "prompt" column in the dataset
text_column = [generate_prompt(data_point) for data_point in dataset]
dataset = dataset.add_column("prompt", text_column)

In [13]:
dataset = dataset.shuffle(seed=1234)  # Shuffle dataset here
dataset = dataset.map(lambda samples: tokenizer(samples["prompt"]), batched=True)

Map: 100%|████████████████████████████████████████████████████████████| 121959/121959 [00:16<00:00, 7181.41 examples/s]


In [14]:
print(dataset)

Dataset({
    features: ['text', 'output', 'instruction', 'input', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 121959
})


In [15]:
#Split dataset in 80% train, 20% test

dataset = dataset.train_test_split(test_size=0.2)
train_data = dataset["train"]
test_data = dataset["test"]

In [16]:
print(train_data)
print(test_data)

Dataset({
    features: ['text', 'output', 'instruction', 'input', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 97567
})
Dataset({
    features: ['text', 'output', 'instruction', 'input', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 24392
})


In [17]:
# Fine-tunning with QLoRA y Supervised Fine Tunning (SFT)
from peft import get_peft_model

# Set LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

# Model with LoRA adapters added
print(get_peft_model(base_model, peft_config))

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32000, 4096)
        (layers): ModuleList(
          (0-31): 32 x MistralDecoderLayer(
            (self_attn): MistralSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
                (base_layer): 

In [28]:
# Set training parameters (Loading the trainer)
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard" #Default logging_dir = *output_dir/runs/CURRENT_DATETIME_HOSTNAME*
)

# Initialize the SFTTrainer for fine-tuning
trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_data,
    eval_dataset=test_data,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,  # Specify the maximum sequence length here
    tokenizer=tokenizer,
    args=training_arguments,
    packing=packing,
)

Map: 100%|██████████████████████████████████████████████████████████████| 97567/97567 [00:11<00:00, 8577.05 examples/s]


In [29]:
###### START TRAIN ########

# Initialize SFTTrainer (Wandb starts automatically when this is run)
trainer.train()

# Save the fine-tuned model (tuned_model = "mistral-code-test1" )
trainer.model.save_pretrained(tuned_model)

wandb.finish()

# www.wandb.ai/<your-profile-name>/projects

print(base_model.get_memory_footprint())

###########################

Step,Training Loss
1,0.6197
2,0.5429
3,0.9672
4,0.472
5,0.4994
6,0.3548
7,0.6257
8,0.6481
9,0.7777
10,0.4274


5416345600


In [None]:
trainer.model.push_to_hub(new_model, use_temp_dir=False)

In [20]:
# python -m tensorboard.main --logdir=resultados/

5416345600


In [22]:
################# A PARTIR DE AQUI, EL MODELO YA HA SIDO FINE-TUNEADO ########################

# Inference test

eval_prompt = """Do a java function for getting the even leters in an array"""

model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
base_model.eval()
with torch.no_grad():
    generated_code = tokenizer.decode(base_model.generate(**model_input, max_new_tokens=256, pad_token_id=2)[0], skip_special_tokens=True)
print(generated_code)

#outputs = model.generate(**model_input, max_new_tokens=100, return_dict_in_generate=True, output_scores=True)
#generated_token_ids = outputs.sequences
#generated_text = tokenizer.decode(generated_token_ids[0], skip_spectial_tokens=True)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Do a java function for getting the even leters in an array

import java.util.Scanner;

public class EvenLetters {
    public static void main(String[] args) {
        Scanner sc = new Scanner(System.in);
        System.out.println("Enter the size of the array: ");
        int size = sc.nextInt();
        String[] arr = new String[size];
        System.out.println("Enter the elements of the array: ");
        for (int i = 0; i < size; i++) {
            arr[i] = sc.next();
        }
        System.out.println("The even letters in the array are: ");
        for (int i = 0; i < size; i++) {
            if (arr[i].length() % 2 == 0) {
                System.out.println(arr[i]);
            }
        }
    }
}


In [None]:
# Empty VRAM
del model
del pipe
del trainer
import gc
gc.collect()
gc.collect()

In [None]:
# Pipeline function from Transformers library to generate response based on the prompt

pipe = pipeline(
    "text-generation",
    model = base_model,
    tokenizer = tokenizer,
    torch_dtype = torch.bfloat16,
    device_map = "auto"
)

prompt = "Programa en python una funcion para contar sumar los 5 primeros números"

sequences = pipe(
    prompt,
    do_sample = True,
    max_new_tokens = 100,
    temperature = 0.7,
    top_k = 50,
    top_p = 0.95,
    num_return_sequences = 1,
)
print(sequences[0]['generated_text'])

In [None]:
prompt = "What is Datacamp Career track?"
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

In [None]:
!huggingface-cli login

model.push_to_hub(new_model, use_temp_dir=False)
tokenizer.push_to_hub(new_model, use_temp_dir=False)

In [None]:
new_model = "mistralaiCode" #Name of the model you will be pushing to huggingface model hub

merged_model= PeftModel.from_pretrained(base_model, new_model)
merged_model= merged_model.merge_and_unload()

# Save the merged model
merged_model.save_pretrained("merged_model",safe_serialization=True)
tokenizer.save_pretrained("merged_model")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
# Reload model in FP16 (Para cargarlo luego mas tarde)
base_model = AutoModelForCausalLM.from_pretrained(
    model_name, #HuggingFace upload
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)