In [None]:
# Mount Google Drive at /content/gdrive.
# Later cells address it through relative 'gdrive/MyDrive/...' paths, which only
# resolve when the notebook's working directory is /content.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Install pinned versions of the fine-tuning stack.
# NOTE(review): `datasets` and `pandas` are imported below but are not pinned here.
%pip install accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7

Collecting accelerate==0.21.0
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/244.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━[0m [32m163.8/244.2 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.4.0
  Downloading peft-0.4.0-py3-none-any.whl (72 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/72.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes==0.40.2
  Downloading bitsandbytes-0.40.2-py3-none-any.whl (92.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.5/92.5 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m


In [None]:
# Import necessary libraries
import os
import torch
from datasets import load_dataset, Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer, pipeline, logging
import torch
import gc
import pandas as pd

In [None]:
# Load the custom training and validation splits from the mounted Drive folder.
# NOTE(review): despite its name, `testdataframe` holds training.parquet; it is the
# frame that is later converted into the trainer's dataset.
testdataframe = pd.read_parquet('gdrive/MyDrive/Colab Notebooks/lora/training.parquet')
df_val = pd.read_parquet('gdrive/MyDrive/Colab Notebooks/lora/validation.parquet')

In [None]:
# format dataset to validation dataset
def parse_row(row: str) -> tuple:
    """Split one chat-formatted sample into its prompt and its response.

    The expected layout is ``<s>[INST] prompt [/INST] response </s>``.

    Args:
        row: Raw text of a single sample.

    Returns:
        A ``(prompt, response)`` tuple of whitespace-stripped strings. When the
        sample has no ``[/INST]`` tag, the response is an empty string instead
        of the lookup failing with an IndexError.
    """
    # Only the closing '[/INST]' tag is used as the separator; the opening
    # '<s>[INST]' marker is removed from the prompt half afterwards.
    parts = row.split('[/INST]')
    prompt = parts[0].replace('<s>[INST]', '').strip()
    # Guard against samples without a closing tag (split() then yields one part).
    response = parts[1].replace('</s>', '').strip() if len(parts) > 1 else ''
    return prompt, response

# Run the parser once over every raw validation sample; each element is a
# (prompt, response) tuple.
parsed_pairs = df_val['text'].apply(parse_row)

# Fan the tuples out into two dedicated columns on df_val
df_val['prompt'] = parsed_pairs.apply(lambda pair: pair[0])
df_val['response'] = parsed_pairs.apply(lambda pair: pair[1])

# Validation frame restricted to the two parsed columns
valdataframe = df_val[['prompt', 'response']]

print(valdataframe.head())


                                                  prompt  \
5315                                    Как помыть кота?   
10265  Сколько времени нужно чтобы обучить нейронную ...   
9527   I want to create a web application to organize...   
4155                            Soletre a palavra "Amor"   
6450   ¿Cómo puedo crear un videojuego desde cero sin...   

                                                response  
5315   Личного опыта у меня нет, но вот как мыть кошк...  
10265  Время, необходимое для обучения нейронной сети...  
9527   Great idea! Here are some steps you can follow...  
4155   A palavra "amor" é soletrada "a-m-o-r". A pala...  
6450   Crear un videojuego desde cero sin conocimient...  


In [None]:
# Wrap the training frame in a Hugging Face `Dataset` for the trainer.
# preserve_index=False keeps the pandas index from being carried along as an
# extra '__index_level_0__' feature next to 'text'.
dataset = Dataset.from_pandas(testdataframe, preserve_index=False)

# Validation prompts and their reference responses, as parallel lists
val_questions = valdataframe['prompt'].tolist()
val_answers = valdataframe['response'].tolist()

# Force garbage collection
gc.collect()

print(dataset, "\n", len(val_questions), "\n", len(val_answers))

Dataset({
    features: ['text', '__index_level_0__'],
    num_rows: 9314
}) 
 1050 
 1050



## Model Setup

In [None]:
def display_cuda_memory():
    """Print the allocated, reserved and peak reserved CUDA memory of device 0 in GB."""
    rule = "\n--------------------------------------------------\n"
    bytes_per_gb = 1024 ** 3
    readings = (
        ("memory_allocated", torch.cuda.memory_allocated),
        ("memory_reserved", torch.cuda.memory_reserved),
        ("max_memory_reserved", torch.cuda.max_memory_reserved),
    )

    print(rule)
    for label, query in readings:
        # Each query is made against device index 0, then scaled from bytes.
        print(f"torch.cuda.{label}: {query(0) / bytes_per_gb:f}GB")
    print(rule)

# PyTorch CUDA memory-management setting, passed through the environment.
# NOTE(review): presumably this has to be in place before the CUDA allocator is first
# used; no CUDA call precedes it in the visible cells — confirm on a fresh kernel.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:1024"


In [None]:
# 4-bit quantization settings handed to from_pretrained when the trainable model is loaded
compute_dtype = torch.float16  # half precision for compute, chosen for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # quantize the weights to 4 bits
    bnb_4bit_quant_type="nf4",             # normalized float 4-bit quantization type
    bnb_4bit_compute_dtype=compute_dtype,  # float16 for computing
    bnb_4bit_use_double_quant=False,       # double quantization disabled
)



### Load Base Model

In [None]:
# Hugging Face Hub id of the base checkpoint
base_model = "NousResearch/Llama-2-7b-chat-hf" # Llama 2 7B chat

# Model that gets fine-tuned: loaded with the 4-bit bnb_config
model = AutoModelForCausalLM.from_pretrained(base_model,                        # llama 2 base model
                                             quantization_config=bnb_config,    # use the bnb quantization config
                                             device_map={"": 0})                # load the entire model on GPU 0
# Second, separate copy of the same checkpoint, used later by the baseline pipeline.
# NOTE(review): unlike `model`, this load passes no quantization_config and no
# device_map — presumably full precision on the default device. Confirm that the
# extra memory and the precision mismatch with `model` are intended for the comparison.
bmodel = AutoModelForCausalLM.from_pretrained(base_model)
# NOTE(review): use_cache is switched off on `model` (presumably for training) and is
# never switched back on before `model` is used for generation — confirm.
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load the tokenizer that ships with the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(base_model,
                                          trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/583 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/179 [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/746 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/435 [00:00<?, ?B/s]

### Fine-Tune & Train Model

In [None]:
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Drive folder that receives the LoRA fine-tuned model and tokenizer
custom_model = "gdrive/MyDrive/llama-2-7b-chat-custom"

# LoRA (PEFT) hyper-parameters
peft_params = LoraConfig(
    r=64,               # rank of the low-rank matrices
    lora_alpha=16,      # scaling factor; the higher, the more significant the update
    lora_dropout=0.1,   # dropout probability used against overfitting
    bias="none",
    task_type="CAUSAL_LM",
)

# Training hyper-parameters
training_params = TrainingArguments(
    # where and how often to write checkpoints / logs
    output_dir="./results",
    save_steps=25,
    logging_steps=25,
    report_to="tensorboard",
    # schedule length and batching
    num_train_epochs=1,
    max_steps=-1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # optimizer and learning-rate schedule
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",  # constant or cosine
    # mixed precision switched off
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning trainer wired to the model, data and the settings above
trainer = SFTTrainer(
    model=model,
    args=training_params,
    train_dataset=dataset,
    dataset_text_field="text",  # field in the dataset
    tokenizer=tokenizer,
    peft_config=peft_params,
    max_seq_length=None,
    packing=False,
)




Map:   0%|          | 0/9314 [00:00<?, ? examples/s]

In [None]:
# Run a garbage-collection pass, then ask PyTorch to empty its CUDA cache before training
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Run the fine-tuning loop configured on `trainer`
trainer.train()

# Save the trained model and the tokenizer into the folder named by `custom_model`
trainer.model.save_pretrained(custom_model)
trainer.tokenizer.save_pretrained(custom_model)

Step,Training Loss
25,1.6608
50,2.1199
75,1.1956
100,1.6429
125,1.1999
150,1.6166
175,1.1616
200,1.5461
225,1.1164
250,1.4993




('gdrive/MyDrive/llama-2-7b-chat-custom/tokenizer_config.json',
 'gdrive/MyDrive/llama-2-7b-chat-custom/special_tokens_map.json',
 'gdrive/MyDrive/llama-2-7b-chat-custom/tokenizer.model',
 'gdrive/MyDrive/llama-2-7b-chat-custom/added_tokens.json',
 'gdrive/MyDrive/llama-2-7b-chat-custom/tokenizer.json')

## Model Evaluation


In [None]:
# Two text-generation pipelines sharing one tokenizer and the same length cap:
# `pipe` wraps the separately loaded baseline copy, `custom_pipe` wraps `model`.
pipe = pipeline(
    task="text-generation",
    model=bmodel,
    tokenizer=tokenizer,
    max_length=500,
)
custom_pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=500,
)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


### Sample Run

In [None]:
# Hand-picked position in the validation lists, used for the side-by-side sample run
idx = 48
question = val_questions[idx]
print(question)

Que función cumple un perceptrón multicapa?


In [None]:
# Reference answer stored for the same validation example
answer = val_answers[idx]
print(answer)

Un perceptrón multicapa (MLP), es un algoritmo de aprendizaje supervisado que aprende una función entrenándose en un conjunto de datos.  Se utiliza para resolver problemas de asociación de patrones, segmentación de imágenes, compresión de datos, etc. y permite resolver problemas que no son linealmente separables, lo cual es la principal limitación del perceptrón.


In [None]:

# Send the sample question, wrapped in the [INST] ... [/INST] template, through
# `custom_pipe` (the pipeline built on `model`) and show the full generated text.
result = custom_pipe(f"<s>[INST] {question} [/INST]")
print(result[0]['generated_text'])

<s>[INST] Que función cumple un perceptrón multicapa? [/INST] Un perceptrón multicapa es una red neuronal artificial que se utiliza para clasificar objetos o categorías en función de sus características. Un perceptrón multicapa es una red neuronal que se compone de varias capas ocultas y una capa de salida.

La función principal del perceptrón multicapa es aprender a extraer características relevantes de los datos de entrada y utilizarlas para clasificar los objetos o categorías. La red neuronal se entrena con un conjunto de datos etiquetados, y a medida que se entrena, la red neuronal aprende a identificar patrones en los datos de entrada y a clasificar los objetos o categorías en función de esos patrones.

El perceptrón multicapa es una herramienta útil en el campo de la inteligencia artificial y la machine learning, ya que puede ser utilizado para clasificar objetos o categorías complejos y para resolver problemas de aprendizaje automático.

Es importante destacar que el perceptrón 

In [None]:
# Same question through `pipe` (the pipeline built on `bmodel`) for comparison
result = pipe(f"<s>[INST] {question} [/INST]")
print(result[0]['generated_text'])


<s>[INST] Que función cumple un perceptrón multicapa? [/INST]  Un perceptrón multicapa es una red neuronal compuesta por varias capas ocultas, cada una de las cuales está formada por un conjunto de neuronas.

La función principal de un perceptrón multicapa es la siguiente:

1. Procesar información de entrada: Las capas ocultas toman la información de entrada y la procesan de manera más profunda, permitiendo a la red neuronal aprender patrones más complejos en la información de entrada.
2. Aumentar la capacidad de aprendizaje: Al agregar más capas ocultas, la red neuronal puede aprender patrones más complejos y abstractos en la información de entrada, lo que la hace más útil en tareas de clasificación y regresión.
3. Reducir la dimensionalidad de la información: Las capas ocultas pueden reducir la dimensionalidad de la información de entrada, lo que facilita el aprendizaje y la generalización de la red neuronal.
4. Implementar funciones de aprendizaje no lineales: Las capas ocultas pued

In [None]:
# Keep only the first 30 validation question/answer pairs for the evaluation.
# NOTE(review): this overwrites the full lists in place; re-run the cell that builds
# them from `valdataframe` to get every pair back.
val_questions = val_questions[:30]
val_answers = val_answers[:30]

In [None]:
# Force a garbage-collection pass (the cell output is the value gc.collect() returns)
gc.collect()

37

In [None]:
# Placeholders only: both names are reassigned by the generate_responses calls
# at the end of this cell.
base_responses = []
custom_responses = []
def generate_responses(pipe, prompts):
    """Generate one completion per prompt and return the completions only.

    Each prompt is wrapped in the ``<s>[INST] ... [/INST]`` template before it
    is sent to ``pipe``.

    Args:
        pipe: Callable that takes the formatted prompt and returns a list whose
            first item is a dict with a ``'generated_text'`` entry.
        prompts: Iterable of raw question strings.

    Returns:
        List of whitespace-stripped completion strings, in prompt order.
    """
    responses = []
    for prompt in prompts:
        formatted_prompt = f"<s>[INST] {prompt} [/INST]"
        generated_text = pipe(formatted_prompt)[0]['generated_text']
        # The generated text starts with an echo of the formatted prompt. Remove it,
        # so the stored response is comparable with the reference answers, which
        # hold the answer text only.
        if generated_text.startswith(formatted_prompt):
            generated_text = generated_text[len(formatted_prompt):]
        response = generated_text.strip()
        print(response)
        responses.append(response)
    return responses

# Generate one response per validation question, first with the baseline
# pipeline and then with the custom pipeline
base_responses = generate_responses(pipe, val_questions)
custom_responses = generate_responses(custom_pipe, val_questions)

### ROUGE Score

In [None]:
# Packages needed for the ROUGE evaluation.
# NOTE(review): these installs are unpinned and sit mid-notebook, unlike the pinned
# install cell at the top — consider pinning them and moving them there.
%pip install evaluate
%pip install rouge_score

In [None]:
from datasets import load_metric
import evaluate

def _completion_only(generated_text):
    """Return the text after the first '[/INST]' tag, or the whole text if the tag is absent."""
    return generated_text.split('[/INST]', 1)[-1].strip()


rouge = evaluate.load('rouge')
rouge_types = ['rouge1', 'rouge2', 'rougeL']

# The generated texts can start with an echo of "<s>[INST] question [/INST]", whereas
# val_answers hold the answer text only. Score the completions alone so that
# predictions and references cover the same content.
base_predictions = [_completion_only(text) for text in base_responses]
custom_predictions = [_completion_only(text) for text in custom_responses]

# NOTE(review): the validation prompts are multilingual (the frame preview shows Russian,
# Spanish and Portuguese rows) — confirm the metric's default tokenizer handles
# non-Latin scripts before trusting absolute scores.
base_rouge = rouge.compute(predictions=base_predictions, references=val_answers, rouge_types=rouge_types, use_aggregator=True)
custom_rouge = rouge.compute(predictions=custom_predictions, references=val_answers, rouge_types=rouge_types, use_aggregator=True)

print(base_rouge.keys())
print(f"Base Model{[base_rouge[i] for i in base_rouge.keys()]}")
print(f"Custom Model{[custom_rouge[i] for i in custom_rouge.keys()]}")
