In [None]:
# import pandas as pd
# import json

# # Define the CSV file path
# csv_file_path = 'BOQ_Codification_Dataset.csv'  # Replace 'your_file.csv' with the actual file path

# # Read the specified sheet from the CSV file into a DataFrame with ";" delimiter
# df = pd.read_csv(csv_file_path, delimiter=';')  # Use delimiter parameter

# # Convert the DataFrame to a list of lists
# csv_data = df.values.tolist()

# # Transpose the list of lists to get each column separately
# columns = list(map(list, zip(*csv_data)))

# # Extract the data ID and dialogue data
# data_id_column = columns[0]  # Assuming the first column contains data IDs
# dialogue_data_columns = columns[1:]  # Assuming the rest of the columns contain dialogue data

# # Create a list of JSON dictionaries
# json_data = []

# # Iterate through the rows and create JSON dictionaries
# for i in range(len(data_id_column)):
#     data_id = data_id_column[i]
#     dialogue = [column[i] for column in dialogue_data_columns]
    
#     # Create a JSON dictionary for each row
#     json_entry = {
#         "id": data_id,
#         "data": dialogue
#     }
    
#     json_data.append(json_entry)

# # Now, 'json_data' contains a list of JSON dictionaries with data IDs and dialogue data
# # You can convert this list to a JSON string if needed:
# json_string = json.dumps(json_data)

# # # Print the JSON data
# # print(json_string)
# json_data['data']

Development Environment
--------------

The Python version used is 3.11.5
****
IDE of choice is VSCode
****
The required python packages to be installed using the pip command are:

1- trl

2- peft

3- pandas

4- datasets

5- torch with cuda enabled

6- tensorflow

7- transformers

8- bitsandbytes

****
PC specs:

1- CPU --> Intel core i9 -12900KF , 24 core, 32 threads

2- GPU --> Nvidia RTX 3090 - 24GB GDDR6 VRAM

3- RAM --> 32GB 4800MHZ DDR5

Importing required libraries and the training dataset
------------------

In [73]:
# The dataset imported in this case is in the form of a csv file which is the reommended version.
#The csv file type has to be converted into datasets.arrow_dataset.Dataset format to be able to use the dataset with NLP and LLMs.
#To undertake that, you have to convert your CSV file into a pandas dataframe and then convert the dataframe into a dataset object using the code below.

#Importing the required libararies
import pandas as pd
import datasets

# Load the CSV file into a Pandas DataFrame
csv_file_path = 'BOQ_Codification_Dataset.csv'
df = pd.read_csv(csv_file_path, delimiter=';')

# Create a new 'data' column by combining all columns (except 'id') into a list
df['data'] = df.iloc[:, 1:].apply(lambda row: row.tolist(), axis=1)

# Drop the original columns (except 'id') to keep only the 'id' and 'data' columns
df = df[['id', 'data']]

# Convert the Pandas DataFrame into a Dataset
train_dataset = datasets.Dataset.from_pandas(df)
train_dataset['data']

[['Do you have NRM1 coding information for the following activity: 40 SMA surface 60 SMA binder 170 EME2 Base?',
  'In order to understand the NRM1 coding for the activity: 40 SMA surface 60 SMA binder 170 EME2 Base , we have to understand first the hierarchy of codification of this activity with the NRM1 method of measurement.\\n\\n The first level of hierarchy of this activity is lying within section: 08 - External Works.As for level 2, the activity: 40 SMA surface 60 SMA binder 170 EME2 Baselie under sub-section: 08.2 - Roads_Paths_Pavings and Surfacing within NRM1.I also have access to level 3 data. The level 3 coding for the activity: 40 SMA surface 60 SMA binder 170 EME2 Base is: 08.2.1 - Roads_paths and pavings.I am afraid that in this case I do not have any access to any further level of details.  From my knowledge, NRM1 provides level 4 coding information only in section 09 Preliminaries.'],
 ['Do you have NRM1 coding information for the following activity: 40 SMA surface 60 S

Loading the required LLM
------------------

In [74]:
# to sort out the issue of bitsandbytes, uninstall the current bitsandbytes version using pip uninstall bitsandbytes
# Install it from source using the following command : pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.1-py3-none-win_amd64.whl

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig

# Load the stock 7b mistral model
model_id = "mistralai/Mistral-7B-Instruct-v0.2"

#Configure quantization to optimize and compress models for deployment on resource-constrained devices
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16, # Define the data type used during the quantization computations. FP16 is used to maintain a balance between precision and memory efficiency 
    bnb_4bit_quant_type="nf4" #non-linear 4-bit quantization method to optimize the representation of numerical values in the model during deployment.
)

# Load model
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Set it to a new token to correctly attend to EOS tokens.
tokenizer.add_special_tokens({'pad_token': '<PAD>'})

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

1

Applying the LoRA configuration
--------------

In [75]:
# check this link for further explanation: https://huggingface.co/docs/peft/conceptual_guides/lora

lora_config = LoraConfig(
    r=8, # Rank of update matrices, lower rank leads to smaller update matrices with fewer trainable parameters
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"], #Attention blocks to apply to the loRA update matrices
    bias="none",#determine if bias paramters should be trained. Could take the value of "none","all","lora_only"
    task_type="CAUSAL_LM", 
)

model.add_adapter(lora_config)

In [76]:
#The purpose of this step is to initiate the training arguments used for the imported LLM and the use case

from transformers import TrainingArguments

YOUR_HF_USERNAME = "MohamedAshour1993" # Hugingface username. Differs from oneuser to another. Not necessary here in this example

output_dir = f"{YOUR_HF_USERNAME}/mistral-7b-qlora-boqv1" # Location of saving the trained checkpoints of the model
per_device_train_batch_size = 4 # Each GPU (or device) will process a batch of X training examples at a time
gradient_accumulation_steps = 4 # Gradients are accumulated over X mini-batches before performing a weight update. 
optim = "paged_adamw_32bit"# A custom optimizer or a variant of the AdamW optimizer with 32-bit precision.
save_steps = 10 #How often model checkpoints are saved during training
logging_steps = 10 #How often training logs are printed or recorded
learning_rate = 2e-4
max_grad_norm = 0.3 # Gradients that exceed this norm are scaled down to prevent exploding gradients
max_steps = 50 # Maximum number of training steps.
warmup_ratio = 0.03 #The learning rate will have a warm-up phase that covers X% of the total training steps
lr_scheduler_type = "constant" #Means that the learning rate remains constant throughout training

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    gradient_checkpointing=True, #can help reduce memory usage during training at the cost of slightly longer training times
    push_to_hub=False, #Choose whether you would like to save the trained model on the Huggingface website
)

Model Training
---------

In [77]:
from trl import SFTTrainer

#A function to prepare the model to be fine-tuned using the prepared dataset.

def formatting_func(example):
    text = f"### USER: {example['data'][0]}\n### ASSISTANT: {example['data'][1]}" 
    return text

In [78]:
#Setting out the training arguments for the fine-tuning of the imported LLM

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    packing=True,  #Packing typically refers to data preprocessing or batching strategies that optimize training by efficiently packing data into batches
    dataset_text_field="data",
    tokenizer=tokenizer,
    max_seq_length=1024, #sets the maximum sequence length for input data. Sequences longer than this value may be truncated or split to fit within the specified length.
    formatting_func=formatting_func,
)

Generating train split: 0 examples [00:00, ? examples/s]



In [79]:
trainer.train()

  0%|          | 0/50 [00:00<?, ?it/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.6328, 'learning_rate': 0.0002, 'epoch': 0.07}




{'loss': 0.3048, 'learning_rate': 0.0002, 'epoch': 0.14}




{'loss': 0.2683, 'learning_rate': 0.0002, 'epoch': 0.22}




{'loss': 0.2195, 'learning_rate': 0.0002, 'epoch': 0.29}




{'loss': 0.2068, 'learning_rate': 0.0002, 'epoch': 0.36}




{'train_runtime': 900.3316, 'train_samples_per_second': 0.889, 'train_steps_per_second': 0.056, 'train_loss': 0.32644065856933596, 'epoch': 0.36}


TrainOutput(global_step=50, training_loss=0.32644065856933596, metrics={'train_runtime': 900.3316, 'train_samples_per_second': 0.889, 'train_steps_per_second': 0.056, 'train_loss': 0.32644065856933596, 'epoch': 0.36})

Saving the trained model
-------------

In [80]:
save_directory="./Trained_BoQ_Coding_Mistal/"
trainer.save_model(save_directory)



Testing the trained model
---------

In [81]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig
)

#Import the saved fine-tuned LLM
save_directory="./Trained_BoQ_Coding_Mistal/"
tokenizer = AutoTokenizer.from_pretrained(save_directory)

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True, 
    bnb_4bit_compute_dtype=torch.float16)

model = AutoModelForCausalLM.from_pretrained(
    save_directory,
    quantization_config=quantization_config,
    adapter_kwargs={"revision": "09487e6ffdcc75838b10b6138b6149c36183164e"} # Adapters are modules that can be added to pre-trained language models to perform task-specific fine-tuning without modifying the base model architecture
)

#Prompting the model
text = "### USER: What is the NRM1 coding for testing and commissioning?### Assistant:"

#tokenize the text into pytorch tensors and send them to the GPU. In this case it is 0 - first GPU.
inputs = tokenizer(
    text, 
    return_tensors="pt").to(0) 

outputs = model.generate(
    inputs.input_ids, #provides Inputs IDs created in the previous step to the LLM
    max_new_tokens=250, # The generated text should not exceed 250 tokens
    do_sample=False # The model will generate text deterministically rather than stochastically
    )

print("After attaching Lora adapters:")
print(
    tokenizer.decode(
    outputs[0], #It retrieves the first generated output
    skip_special_tokens=False #This parameter ensures that special tokens, such as [CLS], [SEP], or [PAD], are not skipped during decoding
    )
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


After attaching Lora adapters:
<s> ### USER: What is the NRM1 coding for testing and commissioning?### Assistant: In order to understand the NRM1 coding for the activity: Testing and commissioning , we have to understand first the hierarchy of codification of this activity with the NRM1 method of measurement.\n\n The first level of hierarchy of this activity is lying within section: 09 - Main contractor’s preliminaries.As for level 2, the activity: Testing and commissioninglie under sub-section: 09.2 - Main contractor’s cost items within NRM1.I also have access to level 3 data. The level 3 coding for the activity: Testing and commissioning is: 09.2.11 - Testing and commissioning.I am afraid that in this case I do not have any access to any further level of details.  From my knowledge, NRM1 provides level 4 coding information only in section 09 Preliminaries.</s>


Comparing the responses to the non-finetuned (stock) LLM
----------

In [83]:
model.disable_adapters()

outputs = model.generate(
    inputs.input_ids, 
    max_new_tokens=250, 
    do_sample=False)

print("Before Lora:")
print(
    tokenizer.decode(
        outputs[0], 
        skip_special_tokens=False
        )
        )

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Before Lora:
<s> ### USER: What is the NRM1 coding for testing and commissioning?### Assistant: I'm here to help answer questions, but it seems there might be a misunderstanding in your query. The term "NRM1" is not directly related to testing and commissioning in the context of electrical or electronic systems. Instead, it's a term used in the National Regulatory Reform Act of 1996, specifically in the context of the Occupational Safety and Health Administration (OSHA) and the Environmental Protection Agency (EPA). It refers to a set of rules that streamline the regulatory process for certain industries. If you're looking for information related to testing and commissioning in the context of electrical or electronic systems, you might want to consider terms like "commissioning process," "test and inspection," or "acceptance testing." If you could please provide more context or clarify your question, I'd be happy to help further.</s>
