In [1]:
import joblib
import torch
import pandas as pd
import transformers
from transformers import StoppingCriteria, StoppingCriteriaList

from torch import bfloat16

from langchain.llms import HuggingFacePipeline
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

import argparse
import bitsandbytes as bnb
from functools import partial
import os
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, BitsAndBytesConfig, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import load_dataset




  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Constants
import os
from dotenv import load_dotenv

# Seed handed to the dataset shuffle during preprocessing
SEED = 42

# load the environment before reading any of the variables below
load_dotenv()
HF_PAT = os.environ.get('HUGGING_FACE_PAT')          # Hugging Face access token (None when unset)
CONVERSATION_PATH = os.environ.get('CONVERSATION')   # path of the CSV fed to CSVLoader (None when unset)

In [3]:
# GPU setup
# Handle for the CUDA device; later cells use it to move the stop-token tensors
# (`.to(device)`) and to format the "Model loaded on ..." message.
# No availability check is performed here.
device = torch.device("cuda")
device

device(type='cuda')

In [40]:
### Functions
# Loading in the model
# SOURCE https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/ 
def load_model(model_name, bnb_config):
    """Load a quantized causal language model together with its tokenizer.

    Args:
        model_name: identifier or path forwarded to both `from_pretrained` calls.
        bnb_config: quantization config passed as `quantization_config`.

    Returns:
        A `(model, tokenizer)` tuple. The tokenizer's pad token is set to its
        EOS token.
    """
    # Every visible GPU gets the same memory cap.
    cap_mb = 40960
    per_gpu_cap = f'{cap_mb}MB'
    memory_budget = {}
    for gpu_index in range(torch.cuda.device_count()):
        memory_budget[gpu_index] = per_gpu_cap

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",  # let the library dispatch the weights across the available resources
        max_memory=memory_budget,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    # Needed for LLaMA tokenizer
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Configure quantization settings for loading the model with less GPU memory usage
def bnb_cfg():
    """Build the BitsAndBytes quantization config used when loading the model.

    Returns:
        A `BitsAndBytesConfig` with 4-bit loading, double quantization, the
        "nf4" quant type and bfloat16 as compute dtype.
    """
    quantization_options = {
        "load_in_4bit": True,
        "bnb_4bit_use_double_quant": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16,
    }
    return BitsAndBytesConfig(**quantization_options)

# Create a peft config for fine tuning with Lora
def create_peft_config(modules):
    """Create the LoRA configuration used for fine-tuning.

    Args:
        modules: names of the modules LoRA adapters are attached to
            (forwarded as `target_modules`).

    Returns:
        A `LoraConfig` for the "CAUSAL_LM" task type.
    """
    return LoraConfig(
        task_type="CAUSAL_LM",
        target_modules=modules,
        r=16,              # dimension of the updated matrices
        lora_alpha=64,     # parameter for scaling
        lora_dropout=0.1,  # dropout probability for layers
        bias="none",
    )

# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(model):
    """Collect the leaf names of every `bnb.nn.Linear4bit` module in `model`.

    Only the last component of each dotted module name is kept, so repeated
    layers collapse to one entry. 'lm_head' is always excluded.

    Returns:
        A list of unique module names (order not meaningful).
    """
    target_cls = bnb.nn.Linear4bit  # only the 4-bit linear layer type is matched
    leaf_names = {
        qualified_name.split('.')[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, target_cls)
    }

    # needed for 16-bit
    leaf_names.discard('lm_head')
    return list(leaf_names)

# SOURCE https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/ 
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing `named_parameters()`; each parameter must
            provide `numel()` and `requires_grad` (and optionally `ds_numel`).
        use_4bit: when True, the trainable count is halved before printing.

    Prints one line with the total parameter count, the trainable count and
    the trainable percentage. A model without parameters reports 0.0 percent.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params

    if use_4bit:
        # Floor division keeps the count an int. True division produced a
        # float, which the `,d` format spec below rejects with ValueError.
        trainable_params //= 2

    # Guard the percentage so a parameter-less model does not divide by zero.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )

# Display results
def print_results(query,chat_history,chain):
    """Run `query` through `chain` and print the first paragraph of the answer.

    Args:
        query: question text, sent under the "question" key.
        chat_history: history object, sent under the "chat_history" key.
            It is passed through unchanged; this function does not update it.
        chain: callable taking the input dict and returning a mapping with
            an 'answer' entry.

    The text after the "Helpful Answer: " marker is extracted and printed up
    to the first blank line. If the marker is missing a notice is printed.
    """
    result = chain({"question": query, "chat_history": chat_history})
    answer_text = result['answer']

    # Find the start of the "Helpful Answer" section
    start_marker = "Helpful Answer: "
    start_index = answer_text.find(start_marker)
    if start_index == -1:
        print("Helpful Answer not found in the output.")
        return

    # Extract the helpful answer text
    helpful_answer = answer_text[start_index + len(start_marker):].strip()

    # Keep only the first paragraph. `partition` returns the whole string when
    # no blank line follows; slicing with find()'s -1 sentinel instead would
    # silently drop the last character of the answer.
    first_paragraph = helpful_answer.partition('\n\n')[0]

    print(first_paragraph)

### Fine-Tuning Dataset + Exploration

In [5]:
# Load the "train" split of the databricks-dolly-15k dataset used for fine-tuning
ds = load_dataset("databricks/databricks-dolly-15k", split="train")

In [6]:
# Number of rows and the column names, displayed together as a tuple
len(ds),ds.column_names

(15011, ['instruction', 'context', 'response', 'category'])

In [7]:
# First five values of the 'instruction' column
# (the next cells show the same slice for 'context', 'response' and 'category')
ds['instruction'][:5]

['When did Virgin Australia start operating?',
 'Which is a species of fish? Tope or Rope',
 'Why can camels survive for long without water?',
 "Alice's parents have three daughters: Amy, Jessy, and what’s the name of the third daughter?",
 'When was Tomoaki Komorida born?']

In [8]:
# First five values of the 'context' column
ds['context'][:5]

["Virgin Australia, the trading name of Virgin Australia Airlines Pty Ltd, is an Australian-based airline. It is the largest airline by fleet size to use the Virgin brand. It commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route. It suddenly found itself as a major airline in Australia's domestic market after the collapse of Ansett Australia in September 2001. The airline has since grown to directly serve 32 cities in Australia, from hubs in Brisbane, Melbourne and Sydney.",
 '',
 '',
 '',
 'Komorida was born in Kumamoto Prefecture on July 10, 1981. After graduating from high school, he joined the J1 League club Avispa Fukuoka in 2000. Although he debuted as a midfielder in 2001, he did not play much and the club was relegated to the J2 League at the end of the 2001 season. In 2002, he moved to the J2 club Oita Trinita. He became a regular player as a defensive midfielder and the club won the championship in 2002 and was promoted in 2003. He played ma

In [9]:
# First five values of the 'response' column
ds['response'][:5]

['Virgin Australia commenced services on 31 August 2000 as Virgin Blue, with two aircraft on a single route.',
 'Tope',
 'Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.',
 'The name of the third daughter is Alice',
 'Tomoaki Komorida was born on July 10,1981.']

In [10]:
# First five values of the 'category' column
ds['category'][:5]

['closed_qa', 'classification', 'open_qa', 'open_qa', 'closed_qa']

In [11]:
# Indices of rows in which at least one field is None
# (empty strings, as seen in 'context', are not counted)
null_rows = [
    index
    for index, row in enumerate(ds)
    if not all(value is not None for value in row.values())
]

print(
    f"Rows with null values found: {null_rows}"
    if null_rows
    else "No rows with null values found."
)

No rows with null values found.


### Preprocessing Functions

In [12]:
# Creating the prompt format based on the fine-tuning dataset -> Preprocessing
# SOURCE https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/ 
def create_prompt_formats(sample):
    """Add a formatted training prompt to `sample` under the "text" key.

    The prompt is made of an intro line, the instruction, an optional input
    section (only when `sample["context"]` is truthy), the response and an
    end marker, joined by blank lines.

    Args:
        sample: mapping with 'instruction', 'context' and 'response' entries.

    Returns:
        The same `sample` object, mutated in place.
    """
    sections = [
        "Instruction with Response Task",
        f"### Instruction:\n{sample['instruction']}",
    ]

    # The input section is skipped entirely for samples without context.
    if sample["context"]:
        sections.append(f"Input:\n{sample['context']}")

    sections.append(f"### Response:\n{sample['response']}")
    sections.append("### End")

    sample["text"] = "\n\n".join(sections)
    return sample

# Preprocessing
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the maximum sequence length advertised by `model.config`.

    The config attributes "n_positions", "max_position_embeddings" and
    "seq_length" are checked in that order and the first truthy value wins.
    If none is set, 1024 is used. The chosen value is also printed.
    """
    config = model.config
    candidates = (
        getattr(config, attribute, None)
        for attribute in ("n_positions", "max_position_embeddings", "seq_length")
    )
    max_length = next((value for value in candidates if value), None)

    if max_length:
        print(f"Found max lenth: {max_length}")
        return max_length

    max_length = 1024
    print(f"Using default max length: {max_length}")
    return max_length

# Preprocessing
# Tokenizing batch
def preprocess_batch(batch, tokenizer, max_length):
    """Tokenize the "text" entry of `batch`.

    Args:
        batch: mapping whose "text" entry is handed to the tokenizer.
        tokenizer: callable invoked with the text, `max_length` and
            `truncation=True`.
        max_length: forwarded to the tokenizer as `max_length`.

    Returns:
        Whatever the tokenizer call returns.
    """
    tokenizer_options = {"max_length": max_length, "truncation": True}
    texts = batch["text"]
    return tokenizer(texts, **tokenizer_options)

# PreProcessing
# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset: str):
    """Turn the raw instruction dataset into shuffled, tokenized samples.

    Steps: add the prompt text to each sample, tokenize in batches while
    dropping the original text columns, drop samples whose `input_ids` are
    not shorter than `max_length`, then shuffle with `seed`.

    Args:
        tokenizer: tokenizer forwarded to `preprocess_batch`.
        max_length: truncation length and filter threshold.
        seed: seed for the final shuffle.
        dataset: object providing `map`, `filter` and `shuffle`.
            NOTE(review): the `str` annotation does not match this usage.

    Returns:
        The processed dataset.
    """
    print("Preprocessing dataset...")

    tokenize_batch = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    raw_columns = ["instruction", "context", "response", "text", "category"]

    # Add prompt to each sample
    prompted = dataset.map(create_prompt_formats)

    # Tokenize batch-wise and remove the raw columns
    tokenized = prompted.map(
        tokenize_batch,
        batched=True,
        remove_columns=raw_columns,
    )

    # Keep only samples strictly shorter than max_length
    within_limit = tokenized.filter(lambda sample: len(sample["input_ids"]) < max_length)

    return within_limit.shuffle(seed=seed)

In [13]:
# Identifier handed to `from_pretrained` for both the model and the tokenizer.
# NOTE(review): there is no organisation prefix, so this presumably resolves to a
# local directory — confirm.
model_id = 'Llama-2-13b-chat-hf'

# Quantization settings (4-bit, nf4, double quantization — see bnb_cfg)
bnb_config = bnb_cfg()

# Load the quantized model and its tokenizer (load_model sets pad token = EOS token)
model, tokenizer = load_model(model_id, bnb_config)

Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.50s/it]


In [14]:
# Preprocess the dataset
# Sequence limit read from the model config (falls back to 1024 inside get_max_length)
max_length = get_max_length(model)

# Prompt formatting + tokenization + length filter + shuffle with SEED
dataset = preprocess_dataset(tokenizer, max_length, SEED, ds)

Found max lenth: 4096
Preprocessing dataset...


### Training the Model

In [15]:
# Trainer Function 
# SOURCE https://blog.ovhcloud.com/fine-tuning-llama-2-models-using-a-single-gpu-qlora-and-ai-notebooks/
def train(model, tokenizer, dataset, output_dir):
    """Fine-tune `model` with LoRA adapters and save the adapter weights.

    Args:
        model: quantized base model; it is prepared for k-bit training and
            wrapped with PEFT inside this function.
        tokenizer: tokenizer used by the language-modeling data collator.
        dataset: tokenized training dataset.
        output_dir: directory that receives the final `save_pretrained` output.
            Note that the Trainer's own `output_dir` is the fixed string
            "outputs", so metrics and trainer state are written there.

    Returns:
        None.
    """
    # 1 - Gradient checkpointing to reduce memory usage during fine-tuning
    model.gradient_checkpointing_enable()

    # 2 - PEFT helper that prepares a quantized model for training
    model = prepare_model_for_kbit_training(model)

    # Attach LoRA adapters to every 4-bit linear layer found in the model
    target_modules = find_all_linear_names(model)
    lora_config = create_peft_config(target_modules)
    model = get_peft_model(model, lora_config)

    # Report how many parameters are trainable
    print_trainable_parameters(model)

    # Training parameters
    training_args = TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=20,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit",
    )
    collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=training_args,
        data_collator=collator,
    )

    # Cache is switched off for training; re-enable it for inference
    model.config.use_cache = False

    ### SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py
    # Report the element count per parameter dtype, and its share of the total
    numel_by_dtype = {}
    for _, parameter in model.named_parameters():
        numel_by_dtype[parameter.dtype] = numel_by_dtype.get(parameter.dtype, 0) + parameter.numel()
    total_numel = sum(numel_by_dtype.values())
    for dtype, count in numel_by_dtype.items():
        print(dtype, count, count/total_numel)

    # Launch training
    print("Training...")
    train_result = trainer.train()
    metrics = train_result.metrics
    trainer.log_metrics("train", metrics)
    trainer.save_metrics("train", metrics)
    trainer.save_state()
    print(metrics)

    # Saving model
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free memory for merging weights
    # NOTE(review): these `del`s only drop this function's local names; any
    # reference the caller still holds is unaffected — confirm memory is freed.
    del model
    del trainer
    torch.cuda.empty_cache()
    

In [16]:
# Directory for the saved LoRA adapter; train() creates it if needed.
# (Trainer metrics/state go to the fixed "outputs" directory set inside train().)
output_dir = 'training/llama2-13b-train'
train(model=model, tokenizer=tokenizer, dataset=dataset, output_dir=output_dir)

max_steps is given, it will override any value given in num_train_epochs


all params: 6,734,566,400 || trainable params: 62,586,880 || trainable%: 0.9293379303528732
torch.float32 390681600 0.05801139624965313
torch.uint8 6343884800 0.9419886037503469
Training...


  attn_output = torch.nn.functional.scaled_dot_product_attention(
  5%|▌         | 1/20 [00:03<01:10,  3.71s/it]

{'loss': 2.7124, 'grad_norm': 2.366163730621338, 'learning_rate': 0.0001, 'epoch': 0.0}


 10%|█         | 2/20 [00:07<01:10,  3.91s/it]

{'loss': 1.8355, 'grad_norm': 1.244856595993042, 'learning_rate': 0.0002, 'epoch': 0.0}


 15%|█▌        | 3/20 [00:11<01:03,  3.74s/it]

{'loss': 2.387, 'grad_norm': 1.6521025896072388, 'learning_rate': 0.00018888888888888888, 'epoch': 0.0}


 20%|██        | 4/20 [00:14<00:55,  3.46s/it]

{'loss': 1.527, 'grad_norm': 1.5103988647460938, 'learning_rate': 0.00017777777777777779, 'epoch': 0.0}


 25%|██▌       | 5/20 [00:18<00:54,  3.64s/it]

{'loss': 1.6932, 'grad_norm': 1.3844774961471558, 'learning_rate': 0.0001666666666666667, 'epoch': 0.0}


 30%|███       | 6/20 [00:22<00:52,  3.73s/it]

{'loss': 1.5146, 'grad_norm': 1.065812110900879, 'learning_rate': 0.00015555555555555556, 'epoch': 0.0}


 35%|███▌      | 7/20 [00:25<00:45,  3.47s/it]

{'loss': 1.2965, 'grad_norm': 1.4830759763717651, 'learning_rate': 0.00014444444444444444, 'epoch': 0.0}


 40%|████      | 8/20 [00:28<00:42,  3.58s/it]

{'loss': 1.5145, 'grad_norm': 0.9698284864425659, 'learning_rate': 0.00013333333333333334, 'epoch': 0.0}


 45%|████▌     | 9/20 [00:32<00:38,  3.52s/it]

{'loss': 1.4825, 'grad_norm': 1.5077791213989258, 'learning_rate': 0.00012222222222222224, 'epoch': 0.0}


 50%|█████     | 10/20 [00:36<00:35,  3.58s/it]

{'loss': 1.1593, 'grad_norm': 0.8885797262191772, 'learning_rate': 0.00011111111111111112, 'epoch': 0.0}


 55%|█████▌    | 11/20 [00:39<00:33,  3.68s/it]

{'loss': 1.8113, 'grad_norm': 0.6621375679969788, 'learning_rate': 0.0001, 'epoch': 0.0}


 60%|██████    | 12/20 [00:43<00:28,  3.53s/it]

{'loss': 1.5595, 'grad_norm': 1.740478754043579, 'learning_rate': 8.888888888888889e-05, 'epoch': 0.0}


 65%|██████▌   | 13/20 [00:46<00:23,  3.41s/it]

{'loss': 1.3677, 'grad_norm': 1.433552622795105, 'learning_rate': 7.777777777777778e-05, 'epoch': 0.0}


 70%|███████   | 14/20 [00:49<00:20,  3.40s/it]

{'loss': 1.3067, 'grad_norm': 1.295323133468628, 'learning_rate': 6.666666666666667e-05, 'epoch': 0.0}


 75%|███████▌  | 15/20 [00:52<00:16,  3.32s/it]

{'loss': 1.3175, 'grad_norm': 2.0544493198394775, 'learning_rate': 5.555555555555556e-05, 'epoch': 0.0}


 80%|████████  | 16/20 [00:56<00:13,  3.33s/it]

{'loss': 1.595, 'grad_norm': 1.437535285949707, 'learning_rate': 4.4444444444444447e-05, 'epoch': 0.0}


 85%|████████▌ | 17/20 [00:59<00:09,  3.29s/it]

{'loss': 1.4228, 'grad_norm': 2.224057197570801, 'learning_rate': 3.3333333333333335e-05, 'epoch': 0.0}


 90%|█████████ | 18/20 [01:02<00:06,  3.27s/it]

{'loss': 1.3196, 'grad_norm': 1.6524991989135742, 'learning_rate': 2.2222222222222223e-05, 'epoch': 0.0}


 95%|█████████▌| 19/20 [01:06<00:03,  3.36s/it]

{'loss': 1.1481, 'grad_norm': 1.2832984924316406, 'learning_rate': 1.1111111111111112e-05, 'epoch': 0.01}


100%|██████████| 20/20 [01:09<00:00,  3.46s/it]

{'loss': 1.1746, 'grad_norm': 1.368126630783081, 'learning_rate': 0.0, 'epoch': 0.01}


100%|██████████| 20/20 [01:10<00:00,  3.51s/it]


{'train_runtime': 70.207, 'train_samples_per_second': 1.139, 'train_steps_per_second': 0.285, 'train_loss': 1.5572651326656342, 'epoch': 0.01}
***** train metrics *****
  epoch                    =     0.0053
  total_flos               =  1267740GF
  train_loss               =     1.5573
  train_runtime            = 0:01:10.20
  train_samples_per_second =      1.139
  train_steps_per_second   =      0.285
{'train_runtime': 70.207, 'train_samples_per_second': 1.139, 'train_steps_per_second': 0.285, 'total_flos': 1361225849702400.0, 'train_loss': 1.5572651326656342, 'epoch': 0.005333688912594173}
Saving last checkpoint of the model...


In [17]:
# Reload the saved adapter from output_dir in bfloat16, merge it and unload the adapter wrapper.
# This rebinds `model`, replacing the earlier reference to the quantized model.
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

# Save the merged model in safetensors format
output_merged_dir = "llama2-13b-fined-tuned-model-final"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:08<00:00,  2.99s/it]
Some parameters are on the meta device device because they were offloaded to the cpu.
Some parameters are on the meta device device because they were offloaded to the cpu.
Saving checkpoint shards: 100%|██████████| 1/1 [02:17<00:00, 137.83s/it]


In [18]:
# Save the tokenizer next to the merged model so the directory can be loaded on its own.
# The tokenizer is reloaded from model_id here, so the pad-token assignment made in
# load_model is not applied to this instance.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_merged_dir)

('llama2-13b-fined-tuned-model-final\\tokenizer_config.json',
 'llama2-13b-fined-tuned-model-final\\special_tokens_map.json',
 'llama2-13b-fined-tuned-model-final\\tokenizer.json')

### Model Evaluation

In [15]:
# Hugging Face access token read from the environment in the constants cell
hf_auth = HF_PAT
# Same directory name the merged model was saved to (output_merged_dir)
model_id_refined = 'llama2-13b-fined-tuned-model-final'

# Load the configuration of the merged, fine-tuned model
model_config = transformers.AutoConfig.from_pretrained(
    model_id_refined,
    use_auth_token=hf_auth
)



In [18]:
# Load the merged model for causal language modeling, quantized again with bnb_cfg().
# NOTE(review): torch_dtype is float32 while bnb_cfg() uses bfloat16 as compute dtype — confirm intended.
# NOTE(review): trust_remote_code=True allows model-repo code to run; presumably unnecessary for a
# locally saved directory — confirm.
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_id_refined,
    trust_remote_code=True,
    config=model_config,
    quantization_config=bnb_cfg(),
    # offload_folder="offload", 
    # offload_state_dict = True,
    device_map='auto',
    low_cpu_mem_usage=True,
    torch_dtype=torch.float32,
    use_auth_token=hf_auth
)



In [19]:
# Set the model in evaluation mode for inference
model.eval()

# Print device information.
# NOTE(review): `device` is the torch.device("cuda") handle created in the GPU-setup cell,
# not a value read back from `model`; with device_map='auto' the real placement may differ — confirm.
print(f"Model loaded on {device}")

Model loaded on cuda


In [20]:
# Tokenizer saved alongside the merged model
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_id_refined,
    use_auth_token=hf_auth
)

# Text sequences that should end generation
stop_list = ['\nHuman:', '\n```\n']

# Token ids for each stop sequence, moved to `device` so they can be compared with generated ids.
# NOTE(review): the tensors displayed below both start with ids 1, 29871 — presumably tokens the
# tokenizer prepends to every encoded string. If so, these sequences cannot match the tail of a
# running generation and the stopping criterion never fires — TODO confirm
# (e.g. encode with add_special_tokens=False and inspect).
stop_token_ids = [tokenizer(x)['input_ids'] for x in stop_list]
stop_token_ids = [torch.LongTensor(x).to(device) for x in stop_token_ids]

stop_token_ids



[tensor([    1, 29871,    13, 29950,  7889, 29901], device='cuda:0'),
 tensor([    1, 29871,    13, 28956,    13], device='cuda:0')]

In [21]:
class StopOnTokens(StoppingCriteria):
    """Stopping criterion that fires when the output ends with a stop sequence.

    The stop sequences are read from the module-level `stop_token_ids` list of
    1-D id tensors. Only the first row of `input_ids` is inspected.
    """

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True if the first sequence ends with any stop sequence.

        Args:
            input_ids: id tensor indexed as `input_ids[0]` for the sequence to check.
            scores: unused.
            **kwargs: unused.
        """
        sequence = input_ids[0]
        for stop_ids in stop_token_ids:
            stop_len = len(stop_ids)
            # Skip stop sequences longer than what has been produced so far:
            # comparing tensors of different lengths with torch.eq raises a
            # shape-mismatch error instead of returning False.
            if stop_len == 0 or sequence.shape[0] < stop_len:
                continue
            if torch.eq(sequence[-stop_len:], stop_ids).all():
                return True
        return False

stopping_criteria = StoppingCriteriaList([StopOnTokens()])

In [22]:
# Text-generation pipeline around the fine-tuned model, wrapped below for use with LangChain
generate_text = transformers.pipeline(
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,  # langchain expects the full text
    task='text-generation',
    # we pass model parameters here too
    stopping_criteria=stopping_criteria,  # without this model rambles during chat
    # near-zero temperature to keep outputs as deterministic as possible
    # NOTE(review): temperature presumably only takes effect when sampling is enabled
    # (do_sample is not set here) — confirm against the generation docs
    temperature=0.0001,
    max_new_tokens=512,  # max number of tokens to generate in the output
    repetition_penalty=1.1  # without this output begins repeating
)

# LangChain LLM wrapper around the pipeline
llm = HuggingFacePipeline(pipeline=generate_text)

In [23]:
# Load the conversation CSV named by the CONVERSATION environment variable.
# CONVERSATION_PATH is None when that variable is unset.
loader = CSVLoader(file_path=CONVERSATION_PATH,
                   encoding="utf-8", csv_args={'delimiter': ','})
data = loader.load()

# Alternative input kept for reference (disabled):
# loader = CSVLoader(file_path="test.csv",
#                    encoding="utf-8", csv_args={'delimiter': ','})
# data = loader.load()

In [24]:
# Initialize embeddings (sentence-transformers model, run on the GPU)
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
# Initialize the vector store: build a FAISS index over the loaded CSV documents
vectorstore = FAISS.from_documents(data, embeddings)

In [30]:
# Conversational retrieval chain: the LLM answers using documents fetched from the vector store;
# source documents are included in the chain result.
chain = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever(), return_source_documents=True)

In [41]:
# Start with an empty history.
# NOTE(review): print_results never appends to chat_history, so the later queries
# are also sent with an empty history — confirm this is intended.
chat_history = []
query = "Where is paris located?"
print_results(query,chat_history,chain)

Paris is located in France.


In [42]:
# Second question; reuses the chat_history list created in the previous query cell
query = "What is the solar system?"
print_results(query,chat_history,chain)

The solar system consists of eight planets and their moons, as well as asteroids, comets, meteoroids, and other celestial bodies orbiting around the Sun.


In [43]:
# Third question; reuses the same chat_history list
query = "Who is the fariest of them all?"
print_results(query,chat_history,chain)

The fairest of them all is the one who tells the most jokes.
