In [34]:
import os
import getpass

# Reuse a key that is already exported; otherwise prompt for it without echoing.
nvapi_key = os.environ.get("NVIDIA_API_KEY", "")
if not nvapi_key.startswith("nvapi-"):
    nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
    os.environ["NGC_API_KEY"] = nvapi_key
else:
    # The key was exported before the notebook started. The docker login cell
    # reads NGC_API_KEY, so mirror the key there unless one is already set;
    # otherwise that cell would pipe an empty password.
    os.environ.setdefault("NGC_API_KEY", nvapi_key)

Enter your NVIDIA API key:  ········


In [35]:
# Log in to the nvcr.io registry. The key is piped on stdin (--password-stdin);
# the username is the literal string $oauthtoken, single-quoted so the shell
# does not expand it.
! echo -e "$NGC_API_KEY" | docker login nvcr.io --username '$oauthtoken' --password-stdin

https://docs.docker.com/engine/reference/commandline/login/#credentials-store

Login Succeeded


In [36]:
# create loras directory
!mkdir -p loras

In [1]:
from datasets import load_dataset

ds = load_dataset("ajsbsd/nvidia-qa")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split
# identical across kernel restarts, so downstream counts are reproducible.
ds = ds["train"].train_test_split(test_size=0.2, seed=42)

In [4]:
# Preview the first five training rows, each framed by a 64-asterisk banner.
banner = '*' * 64
for row in ds['train'].select(range(5)):
    print(f"\n {banner}\n{row}\n{banner}")


 ****************************************************************
{'Unnamed: 0': 1209, 'question': 'How does the nvJitLink library improve upon the previous JIT LTO implementation in CUDA 11.4?', 'answer': 'The nvJitLink library introduced in CUDA Toolkit 12.0 improves upon the previous CUDA 11.4 implementation of JIT LTO. It offers a more streamlined approach by eliminating the dependency on the CUDA driver and providing compatibility guarantees through the CUDA Toolkit.'}
****************************************************************

 ****************************************************************
{'Unnamed: 0': 259, 'question': 'What difficulty has been associated with understanding compiler heuristics on inlining?', 'answer': 'Understanding compiler heuristics on inlining has been difficult without heavy post-processing of assembly output.'}
****************************************************************

 ****************************************************************
{'Unn

In [7]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with.
%pip install langdetect==1.0.9

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993227 sha256=4603535347a903f0bd04e834d461ff948e5fa8659c395400b9b0f6d957908e9f
  Stored in directory: /home/gsh-3atzc7/.cache/pip/wheels/d1/c1/d9/7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [5]:
from langdetect import detect

def remove_nonEnglish_rows(ds, column='question'):
    """Collect the entries of one text column that langdetect classifies as English.

    Args:
        ds: Mapping-like dataset; ``ds[column]`` must be iterable.
        column: Name of the column to scan. Defaults to ``'question'``.

    Returns:
        A ``(new_ds, corner_case)`` tuple. ``new_ds`` lists the column values
        detected as ``'en'`` (the values themselves, not whole rows), and
        ``corner_case`` lists the positions whose detection raised an error.
    """
    # Column values detected as English
    new_ds = []

    # Positions of values that could not be classified (corner cases)
    corner_case = []

    for i, row in enumerate(ds[column]):
        try:
            # Detect the language of the value; non-string values are stringified first
            if detect(str(row)) == 'en':
                new_ds.append(row)
        except Exception:
            # `except Exception` rather than a bare `except`, so that
            # KeyboardInterrupt / SystemExit still propagate and a long scan
            # can be interrupted from the notebook.
            corner_case.append(i)

    return new_ds, corner_case


In [6]:

# Keep the training questions detected as English; cc_train holds the positions
# of the questions whose language detection raised an error.
filter_train_samples,cc_train = remove_nonEnglish_rows(ds['train'])

print("Count of training samples: ",len(filter_train_samples))

Count of training samples:  5650


In [7]:
# Same filtering for the held-out split; cc_test holds the positions of the
# questions whose language detection raised an error.
filter_test_samples,cc_test = remove_nonEnglish_rows(ds['test'])
print("Count of testing samples: ",len(filter_test_samples))


Count of testing samples:  1413


In [8]:
# save English text samples
import json
def save_jsonl(ds, filename):
    """Write ``ds`` to ``data/<filename>.jsonl`` in JSON Lines format.

    Each element of ``ds`` becomes one line. Dict elements are written as-is;
    any other element (e.g. a plain string) is wrapped as ``{"text": element}``
    so every line is a JSON object and the loaded dataset exposes a ``text``
    column, which is the column ``transform_to_template`` reads.

    Args:
        ds: Iterable of JSON-serializable records.
        filename: Path relative to ``data/`` without the ``.jsonl`` suffix;
            it may contain sub-directories, which are created when missing.
    """
    path = f"data/{filename}.jsonl"
    # Create the target directory so the call does not depend on an earlier mkdir cell.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "w", encoding="utf-8") as write_file:
        for row in ds:
            record = row if isinstance(row, dict) else {"text": row}
            write_file.write(json.dumps(record) + "\n")
    print("dataset saved in jsonl format ....")

In [9]:
def transform_to_template(example):
    """Rewrite a '###'-delimited conversation into Llama-3 chat-template text.

    ``example['text']`` is split on ``"###"`` and everything before the first
    marker is discarded. Even-positioned segments have ``'Human:'`` replaced by
    the user header, odd-positioned segments have ``'Assistant:'`` replaced by
    the assistant header, and each segment is closed with ``<|eot_id|>``. A
    fixed system turn is placed in front.

    Text without any ``"###"`` marker yields only the system turn.

    Returns:
        A dict with the single key ``'text'`` holding the joined template string.
    """
    system_turn = "<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a helpful assistant<|eot_id|>"
    # (marker to replace, header to insert), indexed by segment position modulo 2
    role_swaps = (
        ('Human:', "<|start_header_id|>user<|end_header_id|>"),
        ('Assistant:', "<|start_header_id|>assistant<|end_header_id|>"),
    )

    turns = [system_turn]
    for position, chunk in enumerate(example['text'].split("###")[1:]):
        marker, header = role_swaps[position % 2]
        turns.append(chunk.replace(marker, header) + "<|eot_id|>")

    return {'text': ''.join(turns)}

In [10]:
! mkdir -p data
! mkdir -p data/filtered

In [11]:
# Output names, relative to data/ and without the .jsonl suffix
save_train_filename = 'filtered/train'
save_test_filename = 'filtered/test'

# Write the filtered splits: train first, then test
for samples, target in (
    (filter_train_samples, save_train_filename),
    (filter_test_samples, save_test_filename),
):
    save_jsonl(samples, target)

dataset saved in jsonl format ....
dataset saved in jsonl format ....


In [12]:
dataset = load_dataset('data/filtered/')

Generating train split: 5650 examples [00:00, 128151.73 examples/s]
Generating test split: 1413 examples [00:00, 34353.63 examples/s]


In [13]:
template_dataset = dataset.map(transform_to_template)

Map: 100%|██████████| 5650/5650 [00:00<00:00, 19244.69 examples/s]
Map: 100%|██████████| 1413/1413 [00:00<00:00, 9158.93 examples/s]


In [14]:
!mkdir -p data/ds_preprocess
template_dataset.save_to_disk('data/ds_preprocess/')

Saving the dataset (1/1 shards): 100%|██████████| 5650/5650 [00:00<00:00, 1035743.78 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1413/1413 [00:00<00:00, 439948.89 examples/s]


<h3>Training</h3>

In [2]:
# %pip installs into the environment of the running kernel.
%pip install torch



In [28]:
# If you have limited computing resources and run into out-of-memory errors, uncomment the os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64" line at the end of this cell.

import os
import torch
import json
import re

from peft import LoraConfig, PeftModel
from trl import SFTTrainer
from datasets import load_dataset, load_from_disk
from langdetect import detect
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)

# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"

Set up the paths used for loading and saving the important artifacts.

The Llama-3 family of models is open source but requires an approved access request. For the bootcamp environment, the weights have already been converted to a Hugging Face-compatible format and stored at a shared location so that participants can access them quickly.

If you are running the material in your own environment, please request access to the Llama models [here](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) and generate your Hugging Face user access token from this [link](https://huggingface.co/settings/tokens).

In [23]:
# Path to the base model.
# base_model = "meta-llama/Meta-Llama-3-8B-Instruct" # Use this while running the material in your own standalone environment.

# Shared model weight location. Export BASE_MODEL_PATH to point somewhere else
# without editing the notebook.
# NOTE(review): the default looks like a Docker overlay2 storage directory rather
# than a model checkpoint, and the tokenizer/model loads recorded in this notebook
# fail against it with a "rag" configuration error — confirm the correct location.
base_model = os.environ.get("BASE_MODEL_PATH", "/raid/tmp/docker-container-storage-2072/overlay2")

# Training split of the templated dataset
data_path = "data/ds_preprocess/train"
# Evaluation split of the templated dataset
eval_path = "data/ds_preprocess/test"

# Load the transformed splits written by save_to_disk
dataset = load_from_disk(data_path)
eval_dataset = load_from_disk(eval_path)

In [8]:
!docker info

Client:
 Version:    24.0.7
 Context:    default
 Debug Mode: false

Server:
 Containers: 6
  Running: 3
  Paused: 0
  Stopped: 3
 Images: 3
 Server Version: 24.0.7
 Storage Driver: overlay2
  Backing Filesystem: xfs
  Supports d_type: true
  Using metacopy: false
  Native Overlay Diff: false
  userxattr: true
 Logging Driver: json-file
 Cgroup Driver: none
 Cgroup Version: 1
 Plugins:
  Volume: local
  Network: bridge host ipvlan macvlan null overlay
  Log: awslogs fluentd gcplogs gelf journald json-file local logentries splunk syslog
 Swarm: inactive
 Runtimes: io.containerd.runc.v2 runc
 Default Runtime: runc
 Init Binary: docker-init
 containerd version: 091922f03c2762540fd057fba91260237ff86acb
 runc version: v1.1.9-0-gccaecfc
 init version: de40ad0
 Security Options:
  seccomp
   Profile: builtin
  rootless
 Kernel Version: 5.15.0-1029-nvidia
 Operating System: Ubuntu 22.04.2 LTS
 OSType: linux
 Architecture: x86_64
 CPUs: 256
 Total Memory: 1.968TiB
 Name: dgx01
 ID: 8c29f74a-22f

In [24]:
# Needed for standalone run.
# Never hard-code an access token in the notebook: read it from the HF_TOKEN
# environment variable, or prompt for it without echoing. An empty answer leaves
# the token unset (None). Any token that was previously committed in this cell
# should be revoked.
token = os.environ.get("HF_TOKEN") or getpass.getpass("Enter your Hugging Face access token: ") or None

In [35]:
# %pip installs into the environment of the running kernel.
%pip install transformers --upgrade



In [46]:
# Load the tokenizer from the base model location.
# NOTE(review): the recorded run of this cell failed with a "rag" configuration
# error, so base_model presumably does not point at the intended checkpoint — confirm.
tokenizer = AutoTokenizer.from_pretrained(base_model,
                                          # token=token,
                                          # trust_remote_code=True
                                         )

ValueError: A configuraton of type rag cannot be instantiated because both `question_encoder` and `generator` sub-configurations were not passed, only {'attn_implementation': None}

In [37]:
from transformers import RagTokenizer

# Load the RAG tokenizer from the facebook/rag-token-nq checkpoint.
# NOTE(review): this rebinds `tokenizer` to a RagTokenizer, which has no
# `eos_token` attribute (see the AttributeError recorded for the pad-token cell).
tokenizer = RagTokenizer.from_pretrained(
    "facebook/rag-token-nq",  # Specify a valid RAG model name
    token=token  # `token` replaces the deprecated `use_auth_token` keyword
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [26]:
from transformers import RagTokenizer, RagRetriever, RagTokenForGeneration

# Load the tokenizer for RAG
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")

# Checkpoint names for the question encoder and the generator. These are plain
# strings; nothing is loaded on these two lines.
question_encoder_model = "facebook/dpr-question_encoder-single-nq-base"
generator_model = "facebook/bart-large"

# Load the RAG retriever.
# RagRetriever needs the faiss library; the recorded run of this cell stopped
# here with an ImportError because faiss was not installed.
retriever = RagRetriever.from_pretrained(
    "facebook/rag-token-nq",
    index_name="exact",  # Choose the index type
)

# Instantiate the RAG model.
# NOTE(review): question_encoder / generator are passed as checkpoint-name
# strings; whether from_pretrained accepts names here was never exercised
# because the cell failed earlier — confirm.
model = RagTokenForGeneration.from_pretrained(
    "facebook/rag-token-nq",
    retriever=retriever,
    question_encoder=question_encoder_model,
    generator=generator_model,
)


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ImportError: 
RagRetriever requires the faiss library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/facebookresearch/faiss/blob/master/INSTALL.md and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [38]:
# Reuse the end-of-sequence token as the padding token and pad on the right.
# NOTE(review): the recorded AttributeError shows `tokenizer` was a RagTokenizer
# (no `eos_token`) when this cell ran; it presumably expects the base model's
# tokenizer — confirm.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

AttributeError: 'RagTokenizer' object has no attribute 'eos_token'

In [39]:
! mkdir -p model

In [40]:
# Hyperparameters for the fine-tuning run; outputs are written under model/results.
training_params = TrainingArguments(
    output_dir="model/results",             # Directory to save the model results
    num_train_epochs=2,                     # Number of training epochs
    per_device_train_batch_size=5,          # Batch size per device during training
    gradient_accumulation_steps=4,          # Accumulate gradients over 4 steps (5 x 4 = 20 samples per device per optimizer update)
    group_by_length=True,                   # Group sequences of similar lengths together
    save_steps=100,                         # Save model checkpoint every 100 steps
    logging_steps=25,                       # Log metrics every 25 steps
    learning_rate=2e-4,                     # Initial learning rate
    weight_decay=0.001,                     # Weight decay coefficient
    fp16=False,                             # fp16 (half-precision) training is turned off
    bf16=False,                             # bfloat16 training is turned off
    max_grad_norm=0.3,                      # Maximum gradient norm (for gradient clipping)
    max_steps=-1,                           # -1: no step cap; run length comes from num_train_epochs
    warmup_ratio=0.03,                      # Warmup ratio. NOTE(review): presumably has no effect with the "constant" scheduler below — confirm
    optim="paged_adamw_32bit",              # Optimizer to use (32-bit AdamW with paged memory)
    lr_scheduler_type="constant",           # Learning rate scheduler type (constant)
    report_to="tensorboard"                 # Reporting tool (TensorBoard in this case)
)


In [41]:
# LoRA adapter configuration for causal language modeling.
peft_params = LoraConfig(
    lora_alpha=16,                # Alpha parameter for Lora scaling
    lora_dropout=0.1,             # Dropout rate for Lora layers
    r=64,                         # Rank of the Lora matrices
    bias="none",                  # Bias handling mode ("none" here)
    task_type="CAUSAL_LM",        # Type of task (Causal Language Modeling in this case)
)


In [44]:
# bitsandbytes quantization settings passed to the model loader via quantization_config.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # Load the weights in 4-bit precision
    bnb_4bit_quant_type="nf4",              # 4-bit quantization type: "nf4"
    bnb_4bit_compute_dtype=torch.bfloat16,  # Compute dtype: bfloat16
    bnb_4bit_use_double_quant=False,        # Double quantization is turned off
)

In [43]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was executed with.
%pip install bitsandbytes==0.44.1


Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl (122.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m112.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.44.1


In [45]:
# Load the base model with the 4-bit quantization config.
# NOTE(review): the recorded run failed with the same "rag" configuration error
# as the tokenizer load, so base_model presumably points at the wrong location — confirm.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map={"": 0},  # presumably places the whole model on device 0 — confirm
    # token=token
)
# Turn off the model's cache flag and set pretraining_tp to 1 on the loaded config.
model.config.use_cache = False
model.config.pretraining_tp = 1

ValueError: A configuraton of type rag cannot be instantiated because both `question_encoder` and `generator` sub-configurations were not passed, only {'attn_implementation': None}