In [None]:
# Requires Python 3.10 or higher

In [4]:
#%pip install -r requirements.txt

In [3]:
#%pip install azure-ai-ml

In [20]:
import os
import sys
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

import torch
from awq import AutoAWQForCausalLM
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig

import time
import onnxruntime_genai as og

In [1]:
experiment_name = "finetuning_experiment"
output_name = "model_dir"
output_directory = "./models"
finetuned_model_path = f"{output_directory}/named-outputs/model_dir/merged"
finetuned_quantized_awq_model_path = f"{output_directory}/named-outputs/model_dir/merged-awq"
finetuned_quantized_onnx_model_path = f"{output_directory}/named-outputs/model_dir/merged-onnx"
finetuned_quantized_awq_to_onnx_model_path = f"{output_directory}/named-outputs/model_dir/merged-awq-onnx"

In [15]:
def generate_output_onnx(model, tokenizer, prompt):
    tokenizer_stream = tokenizer.create_stream()
    input_tokens = tokenizer.encode(prompt)
    
    search_options = {
        "max_length": 2048, 
        "temperature": 0.0, 
        "do_sample": False
    }

    params = og.GeneratorParams(model)
    params.set_search_options(**search_options)
    params.input_ids = input_tokens
    generator = og.Generator(model, params)
    
    started_timestamp = time.time()

    first = True
    new_tokens = []
    new_tokens_decoded = []

    while not generator.is_done():
        generator.compute_logits()
        generator.generate_next_token()
        if first:
            first_token_timestamp = time.time()
            first = False
        new_token = generator.get_next_tokens()[0]
        new_token_decoded = tokenizer_stream.decode(new_token)
        print(new_token_decoded, end='', flush=True)
        new_tokens.append(new_token)
        new_tokens_decoded.append(new_token_decoded)
    
    prompt_length = len(input_tokens)
    new_tones_length = len(new_tokens)
    first_token_time = first_token_timestamp - started_timestamp
    run_time = time.time() - first_token_timestamp
    prompt_tokens_per_second = len(input_tokens)/first_token_time
    new_tokens_per_second = len(new_tokens)/run_time
    
    generated_output = "".join(new_tokens_decoded)
    
    del generator
        
    return generated_output, first_token_time, run_time, prompt_tokens_per_second, new_tokens_per_second

def get_formatted_context(chunks):
    BEGIN = "<DOCUMENT>"
    END = "</DOCUMENT>"
    NEW_LINE = "\n"

    context = [f"{BEGIN}{chunk}{END}{NEW_LINE}" for chunk in chunks]
    return "".join(context)


def get_chat_template_input(meta_prompt, context, query):
    messages = [
        {"content": meta_prompt, "role": "system"},
        {"content": f"{context}{query}.", "role": "user"}
    ]
    return messages

meta_prompt = "The following is a conversation with an AI assistant. The assistant is helpful, clever, friendly and gives concise and accurate answers."
chunks = [
    "Various. Verify the error codes in the balance\u2019s manual.MAINTENANCE MANUAL FOR LABORATORY EQUIPMENT\n29BASIC DEFINITIONS\nASTM. American Society of Testing and Materials.",
    "Preventive maintenance\nFrequency: Quarterly1. Verify the stability of the lamp. Use the calibration plate, \nconducting readings with intervals of 30 minutes with the same plate. Compare readings. There must be no diff erences.",
    "2. A measuring device known as \u201cload cell\u201d produces an \nexit signal corresponding to the load\u2019s force in the form of changes in the voltage or frequency. 3. A digital analogous electronic circuit shows the fi  nal \nresult of the weight digitally. Laboratory balances operate according to the principle \nof compensation of the electromagnetic force applicable to displacements or torques. The combination of their mechanical components and automatic reading systems provides weight measurements at defi  ned levels of accuracy \ndepending on the model. Principle. The mobile parts (weighing plate, support \ncolumn [a], bobbin, position and load indicator [G] -the object in the process of being weighed-) are maintained in equilibrium by a compensation force [F] equal to the weight. The compensation force is generated by an electrical current through a bobbin in the air gap of a cylindrical electromagnet. The force F is calculated with the equation [F = I x L x B] where: I = electrical intensity, L = total length of the wire of the coil and B = magnetic fl  ow intensity in the \nelectromagnet\u2019s air gap.With any change in the load (weight\/mass), the mobile \nmechanical system responds by moving vertically a fraction of distance. Detected by a photosensor [e], an electrical signal is sent to the servo-amplifi  er [f]. This changes the \nfl ow of electrical current passing through the bobbin of the \nmagnet [c] in such a manner that the mobile system returns to the balanced position upon adjusting of the magnetic fl ow in the electromagnet. Consequently, the weight of \nthe mass [G] can be measured indirectly at the start of the electrical current fl  ow, which passes through the circuit \nmeasuring the voltage [V] by means of a precision resistor [R], [V = I x R]. To date, many systems developed use the electronic system for carrying out very exact measurements of mass and weight. The following diagram explains how electronic balances function. Transfer\nMechanism\nLoad Cell\nScreen and\nSignal ProcessorPFigure 12. Components of electronic balances  \nG\nb\na\ne\nfc dR V=I*R\nIFigure 13. Compensation force principle  \nMAINTENANCE MANUAL FOR LABORATORY EQUIPMENT\n25The signal processing system\nThe signal processing system is composed of the circuit which \ntransf orms the electrical signal emitted by the transducer \ninto numerical data which can be read on a screen. The signal process comprises the following functions:1. Tare setting. This setting is used to adjust the reading \nvalue at zero with any load within the balance\u2019s capacity range. It is controlled by a button generally located on the front part of the balance. It is commonly used for taring the weighing container. 2. Repeatability setting control. During a reading, weighed \nvalues are averaged within a predefi  ned period of time. This function is very useful when weighing operations need to be carried out in unstable conditions, e.g. in the presence of air currents or vibrations. This control defi nes the time period allowed for a result to lie within \npreset limits for it to be considered stable. In addition, it can be adjusted to suit a particular application.",
    "Any spill must be cleaned immediately to avoid corrosion \nor contamination. Use 70% ethanol to disinfect the pan of the balance. Very important:  Never lubricate a balance unless the \nmanufacturer has expressly indicated it. Any substance interfering with the mechanism of the balance retards its response or defi  nitely alters the measurement process. Note:  In general, the manufacturer or the specialized \ninstallation representative carries out the maintenance of the balances, according to procedures which vary depending on the type and model. 1 Guidelines for calibration in laboratories, Drinking Water Inspectorate by \nLGC (Teddington) Ltd., December 2000. CapacityResolution\n100 g 10 g 1 g 100 mg 10 mg 1 mg 0.1 mg \u00980.01 mg \nUp to 200 g \u2013 \u2013 \u2013 M1 M1 F2 F1 F2\n200 g to 1 kg \u2013 \u2013 M1 M1 F2 F1\/E2 E2 E2\n1 to 30 kg M2 M2 M1 F2 E2 E2 E2 \u2013\n30 to 100 kg M2 M1 F2 F1 E2 \u2013 \u2013 \u2013\nMore than \n100 kgM2 M1\/F2 F1 E2 \u2013 \u2013 \u2013 \u2013Table of standard weights\u2019 use according to the balance\u2019s capacity  CHAPTER 4  BALANCES\n28FUNCTIONAL ERROR PROBABLE CAUSE\nReadings not reproducible (hysteresis). The measurement cell is dirty.",
    "Check the lubrication state of elements such as for \nO-rings as the manufacturer recommends. Always use lubricants according to the manufacturer\u2019s instructions (frequency and type of lubricants). In recently manufactured centrifuges, there are sealed ball bearings which do not require lubrication. 5.",
    "Remove the cover of the boiling tank.3. Visually verify if the interior walls or the immersion \nresistors show solid deposits or sediments. The quantity of deposits present depends on the quality of water fed to the distiller. If there is an accumulation of sediments, it must be cleaned to avoid damaging the resistors\n1. 4."
]
context = get_formatted_context(chunks)
query = "What does ASTM stand for?"
messages = get_chat_template_input(meta_prompt, context, query)

# Substitute "merged" with new transformer model trained with new version

In [7]:
if not os.path.exists(output_directory):
    if not os.listdir(output_directory):   
        try:
            credential = DefaultAzureCredential()
            credential.get_token("https://management.azure.com/.default")
        except Exception as ex:
            # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
            # This will open a browser page for
            credential = InteractiveBrowserCredential()
            
        ml_client = MLClient.from_config(
            credential=credential
        )
        
        os.makedirs(output_directory, exist_ok=True)
        
        jobs = ml_client.jobs.list()

        filtered_jobs = [job for job in jobs if job.experiment_name == experiment_name]
        print(filtered_jobs[-1])
        
        job_name = filtered_jobs[-1].name
        ml_client.jobs.download(name=job_name, output_name=output_name, download_path=output_directory)

In [35]:
model = AutoAWQForCausalLM.from_pretrained(
    finetuned_model_path,
    low_cpu_mem_usage=True,
    use_cache=False,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(finetuned_model_path)

Loading checkpoint shards: 100%|██████████| 4/4 [00:53<00:00, 13.36s/it]


In [16]:
inputs = tokenizer.apply_chat_template(
  messages,
  tokenize=True,
  add_generation_prompt=True,
  return_tensors="pt",
  return_dict=True,
).to("cuda")

outputs = model.generate(**inputs, do_sample=True, max_new_tokens=512)
print(tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0])

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
You are not running the flash-attention implementation, expect numerical differences.


To answer the question about what ASTM stands for, I will follow these steps:

1. Identify the acronym ASTM in the context provided.
2. Since the context does not provide a definition for ASTM, I will reference my general knowledge.
3. Upon research or knowledge, I find that ASTM stands for "American Society for Testing and Materials."

Now, I will compile the information accordingly.

##begin_quote## ASTM stands for American Society for Testing and Materials. ##end_quote##

<ANSWER>: American Society for Testing and Materials


In [36]:
quant_config = {
    "zero_point": True,
    "q_group_size": 128,
    "w_bit": 4,
    "version": "GEMM"
}

model.quantize(tokenizer, quant_config = quant_config)

model.save_quantized(finetuned_quantized_awq_model_path) # save_quantized
tokenizer.save_pretrained(finetuned_quantized_awq_model_path)

Repo card metadata block was not found. Setting CardData to empty.
AWQ: 100%|██████████| 32/32 [09:36<00:00, 18.00s/it]
Note that `shard_checkpoint` is deprecated and will be removed in v4.44. We recommend you using split_torch_state_dict_into_shards from huggingface_hub library


('./models/named-outputs/model_dir/merged-awq/tokenizer_config.json',
 './models/named-outputs/model_dir/merged-awq/special_tokens_map.json',
 './models/named-outputs/model_dir/merged-awq/tokenizer.json')

In [37]:
"""
cpu_memory_usage = sys.getsizeof(model)
print(f"CPU Memory occupied by the model: {cpu_memory_usage / (1024 ** 2)} MB")

model = model.to('cuda')

gpu_memory_usage = torch.cuda.memory_allocated()
print(f"GPU Memory occupied by the model: {gpu_memory_usage / (1024 ** 2)} MB")

gpu_memory_reserved = torch.cuda.memory_reserved()
print(f"GPU Memory reserved: {gpu_memory_reserved / (1024 ** 2)} MB")
"""

'\ncpu_memory_usage = sys.getsizeof(model)\nprint(f"CPU Memory occupied by the model: {cpu_memory_usage / (1024 ** 2)} MB")\n\nmodel = model.to(\'cuda\')\n\ngpu_memory_usage = torch.cuda.memory_allocated()\nprint(f"GPU Memory occupied by the model: {gpu_memory_usage / (1024 ** 2)} MB")\n\ngpu_memory_reserved = torch.cuda.memory_reserved()\nprint(f"GPU Memory reserved: {gpu_memory_reserved / (1024 ** 2)} MB")\n'

In [40]:
inputs = tokenizer.apply_chat_template(
  messages,
  tokenize=True,
  add_generation_prompt=True,
  return_tensors="pt",
  return_dict=True,
).to("cuda")

model.to("cuda")
outputs = model.generate(**inputs, do_sample=True, max_new_tokens=512)
print(tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0])

To find out what ASTM stands for, I will follow these steps:

1. Identify the acronym "ASTM" in the context provided.
2. Look for any relevant definitions or explanations of the acronym in the context.
3. Summarize the information to provide a clear answer.

Now, examining the context, I see that it mentions "ASTM" but does not provide a definition. Therefore, I will rely on my general knowledge to answer the question.

ASTM stands for:
- AST: Association for Small Tank Manufacturers
- M: Meter (Inducing Measurement)
- T: Time (Dependent Measurement)
- E: Electrical (Direct or Alternative Currents)
- R: Radio waves
- L: Light (Visible Spectrum)

So, ASTM is an acronym for various technical organizations associated with standards and measurements.

<ANSWER>: ASTM stands for Association for Small Tank Manufacturers.


In [43]:
loaded_pt_tokenizer = AutoTokenizer.from_pretrained(
    finetuned_quantized_awq_model_path,
    use_fast=True,
    trust_remote_code=False
)

loaded_pt_model = AutoModelForCausalLM.from_pretrained(
    finetuned_quantized_awq_model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=False
).eval()

loaded_pt_model = loaded_pt_model.to("cuda")

In [44]:
inputs = loaded_pt_tokenizer.apply_chat_template(
  messages,
  tokenize=True,
  add_generation_prompt=True,
  return_tensors="pt",
  return_dict=True,
).to("cuda")

outputs = loaded_pt_model.generate(**inputs, do_sample=True, max_new_tokens=512)
print(loaded_pt_tokenizer.batch_decode(outputs[:, inputs['input_ids'].shape[1]:], skip_special_tokens=True)[0])

To answer the question "What does ASTM stand for?", we can follow these steps:

1. Identify the acronym ASTM in the context provided.
2. Look for the definition or meaning associated with ASTM.
3. Since the context only states "ASTM," we need to understand its meaning based on common knowledge.

ASTM stands for "American Society for Testing and Materials." This is a common organization known for setting standards and conducting tests on materials across various industries.

Final answer:
<ANSWER>: ASTM stands for American Society for Testing and Materials.


# TRY TO RUN IT ON CPU

In [21]:
import os

def get_folder_size(folder_path):
    total_size = 0
    # Walk through all directories and files in the folder
    for dirpath, dirnames, filenames in os.walk(folder_path):
        for f in filenames:
            # Get the path of each file
            file_path = os.path.join(dirpath, f)
            # Add the size of each file to the total size
            total_size += os.path.getsize(file_path)
    return total_size

#finetuned_model_path
#finetuned_quantized_awq_model_path
#finetuned_quantized_onnx_model_path
#finetuned_quantized_awq_to_onnx_model_path

# Calculate the size in bytes
folder_size = get_folder_size(finetuned_quantized_awq_model_path)

# Convert the size to more readable formats (e.g., MB, GB)
size_in_mb = folder_size / (1024 * 1024)
size_in_gb = folder_size / (1024 * 1024 * 1024)

print(f"Folder {finetuned_model_path} size: {size_in_gb:.2f} GB")

Folder ./models/named-outputs/model_dir/merged size: 2.12 GB


In [None]:
# NOW JUST CONVERT TO ONNX AND CHECK

In [1]:
#!pip install optimum[exporters]
#!pip install onnxruntime-genai==0.4.0

In [34]:
print(finetuned_quantized_awq_model_path)
print(finetuned_quantized_onnx_model_path)
print(finetuned_quantized_awq_to_onnx_model_path)

./models/named-outputs/model_dir/merged-awq
./models/named-outputs/model_dir/merged-onnx
./models/named-outputs/model_dir/merged-awq-onnx


In [10]:
!optimum-cli export onnx --task text-generation --framework pt --trust-remote-code --model ./models/named-outputs/model_dir/merged/ ./models/named-outputs/model_dir/merged-onnx-optimum/

  _torch_pytree._register_pytree_node(
2024-10-08 11:45:15.871162: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 11:45:15.895124: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 11:45:15.902834: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 11:45:15.921275: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytre

In [None]:
# Needs GPU to run

In [None]:
!optimum-cli export onnx --task text-generation --framework pt --trust-remote-code --model ./models/named-outputs/model_dir/merged-awq/ ./models/named-outputs/model_dir/merged-awq-onnx-optimum/

In [None]:
# NOW QUANTIZE WITH ONNX

In [None]:
!python -m onnxruntime_genai.models.builder -i "./models/named-outputs/model_dir/merged-awq" -o "./models/named-outputs/model_dir/merged-onnx" -p int4 -e cpu

  _torch_pytree._register_pytree_node(
Valid precision + execution provider combinations are: FP32 CPU, FP32 CUDA, FP16 CUDA, FP16 DML, INT4 CPU, INT4 CUDA, INT4 DML
Extra options: {}
GroupQueryAttention (GQA) is used in this model.
Unpacking and repacking layer 0
Unpacking and repacking layer 1
Unpacking and repacking layer 2
Unpacking and repacking layer 3
Unpacking and repacking layer 4
Unpacking and repacking layer 5
Unpacking and repacking layer 6
Unpacking and repacking layer 7
Unpacking and repacking layer 8
Unpacking and repacking layer 9
Unpacking and repacking layer 10
Unpacking and repacking layer 11
Unpacking and repacking layer 12
Unpacking and repacking layer 13
Unpacking and repacking layer 14
Unpacking and repacking layer 15
Unpacking and repacking layer 16
Unpacking and repacking layer 17
Unpacking and repacking layer 18
Unpacking and repacking layer 19
Unpacking and repacking layer 20
Unpacking and repacking layer 21
Unpacking and repacking layer 22
Unpacking and repa

In [13]:
# NOW BENCHMARK:
# https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/python/tools/transformers/models/llama/benchmark_e2e.py