# Simple Quantization (using CLI)

In [None]:
optimum-cli inc quantize --model facebook/opt-1.3b --output opt1.3b-quantized

# Flexible Quantization (using Code)

In [None]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Hub id of the checkpoint to quantize (the name suggests an OPT-1.3B model
# fine-tuned on SQuAD). Tokenizer and model are loaded from the same id and
# share one local download cache.
model_name = "aman-mehra/opt-1.3b-finetune-squad-ep-0.4-lr-2e-05-wd-0.01"
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir="./opt-1.3b")
model = AutoModelForQuestionAnswering.from_pretrained(model_name, cache_dir="./opt-1.3b")

In [None]:
import evaluate
from datasets import load_dataset

# Evaluator used to score a question-answering model or pipeline.
task_evaluator = evaluate.evaluator("question-answering")

# SQuAD validation split, cached locally under ./squad-ds.
eval_dataset = load_dataset("squad", split="validation", cache_dir="./squad-ds")
eval_dataset = eval_dataset.select(range(64))  # Use a subset (first 64 examples) of the dataset

In [None]:
from transformers import pipeline

# Question-answering pipeline around the loaded model and tokenizer; eval_fn
# swaps its `.model` attribute to score other (e.g. quantized) models.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

def eval_fn(model):
    """Return the SQuAD F1 score obtained with ``model``.

    The module-level ``qa_pipeline`` is re-pointed at ``model`` (the pipeline
    object is mutated in place) and then scored by ``task_evaluator`` on
    ``eval_dataset`` using the "squad" metric.

    Args:
        model: The model to plug into the question-answering pipeline.

    Returns:
        The value stored under the "f1" key of the evaluator's result.
    """
    qa_pipeline.model = model
    squad_scores = task_evaluator.compute(
        model_or_pipeline=qa_pipeline,
        data=eval_dataset,
        metric="squad",
    )
    return squad_scores["f1"]

In [None]:
from neural_compressor.config import AccuracyCriterion, PostTrainingQuantConfig, TuningCriterion

# Set the accepted accuracy loss to 1%
accuracy_criterion = AccuracyCriterion(tolerable_loss=0.01)

# Set the maximum number of trials to 10
tuning_criterion = TuningCriterion(max_trials=10)

# Post-training dynamic quantization, bounded by the two criteria above.
quantization_config = PostTrainingQuantConfig(
    approach="dynamic",
    accuracy_criterion=accuracy_criterion,
    tuning_criterion=tuning_criterion,
)

In [None]:
from optimum.intel import INCQuantizer

# The quantizer receives eval_fn so it can score candidate models during tuning.
quantizer = INCQuantizer.from_pretrained(model, eval_fn=eval_fn)

# Quantize according to quantization_config and save the result to disk.
# NOTE(review): this is the same directory the CLI example writes to, and the
# Inference section loads it as a causal LM - confirm which artifact is intended.
quantizer.quantize(quantization_config=quantization_config, save_directory="opt1.3b-quantized")

# Inference

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the base facebook/opt-1.3b checkpoint; used below to encode the
# prompt and to decode the generated token ids.
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")

In [None]:
from optimum.intel import INCModelForCausalLM

# Load the quantized model from the local output directory of the quantization
# steps above.
# NOTE(review): both the CLI example and the code example save to this
# directory, and the code example quantizes a question-answering model -
# confirm the directory holds the causal-LM artifact before loading it here.
model = INCModelForCausalLM.from_pretrained("./opt1.3b-quantized")

In [None]:
# Sampling-based generation settings; min_length == max_length pins the output
# to exactly 512 tokens.
generation_kwargs = {
    "return_dict_in_generate": True,
    "output_scores": True,
    "min_length": 512,
    "max_length": 512,
    "num_beams": 1,
    "do_sample": True,
    "repetition_penalty": 1.5,
}

# Encode the prompt as PyTorch tensors ("<PROMPT>" is a placeholder to replace).
inputs = tokenizer("<PROMPT>", return_tensors="pt")

generation_output = model.generate(**inputs, **generation_kwargs)

In [None]:
# Turn the first generated sequence of token ids back into text and show it.
decoded_text = tokenizer.decode(generation_output.sequences[0])
print(decoded_text)