# Finetune LLaMA2 and MPT on Intel Xeon CPU

## 1. Prerequisites

In [None]:
!git clone https://github.com/intel/intel-extension-for-transformers.git

!pip install -r requirements.txt

### 1.1 Set Up Environment

In [None]:
!pip install intel-extension-for-transformers torch datasets

### 1.2 Prepare Dataset

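Download the Alpaca dataset from [here](https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json), or provide your own training data as a JSON file, and set `alpaca_data_path` in the next cell to the file's location.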

In [1]:
# Base checkpoint to fine-tune: OpenLLaMA 3B.
# To fine-tune Llama 2 instead, set this to "meta-llama/Llama-2-7b-hf".
llama2_model_name_or_path = "openlm-research/open_llama_3b"

# Path to the JSON file used as training input.
alpaca_data_path = "bot_data_1_4_chapter.json"

## 2. Finetune LLaMA2 on Intel Xeon CPU with LoRA

### 2.1 Setup Finetuning Config

In [None]:
from transformers import TrainingArguments
from intel_extension_for_transformers.neural_chat.config import (
    ModelArguments,
    DataArguments,
    FinetuningArguments,
    TextGenerationFinetuningConfig,
)

# --- Model: the base checkpoint selected in the dataset/model cell ---
model_args = ModelArguments(
    use_fast_tokenizer=False,
    model_name_or_path=llama2_model_name_or_path,
)

# --- Data: the training file selected in the dataset/model cell ---
data_args = DataArguments(
    dataset_concatenation=True,
    train_file=alpaca_data_path,
)

# --- Trainer settings ---
training_args = TrainingArguments(
    # Where results are written; the inference cell reads from this directory.
    output_dir="./llama_peft_finetuned_model",
    overwrite_output_dir=True,
    save_strategy="no",
    save_total_limit=2,
    # Phases to run.
    do_train=True,
    do_eval=True,
    # Optimisation schedule.
    # NOTE(review): 0.01 looks high for LoRA fine-tuning — confirm it is intentional.
    learning_rate=0.01,
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    # Numerics and logging.
    bf16=True,
    log_level="info",
)

# --- LoRA / evaluation switches ---
finetune_args = FinetuningArguments(
    do_lm_eval=True,
    lora_all_linear=True,
)

# Bundle the four argument groups into the single config object that is
# passed to the fine-tuning entry point.
finetune_cfg = TextGenerationFinetuningConfig(
    finetune_args=finetune_args,
    training_args=training_args,
    data_args=data_args,
    model_args=model_args,
)

### 2.2 Finetuning

In [None]:
from intel_extension_for_transformers.neural_chat.chatbot import finetune_model
# Launch fine-tuning with the configuration assembled in `finetune_cfg`.
finetune_model(finetune_cfg)

### Note: To use Llama-2-7b-hf, you need to request access from the official website and then generate a Hugging Face access token with write permission.

In [7]:
# Opens the Hugging Face login widget; paste your access token into the widget
# when prompted (do not hard-code the token in this cell).
# NOTE(review): presumably this must run before any cell that downloads a gated model — confirm cell order.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## 3. Inference with the Model

In [None]:
from intel_extension_for_transformers.neural_chat.models.model_utils import load_model, predict_stream
from transformers import set_seed
# Seed the random number generators before generating text.
set_seed(27)

# The LoRA adapter has to be applied on top of the same base checkpoint it was
# trained from, so reuse the checkpoint chosen for fine-tuning rather than a
# hard-coded (and possibly different) model name.
base_model_path = llama2_model_name_or_path
# Output directory used by the fine-tuning configuration.
peft_model_path = "./llama_peft_finetuned_model"
# This notebook targets Intel Xeon CPUs, so inference runs on the CPU as well.
# Defined once so that model loading and generation always agree.
device = "cpu"

load_model(
    model_name=base_model_path,
    tokenizer_name=base_model_path,
    peft_path=peft_model_path,
    device=device,
)

# Prompt layout: system instructions, then the user query, then the slot the
# model is expected to complete. `{}` is filled with the query below.
template = """
### System:
- You are a helpful assistant chatbot trained by Intel.
- You answer questions.
- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.</s>
### User:
{}</s>
### Assistant:
"""

query = "who founded cnvrg.io?"

# Generation settings forwarded to predict_stream as keyword arguments.
params = {
    "prompt": template.format(query),
    "device": device,
    "model_name": base_model_path,
    "use_cache": True,
    "repetition_penalty": 1.0,
    "temperature": 0.1,
    "top_k": 10,
    "top_p": 0.75,
    "num_beams": 0,
    "max_new_tokens": 128,
}

# Print each generated chunk as soon as it arrives, without extra newlines.
for new_text in predict_stream(**params):
    print(new_text, end="", flush=True)
