In [1]:
# import lib
import json
import copy
import logging
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence
import gc

import io
import torch
import transformers
from torch.utils.data import Dataset
from transformers import LlamaModel, LlamaConfig
from transformers.models.llama.modeling_llama import LlamaRMSNorm, LlamaRotaryEmbedding, LlamaDecoderLayer
from transformers.activations import ACT2FN
from transformers import AutoTokenizer

from transformers import Trainer, AdamW, get_linear_schedule_with_warmup
from transformers.trainer_pt_utils import get_parameter_names
from transformers.utils import is_sagemaker_mp_enabled
from transformers.trainer_utils import ShardedDDPOption
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers import GenerationConfig


In [2]:
# Load custom LlamaModel
from modeling_llama import LlamaForCausalLM

In [4]:
# Prerequisite: install git-lfs, then fetch the checkpoint, e.g.
# !git clone https://huggingface.co/LearnItAnyway/llama-7b-hf-28q_4bit-128g_WVU
# Alternative checkpoints: LearnItAnyway/llama-13b-hf-35q_4bit-128g_WVU
# or LearnItAnyway/llama-30b-hf-53q_4bit-128g_WVU
model_name = "./llama-7b-hf-28q_4bit-128g_WVU"

In [5]:
# Build the model from the checkpoint at `model_name`, then the matching
# tokenizer (use_fast=False requests the non-fast tokenizer implementation).
model = LlamaForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
)

started
ended


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

If the model config defines `quant_bits`, `quant_groupsize`, and `quant_layers`, the Llama model is loaded with its first `quant_layers` layers quantized.

In [6]:
# Show the configuration of the loaded model; the quant_* entries in the
# printed config describe the quantization settings.
print(model.config)
# Optional: print only the quantization-related settings.
#print('Num quantized layer :', model.config.quant_layers)
#print('Num quantized layer bits and groupsize:', model.config.quant_bits, ',', model.config.quant_groupsize)

LlamaConfig {
  "_name_or_path": "./llama-7b-hf-28q_4bit-128g_WVU",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "bos_token_id": 0,
  "eos_token_id": 1,
  "hidden_act": "silu",
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "intermediate_size": 11008,
  "max_position_embeddings": 2048,
  "max_sequence_length": 2048,
  "model_type": "llama",
  "num_attention_heads": 32,
  "num_hidden_layers": 32,
  "pad_token_id": -1,
  "quant_bits": 4,
  "quant_groupsize": 128,
  "quant_layers": 28,
  "rms_norm_eps": 1e-06,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.29.2",
  "use_cache": true,
  "vocab_size": 32000
}



In [7]:
# Cast the weights to fp16 and move the model to the GPU. `device` is reused
# later to place the tokenized inputs on the same device as the model.
# The assignment stays last so the cell does not echo the model repr.
model.half().cuda()
device = 'cuda'

In [8]:
def generate_prompt(instruction, input=None):
    """Wrap an instruction in the prompt template used for generation.

    The result is the instruction, a newline, the ``#### Response:`` marker
    and a trailing newline. ``input`` is accepted but not used.
    """
    template = "{}\n#### Response:\n"
    return template.format(instruction)

def evaluate(
        batch_data,
        input=None,
        temperature=1,
        top_p=0.9,
        top_k=40,
        num_beams=1,
        max_new_tokens=2048,
        **kwargs,
):
    """Generate a completion for one instruction and return the decoded text.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch_data: Instruction text; it is wrapped by ``generate_prompt``.
        input: Forwarded to ``generate_prompt`` (which currently ignores it).
        temperature: Stored in the ``GenerationConfig``.
        top_p: Stored in the ``GenerationConfig``.
        top_k: Stored in the ``GenerationConfig``.
        num_beams: Stored in the ``GenerationConfig``.
        max_new_tokens: Upper bound on the number of newly generated tokens.
        **kwargs: Extra fields passed through to ``GenerationConfig``.

    Returns:
        The decoded sequences as returned by ``tokenizer.batch_decode``
        (prompt followed by the completion, special tokens kept).
    """
    prompts = generate_prompt(batch_data, input)
    # The prompt is truncated to at most 512 tokens before generation.
    inputs = tokenizer(prompts, return_tensors="pt", max_length=512, truncation=True, padding=True)
    input_ids = inputs["input_ids"].to(device)
    # Pass the tokenizer's mask explicitly so padded positions (if any) are
    # ignored instead of relying on generate() to infer the mask.
    attention_mask = inputs["attention_mask"].to(device)
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        num_beams=num_beams,
        **kwargs,
    )
    with torch.no_grad():
        # output_scores is intentionally not requested: only the generated
        # sequences are used, and keeping per-step scores costs memory.
        generation_output = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            generation_config=generation_config,
            return_dict_in_generate=True,
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            bos_token_id=tokenizer.bos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )
    sequences = generation_output.sequences
    return tokenizer.batch_decode(sequences, skip_special_tokens=False)


In [9]:
# Sample instruction used for the timing runs below. The surrounding newlines
# are part of the string and therefore part of the prompt.
data = """
Can you provide a list of healthy habits to maintain a healthy lifestyle? Please format your response as an HTML page with bullet points. <html> <body> <h3>Healthy Habits:</h3> <ul> <li>Eating a balanced diet with plenty of fruits and vegetables.</li> <li>Engaging in regular physical activity, such as walking, running, or cycling.</li> <li>Getting enough sleep each night, ideally 7-8 hours.</li> <li>Staying hydrated by drinking plenty of water throughout the day.</li> <li>Limiting alcohol consumption and avoiding smoking.</li> <li>Managing stress through relaxation techniques like meditation or yoga.</li> <li>Regularly visiting a healthcare provider for check-ups and preventative care.</li> </ul> </body> </html>
"""

In [10]:
import time

In [11]:
# Time one end-to-end generation. perf_counter() is a monotonic,
# high-resolution clock, so the measured duration is not affected by
# wall-clock adjustments the way time.time() differences can be.
s_time = time.perf_counter()
output = evaluate(data)
e_time = time.perf_counter()

# Elapsed seconds, then the first decoded sequence.
print(e_time-s_time, '\n' ,output[0])

12.409578800201416 
 <s> 
Can you provide a list of healthy habits to maintain a healthy lifestyle? Please format your response as an HTML page with bullet points. <html> <body> <h3>Healthy Habits:</h3> <ul> <li>Eating a balanced diet with plenty of fruits and vegetables.</li> <li>Engaging in regular physical activity, such as walking, running, or cycling.</li> <li>Getting enough sleep each night, ideally 7-8 hours.</li> <li>Staying hydrated by drinking plenty of water throughout the day.</li> <li>Limiting alcohol consumption and avoiding smoking.</li> <li>Managing stress through relaxation techniques like meditation or yoga.</li> <li>Regularly visiting a healthcare provider for check-ups and preventative care.</li> </ul> </body> </html>

#### Response:
Here's an example HTML page with bullet points:
```
Healthy Habits:
1. Eating a balanced diet with plenty of fruits and vegetables.
2. Engaging in regular physical activity, such as walking, running, or cycling.
3. Getting enough sleep 

### After the first generation, subsequent calls are faster

In [12]:
# Repeat the same generation and time it again. perf_counter() is a
# monotonic, high-resolution clock, so the measured duration is not affected
# by wall-clock adjustments the way time.time() differences can be.
s_time = time.perf_counter()
output = evaluate(data)
e_time = time.perf_counter()

# Elapsed seconds, then the first decoded sequence.
print(e_time-s_time, '\n' ,output[0])

5.025457143783569 
 <s> 
Can you provide a list of healthy habits to maintain a healthy lifestyle? Please format your response as an HTML page with bullet points. <html> <body> <h3>Healthy Habits:</h3> <ul> <li>Eating a balanced diet with plenty of fruits and vegetables.</li> <li>Engaging in regular physical activity, such as walking, running, or cycling.</li> <li>Getting enough sleep each night, ideally 7-8 hours.</li> <li>Staying hydrated by drinking plenty of water throughout the day.</li> <li>Limiting alcohol consumption and avoiding smoking.</li> <li>Managing stress through relaxation techniques like meditation or yoga.</li> <li>Regularly visiting a healthcare provider for check-ups and preventative care.</li> </ul> </body> </html>

#### Response:
Here's an example HTML page with bullet points:
```
Healthy Habits:
1. Eating a balanced diet with plenty of fruits and vegetables.
2. Engaging in regular physical activity, such as walking, running, or cycling.
3. Getting enough sleep e

In [13]:
if False:
    # Disabled by default: change the guard to run the WizardLM test-set
    # export, which writes one JSON line per instruction.
    import tqdm, json

    input_data_path = './WizardLM/data/WizardLM_testset.jsonl'
    output_data_path = '7b_output_WizardLM_testset.jsonl'

    # Context managers close both files even if generation raises midway.
    with open(input_data_path, mode='r', encoding='utf-8') as input_data, \
            open(output_data_path, mode='w', encoding='utf-8') as output_data:
        # Iterate the file lazily instead of loading every line up front.
        for line in tqdm.tqdm(input_data):
            if not line.strip():
                continue  # tolerate blank lines in the JSONL file
            one_data = json.loads(line)
            idx = one_data["idx"]
            instruction = one_data["Instruction"]
            _output = evaluate(instruction, max_new_tokens=256)
            # "### Response:" also matches the "#### Response:" marker that
            # generate_prompt appends, so the text after it is the answer.
            parts = _output[0].split("### Response:")
            if len(parts) < 2:
                # Best effort: skip this record, but say so instead of
                # silently swallowing every possible error.
                tqdm.tqdm.write(f'Skipping idx={idx}: response marker not found in output')
                continue
            final_output = parts[1].strip()
            new_data = {
                "id": idx,
                "instruction": instruction,
                "output": final_output
            }
            output_data.write(json.dumps(new_data) + '\n')
    print('End.')