In [1]:
import os
import time

import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

def load_model(model_name, device, num_gpus, load_8bit=False):
    """Load a causal language model together with its (slow) tokenizer.

    Args:
        model_name: Name or path handed to both ``from_pretrained`` calls.
        device: Either ``"cuda"`` or ``"cpu"``.
        num_gpus: GPU count (int or int-like string) or the string ``"auto"``.
            Only consulted when ``device`` is ``"cuda"``.
        load_8bit: When true on ``"cuda"``, request 8-bit weights with an
            automatic device map.

    Returns:
        A ``(model, tokenizer)`` tuple.

    Raises:
        ValueError: If ``device`` is neither ``"cuda"`` nor ``"cpu"``.
    """
    if device not in ("cuda", "cpu"):
        raise ValueError(f"Invalid device: {device}")

    # Extra keyword arguments for AutoModelForCausalLM.from_pretrained.
    # The CPU path contributes none.
    kwargs = {}
    if device == "cuda":
        kwargs["torch_dtype"] = torch.float16
        if load_8bit:
            if num_gpus != "auto" and int(num_gpus) != 1:
                print("8-bit weights are not supported on multiple GPUs. Revert to use one GPU.")
            kwargs["load_in_8bit"] = True
            kwargs["device_map"] = "auto"
        elif num_gpus == "auto":
            kwargs["device_map"] = "auto"
        else:
            num_gpus = int(num_gpus)
            if num_gpus != 1:
                # Spread the model over the requested GPUs with a fixed
                # per-GPU memory budget.
                kwargs["device_map"] = "auto"
                kwargs["max_memory"] = {gpu_idx: "19GiB" for gpu_idx in range(num_gpus)}

    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        output_hidden_states=True,
        low_cpu_mem_usage=True,
        **kwargs,
    )

    # Calling model.cuda() messes up the weights when they were loaded in
    # 8-bit, so only the plain single-GPU case moves the model explicitly.
    move_to_gpu = device == "cuda" and num_gpus == 1 and not load_8bit
    if move_to_gpu:
        model.cuda()

    return model, tokenizer

@torch.inference_mode()
def get_embeddings(model, tokenizer, prompt):
    """Return the mean input-embedding vector of ``prompt``.

    The prompt is tokenized and its token ids are looked up in the model's
    input embedding layer. Only that layer is called here. The resulting
    token vectors are then averaged over the token axis.

    Args:
        model: Object exposing ``get_input_embeddings()``.
        tokenizer: Callable whose result has an ``input_ids`` attribute
            holding the token ids of ``prompt``.
        prompt: Text to embed.

    Returns:
        Tensor with the mean of the token vectors of ``prompt``, located on
        the device the embedding lookup ran on.
    """
    input_ids = tokenizer(prompt).input_ids
    embedding_layer = model.get_input_embeddings()

    # Put the ids on the device that holds the embedding weights instead of
    # unconditionally calling .cuda(), so a model loaded on the CPU works too.
    first_param = next(embedding_layer.parameters(), None)
    if first_param is None or first_param.device.type == "meta":
        # NOTE(review): offloaded weights may report the "meta" device.
        # Keep the previous behaviour (CUDA inputs) in that case.
        target_device = torch.device("cuda")
    else:
        target_device = first_param.device

    # Shape (1, n_tokens): a batch containing the single prompt.
    token_ids = torch.tensor([input_ids], dtype=torch.long, device=target_device)
    token_vectors = embedding_layer(token_ids)

    # inference_mode already disables autograd tracking, so no detach is needed.
    return token_vectors[0].mean(dim=0)

def get_save_npy(model, tokenizer, src_csv, dst_fld, mode='name'):
    """Embed every value of one CSV column and save each vector as a .npy file.

    The CSV at ``src_csv`` is read with ``;`` as separator. For row position
    ``i`` the embedding of column ``mode`` is written to
    ``<dst_fld>/<i>_<mode>.npy``. Progress and timing are printed.

    Args:
        model: Model forwarded to ``get_embeddings``.
        tokenizer: Tokenizer forwarded to ``get_embeddings``.
        src_csv: Path of the semicolon-separated source CSV.
        dst_fld: Output folder; created if it does not exist.
        mode: Name of the CSV column to embed.

    Raises:
        KeyError: If the CSV has no column named ``mode``.
    """
    print(mode, 'by vicuna')
    df = pd.read_csv(src_csv, sep=';')
    print('corpus len =', len(df))
    corpus = df[mode]

    # np.save does not create missing directories.
    os.makedirs(dst_fld, exist_ok=True)

    print('encode corpus')
    start_time = time.time()
    # Iterate by position: label lookup (corpus[i]) only matches the row
    # position when the frame carries a default RangeIndex.
    for i, text in enumerate(corpus):
        print(i)
        one_emb = get_embeddings(model, tokenizer, text)
        np.save(os.path.join(dst_fld, f'{i}_{mode}.npy'), one_emb.cpu())

    # Measure once so both reported figures refer to the same interval.
    elapsed = time.time() - start_time
    print('total time_spent = {}'.format(elapsed))
    # An empty corpus has no per-item average; skip it rather than divide by zero.
    if len(corpus) > 0:
        print('one emb time_spent = {}'.format(elapsed / len(corpus)))

In [2]:
# Checkpoints tried earlier, kept for reference:
# model_name = "/mnt/vdb1/vicuna_8bit"
# model_name = "/mnt/vdb1/vicuna-13b-delta-v1.1"
model_name = "/mnt/vdb1/ggml_vicuna_13b_8bit"
device = "cuda"
num_gpus = 1
load_8bit = False

# Load once; later cells reuse the notebook-global `model` and `tokenizer`.
model, tokenizer = load_model(
    model_name=model_name,
    device=device,
    num_gpus=num_gpus,
    load_8bit=load_8bit,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [3]:
from fastchat.model import get_conversation_template
#from FastChat.fastchat.model import get_conversation_template

def prompt(model, tokenizer, text, temperature=0.7, max_new_tokens=20):
    """Ask the model a single question and return its decoded reply.

    ``text`` becomes the first-role message of a fresh conversation template.
    The rendered conversation is tokenized and passed to ``model.generate``
    with sampling enabled, and only the tokens after the prompt are decoded.

    Args:
        model: Model exposing ``generate``.
        tokenizer: Tokenizer used to encode the prompt and decode the reply.
        text: Question to put into the conversation.
        temperature: Sampling temperature forwarded to ``generate``.
        max_new_tokens: Upper bound on generated tokens forwarded to ``generate``.

    Returns:
        The decoded reply with special tokens skipped and surrounding
        whitespace stripped.
    """
    # "0" is the model id this helper has always passed.
    # NOTE(review): presumably this resolves to fastchat's default template;
    # confirm against the installed fastchat version.
    conv = get_conversation_template("0")
    conv.append_message(conv.roles[0], text)
    conv.append_message(conv.roles[1], None)
    # Not named `prompt`, to avoid shadowing this function inside its own body.
    prompt_text = conv.get_prompt()
    input_ids = tokenizer([prompt_text]).input_ids

    # Send the ids to wherever the model weights live rather than always to
    # CUDA, so a CPU-loaded model works as well.
    first_param = next(model.parameters(), None)
    if first_param is None or first_param.device.type == "meta":
        # NOTE(review): offloaded weights may report the "meta" device.
        # Keep the previous behaviour (CUDA inputs) in that case.
        target_device = torch.device("cuda")
    else:
        target_device = first_param.device

    output_ids = model.generate(
        torch.as_tensor(input_ids, device=target_device),
        do_sample=True,
        temperature=temperature,
        max_new_tokens=max_new_tokens,
    )

    # Drop the first len(prompt ids) positions so only the tokens that follow
    # the prompt are decoded.
    reply_ids = output_ids[0][len(input_ids[0]):]
    return tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

In [14]:
# ques_json = {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
name = "Планшет Samsung Galaxy Tab A7 Lite LTE 32GB Gray (SM-T225)"
# Earlier free-text prompt drafts, kept for reference:
#text = f"Imagine that you are Named Entity Recognition neural network. Parse the information from the following tablet description '{name}' and based on that complete gaps from next statements: \nBrand name:___\nDevice:___\nColor:___. \nExample of answer:\nBrand name: iPhone\nDevice: iPad Pro Max\nColor: Space Gray"
#text = f"Imagine that you can only parse named entities from text, give the confidence score of your output as float within [0,1] and nothing else. Parse the information from the following tablet description '{name}' and based on that complete gaps from next statements: \nBrand name:___\nDevice:___\nConfidence:___.\nYour answer should have exactly the template I provided you and no more other words. Please double check your answer before you give me that."

#conv = get_conversation_template("")
# Import only the name that is needed; a star import floods the notebook
# namespace and hides where names come from.
from fastchat.conversation import get_conv_template
# NOTE(review): "planshet_big" is presumably a locally registered template;
# confirm it exists in the installed fastchat.conversation.
tmpl="planshet_big"
conv = get_conv_template(tmpl)
conv.append_message(conv.roles[0], name)
conv.append_message(conv.roles[1], None)
# Stored as `prompt_text`: assigning to `prompt` here would overwrite the
# `prompt()` helper defined in an earlier cell.
prompt_text = conv.get_prompt()
input_ids = tokenizer([prompt_text]).input_ids
output_ids = model.generate(
    torch.as_tensor(input_ids).cuda(),
    do_sample=True,
    temperature=0.01,
    max_new_tokens=30#1024,
)
#Galaxy Tab A7 Lite 32GB
# Keep only the ids after the prompt; later cells read `output_ids1` and `outputs`.
output_ids1 = output_ids[0][len(input_ids[0]) :]
outputs = tokenizer.decode(output_ids1, skip_special_tokens=True).strip()
#print(f'question:\n{text}\n\n')
print(f'answer:\n{outputs}\n\n')

answer:
Brand:Samsung
Device:Galaxy Tab A7 Lite LTE 32Gb
END_OF_ANSWER




In [None]:
# Display the conversation template that `get_conv_template` returns for
# 'planshet_big'. Needs the fastchat.conversation import from an earlier cell.
get_conv_template('planshet_big')

In [None]:
# Display the decoded answer currently stored in `outputs`.
outputs

In [None]:
text = "What is the confidence score of your last output?"
# NOTE(review): the conversation below is created fresh from the template and
# the only message appended is this question, so the earlier answer is not
# part of the prompt. Confirm that this is intended.
# `template_name` rather than `name`, which an earlier cell uses for the
# product title.
template_name = "vicuna_v1.1"
conv = get_conversation_template(template_name)
conv.append_message(conv.roles[0], text)
conv.append_message(conv.roles[1], None)
# Stored as `prompt_text`: assigning to `prompt` here would overwrite the
# `prompt()` helper defined in an earlier cell.
prompt_text = conv.get_prompt()
input_ids = tokenizer([prompt_text]).input_ids
output_ids = model.generate(
    torch.as_tensor(input_ids).cuda(),
    do_sample=True,
    temperature=0.9,
    max_new_tokens=20#1024,
)

# Keep only the ids after the prompt; later cells read `output_ids1` and `outputs`.
output_ids1 = output_ids[0][len(input_ids[0]) :]
outputs = tokenizer.decode(output_ids1, skip_special_tokens=True).strip()
print(f'question:\n{text}\n\n')
print(f'answer:\n{outputs}\n\n')

In [None]:
# Display the current `conv` conversation object.
conv

In [None]:
#dir(model.generate)

In [None]:
# Display what `get_conversation_template` returns for an empty model name.
(get_conversation_template(''))

In [None]:
# Display `conv` again (same expression as an earlier cell).
conv

In [None]:
# Skip the first len(input_ids[0]) positions of the first sequence in
# `output_ids` (presumably leaving only the newly generated ids) and display
# the remainder.
x = output_ids[0][len(input_ids[0]) :]
x

In [None]:
# Decode the whole first sequence in `output_ids` (no prompt slicing),
# skipping special tokens and stripping surrounding whitespace.
tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

In [None]:
# Same expression as the previous cell: decode the whole first sequence in
# `output_ids`, skipping special tokens and stripping surrounding whitespace.
tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()

In [None]:
# Display the raw ids of the first sequence in `output_ids`.
output_ids[0]

In [None]:
# Recompute the answer from the current `output_ids` / `input_ids`:
# skip the first len(input_ids[0]) positions, decode the rest, and print it.
# This overwrites the notebook-global `output_ids1` and `outputs`.
output_ids1 = output_ids[0][len(input_ids[0]) :]
outputs = tokenizer.decode(output_ids1, skip_special_tokens=True).strip()
print(outputs)

In [None]:
# Display the decoded answer currently stored in `outputs`.
outputs

In [None]:
# List the attribute names available on `model`.
dir(model)

In [None]:
# dst_fld = '/mnt/vdb1/embeds_ggml_vicuna_13b_8bit'
# src_csv = '/mnt/vdb1/14_categories_balanced.csv'

# get_save_npy(model, tokenizer, src_csv, dst_fld, 'name')
# get_save_npy(model, tokenizer, src_csv, dst_fld, 'external_brand')
# get_save_npy(model, tokenizer, src_csv, dst_fld, 'external_type')
# get_save_npy(model, tokenizer, src_csv, dst_fld, 'attrs')
# get_save_npy(model, tokenizer, src_csv, dst_fld, 'description')