In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hub identifier shared by the base checkpoint and its tokenizer, so both are
# always loaded from the same source.
MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# The two loads are independent of each other; the base weights are wrapped
# with the fine-tuned adapter in the next cell.
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [3]:
# Wrap the base checkpoint with the adapter weights saved in the local
# fine-tuning output directory.
model = PeftModel.from_pretrained(
    model=base_model,
    model_id="./finetuned_improved",
)

# Put the module in evaluation mode for inference. Left as the cell's last
# expression, so the notebook displays the returned module's repr.
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 2048)
        (layers): ModuleList(
          (0-21): 22 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.Linear(

In [4]:
def test_model(instruction, input_text, max_new_tokens=50, temperature=0.1,
               top_p=0.9, do_sample=True):
    """Generate the fine-tuned model's response for one instruction/input pair.

    Relies on the notebook-level globals ``tokenizer`` and ``model`` created
    in the earlier cells.

    Args:
        instruction: Text placed under the ``### Instruction:`` header.
        input_text: Text placed under the ``### Input:`` header.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; only passed when ``do_sample``.
        top_p: Nucleus-sampling threshold; only passed when ``do_sample``.
        do_sample: Sample (True) or decode greedily (False).

    Returns:
        The generated completion as a string, cut at the first ``###`` or
        ``<|endoftext|>`` marker and stripped of surrounding whitespace.
    """
    prompt = f"""### Instruction:
{instruction}
### Input:
{input_text}
### Response:
"""

    inputs = tokenizer(prompt, return_tensors="pt")
    # Keep the prompt tensors on the same device as the model weights so the
    # function also works when the model has been moved off the CPU.
    inputs = inputs.to(next(model.parameters()).device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if do_sample:
        # Sampling-only knobs; omitted for greedy decoding.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    outputs = model.generate(**inputs, **generation_kwargs)

    # The returned sequence starts with the prompt tokens. Decode only what
    # was generated after them instead of searching the decoded text for
    # "### Response:", which picks the wrong segment whenever the model (or
    # the input text) repeats that header.
    prompt_length = inputs["input_ids"].shape[1]
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    )

    # Cut at the first follow-up section header the model may start writing,
    # and at a literal end-of-text marker if one survived decoding.
    response = completion.split("###")[0]
    response = response.split("<|endoftext|>")[0]
    return response.strip()

In [5]:
# Smoke test 1: a purchase sentence with a dollar-sign price.
print("Test 1:")
print(
    test_model(
        instruction="Extract info as JSON",
        input_text="Emma bought 4 chairs for $80",
    )
)

Test 1:
{"name": "Emma", "items": 4, "price": 80}


In [8]:
# Smoke test 2: a longer sentence with the price spelled out in words.
print("Test 2:")
print(
    test_model(
        instruction="Extract info as JSON",
        input_text="Jeel brought 5 perfumes from the shop for 100 dollars.",
    )
)

Test 2:
{"name": "Jeel", "items": 5, "price": 100}


In [9]:
# Smoke test 3: a terse sentence with the currency symbol after the amount.
print("Test 3:")
print(
    test_model(
        instruction="Extract info as JSON",
        input_text="Jay has 5 watches each 200$",
    )
)

Test 3:
{"name": "Jay", "items": 5, "price": 200}
