In [1]:
import formatting
import json
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
from datasets import Dataset
import copy
from transformers import TrainerCallback
from contextlib import nullcontext
from transformers import default_data_collator, Trainer, TrainingArguments
from peft import PeftModel

# The below code is so I can edit the formatting.py file and reload it without having to restart the kernel
import importlib
importlib.reload(formatting)

<module 'formatting' from '/home/Documents/Coding/python/ai/llama-experiments/chatbot-v2/formatting.py'>

# Load Model

In [2]:
# The path to the hugging face model. See the README to get this model.
hugging_face_model_dir = "../../models/llama/7B-hf"
# The path to the trained model. This is generated from the hugging face model train.ipynb
# This file does not include all weights, but simply a small subset of weights that were changed during training.
tuned_model_dir = "./trained-models/llama-7B-v2.1-stanford"

In [3]:
# Load and setup the tokenizer
tokenizer:LlamaTokenizer = LlamaTokenizer.from_pretrained(hugging_face_model_dir)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.model_max_length = 256
# Load the base model
model:LlamaForCausalLM = LlamaForCausalLM.from_pretrained(hugging_face_model_dir, load_in_8bit=True, device_map='auto', torch_dtype=torch.float16)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Ontop of the base model, load the modified weights from fine-tuning
model = PeftModel.from_pretrained(model, tuned_model_dir)

In [5]:
# Set the model to evaluation mode
model.eval()

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): Linear8bitLt(
                in_features=4096, out_features=4096, bias=False
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
            

# Interactive Conversation

In [6]:
import time

In [8]:
# conv1 is a list of turns in the conversation. Each turn is a tuple of (speaker, text)
conv1 = []
# Set the speakers names
speakers = ["Josh", "James"]
# Create the prompt for the conversation. This will be somthing like "Below is a conversation:"
prompt = formatting.get_chat_prompt(speakers)

# Print out the conversation up to this point
print(prompt)
for t in conv1:
    print(t)

# Generate 10 more turns. The AI will generate text for both speakers in the conversation, following the initial turn.
for i in range(100):
    if len(conv1)==0 or conv1[-1][0] == speakers[1]:
        next_input = input("Input: ")
        conv1.append((speakers[0], next_input))
        print(conv1[-1])
    else:
        # Tokenize the conversation up to this point
        model_input = formatting.tokenize_with_turn_trucation(tokenizer, prompt, conv1, next_turn=speakers[1], for_inference=True)
        # Generate the next turn. This ouputs not just the ai generated text, but also all the text that was inputted (with nex new txt on the end)
        with torch.no_grad():
            generated = model.generate(**model_input, max_new_tokens=50, num_beams=1, do_sample=True, temperature=1.2)[0]
            model_output = tokenizer.decode(generated)
        # Parse the model output to get the speaker and what they said
        turns = model_output.split("</s>")
        if turns[-1] == "":
            turns = turns[:-1]
        new_turn = turns[-1].strip()
        new_turn_parts = new_turn.split(": ", 1)
        # Add the new turn to the conversation
        conv1.append((new_turn_parts[0], new_turn_parts[1]))
        # Print out the conversation up to this point
        print(conv1[-1])
        time.sleep(0.5)

Here is a dialogue between James and Josh:
('Josh', 'Hello James. Who are you?')
('James', "I'm Mommy. I made a special dinner for you.")


KeyboardInterrupt: Interrupted by user