# Finetune Tiny Llama for NPC TT Experiment
Vibe coded with ChatGPT 

In [18]:
import random
import json
from tqdm import tqdm

import os
from dataclasses import dataclass, field
from typing import Dict, List, Any

import json
from datasets import Dataset, load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from peft import LoraConfig, PeftModel
from trl import SFTTrainer, SFTConfig
import torch

In [19]:
# Load model directly
# Hub id of the base checkpoint; finetune() loads it a second time when merging the LoRA adapter.
BASE_MODEL_ID = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"

# `tokenizer` and `model` are notebook-level globals: finetune() reads both without
# taking them as arguments, so this cell must run before the training cell.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    device_map="auto",
    torch_dtype="auto"
)

Loading weights: 100%|██████████| 201/201 [00:38<00:00,  5.25it/s, Materializing param=model.norm.weight]                              


In [20]:
# set padding token
# Reuse the end-of-sequence token for padding, and mirror that choice in the model config.
# NOTE(review): presumably the base tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

## Instructions to train the model
1. Create a dataset based on current interactions in the game, in JSON format, showing the current avatar's name, position, and game map location. Include a JSON list of player data for all players on the screen that includes their name, position, and text (or emote) if they're talking. For the output, use a JSON object action with the choices [text, emote, move]
2. Create sets for each role describing how they should respond to situations, augmented with artificial data made by combining Jae's JSON and ChatGPT output in the format specified
3. Finetune TinyLlama on the fake outputs
4. Connect to the server

ChatGPT prompt format:

```
Can you make a dataset of custom responses for an avatar NPC based on a particular role? I need 100 samples of interactions and responses between an NPC and another character. The possible input interactions will be either text or an emote from the emote.txt file. You can use the role_dialog.json file for examples of keyword outputs or generic lines too. For example, the input interaction should be formatted like such {"text": "Hi, I'd like some bread"} or {"emote":"004-big-smile"}. The output interaction should be formatted similarly like "{"text": "Sure, it's fresh baked!"}" or {"emote":"020-money-bag"} for the baker NPC role. Each interaction should be separated by a new line like such:

    INPUT: {"text": "Hi, I'd like some bread"}
    OUTPUT: {"emote":"020-money-bag"}

Can you make 100 samples for the {INSERT ROLE HERE} role using a mix of text and emote inputs and outputs? 
```

In [None]:
# Per-role prompt material, keyed by role name:
#   {'descr': <description text>, 'tasks': [<task>, ...]}
ROLE_INSTRUCTIONS = {}
with open("train-data/role_instructions.json", "r") as f:
    jdat = json.load(f)
    # Copy only the two fields the prompt builder reads; `tasks` is copied into a fresh list.
    ROLE_INSTRUCTIONS.update(
        (role, {'descr': instr['description'], 'tasks': list(instr['tasks'])})
        for role, instr in jdat.items()
    )

In [None]:
def FULL_INSTRUCTIONS(npc_role):
    """Build the system prompt for one NPC role.

    Defined with ``def`` rather than a lambda bound to a name, so the prompt
    builder has a docstring and a meaningful name in tracebacks. The prompt
    text itself is unchanged.

    Args:
        npc_role: Role key; must be a key of ``ROLE_INSTRUCTIONS``.

    Returns:
        The full instruction string, ending with a newline.

    Raises:
        KeyError: if ``npc_role`` is not present in ``ROLE_INSTRUCTIONS``.

    NOTE(review): the emote list below repeats 'numb' and 'star', and the
    teleport locations duplicate the ALL_LOCS constant; both are left as-is
    because the text is part of the training prompt.
    """
    role_tag = npc_role.upper()
    role_info = ROLE_INSTRUCTIONS[npc_role]
    return ("Pretend you are human player role-playing as an NPC character with a job in a medieval fantasy world. "
        f"Your job is a '{role_tag}' character. "
        f"These are the {role_tag} character instructions: "
        f"{role_info['descr']} "
        f"The {role_tag} role's tasks are: {', '.join(role_info['tasks'])}. "
        "You can either talk to another player or perform an action. "
        "You have the following actions available: [talk, move, emote, teleport]. "
        "If you want to talk, respond in normal text in the following form {'talk': '(your message)'}. For example, {'talk': 'Hello there!'}. "
        "You can only respond in one sentence with a maximum of 100 characters for the text. "
        "For emotes, you have the following icon choices available: [wave, dance, happy,big,laugh,intelligent,sleeping,bored,surprise,frightened,cry,angry,numb,sweat,tongue,numb,kissing,heart,star,star,like,close,help,daisy,gift,money,axe,chicken,tomato,mushroom,chemical,beer]. "
        "If you want to emote, respond in the following form {'emote': '(your emote choice)'}. For example, {'emote': 'wave'}. "
        "For movements, you can move to any (x,y) coordinate in the range of (0,0) to (800,400). "
        "If you want to move, respond in the following form {'move': '(x,y)'}. For example, {'move': '30,100'}. "
        "For teleportation, you can teleport to the following locations: ['plaza','library','blacksmith','training_ground','bakery','butcher','market','apothecary','tavern']. "
        "If you want to teleport, respond in the following form {'teleport': '(location)'}. For example, {'teleport': 'plaza'}. "
        "Only give your response in one of these four forms as a JSON format with the action chosen and the value. For example: {'talk': 'Hello there!'} or {'move': '30,100'}. "
        f"Try to stay in character as a '{role_tag}' and respond appropriately based on your role and the context of the interaction. "
        "Respond often with text or emotes, and only move or teleport when necessary. "
        "\n")


# Every map location an NPC can be placed in or teleport to.
ALL_LOCS = [
    'plaza',
    'library',
    'blacksmith',
    'training_ground',
    'bakery',
    'butcher',
    'market',
    'apothecary',
    'tavern',
]

# Home location of each role (used to bias where a role's avatar is placed).
ROLE_LOCS = dict(
    blacksmith='blacksmith',
    baker='bakery',
    bard='tavern',
    chuck='tavern',
    butcher='butcher',
    apothecary='apothecary',
    knight_trainer='training_ground',
    librarian='library',
    general_goods='market',
    drunk='tavern',
    gossip='tavern',
    mercenary='training_ground',
    barmaid='tavern',
    wizard='apothecary',
)

# All role names, in the same order as the keys of ROLE_LOCS.
ALL_ROLES = list(ROLE_LOCS)

In [31]:
# CREATE FULL TRAINING DATA
def self_avatar_data(role, loc):
    """Create the record describing the NPC's own avatar.

    The name is two random uppercase letters separated by a space, and the
    position is a random (x, y) tuple with x in [0, 800] and y in [0, 400].

    Args:
        role: Role name stored under "role".
        loc: Map location stored under "loc".

    Returns:
        dict with keys "name", "pos", "role", "loc".
    """
    # Draw order (letters, then x, then y) is kept so seeded runs are reproducible.
    initials = random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2)
    x = random.randint(0, 800)
    y = random.randint(0, 400)
    return {
        "name": ' '.join(initials),
        "pos": (x, y),
        "role": role,
        "loc": loc,
    }

def rand_avatar_data(text=None,emote=None):
    """Create the record for another (non-NPC) avatar on screen.

    The name is two random uppercase letters separated by a space, and the
    position is a random (x, y) tuple with x in [0, 800] and y in [0, 400].

    Args:
        text: What the avatar is saying; added under "text" only when truthy.
        emote: The avatar's emote; added under "emote" only when truthy.

    Returns:
        dict with "name" and "pos", plus "text" and/or "emote" when given.
    """
    # Draw order (letters, then x, then y) is kept so seeded runs are reproducible.
    initials = random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2)
    x = random.randint(0, 800)
    y = random.randint(0, 400)
    avatar = {"name": ' '.join(initials), "pos": (x, y)}

    # Falsy values (None, empty string) are omitted entirely.
    for key, value in (("text", text), ("emote", emote)):
        if value:
            avatar[key] = value
    return avatar


def read_inter_data(file_path):
    """Parse an interaction file into ``(input_json, output_json)`` string pairs.

    The file holds entries separated by blank lines, each of the form::

        INPUT: {...}
        OUTPUT: {...}

    Both halves are returned as raw (not yet decoded) JSON strings, in file
    order. A separator line that contains only whitespace is treated as a
    blank line, so stray trailing spaces do not glue two entries together.

    Args:
        file_path: Path of the interaction text file (read as UTF-8).

    Returns:
        list of ``(input_json, output_json)`` tuples.

    Raises:
        ValueError: if an entry does not contain exactly one ``OUTPUT:`` marker.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = f.read()

    # Group consecutive non-blank lines into entries.
    entries = []
    current = []
    for line in data.split("\n"):
        if line.strip():
            current.append(line)
        elif current:
            entries.append("\n".join(current))
            current = []
    if current:
        entries.append("\n".join(current))

    processed_data = []
    for entry in entries:
        parts = entry.split("OUTPUT:")
        if len(parts) != 2:
            # Same exception type as the bare tuple-unpack used to raise, but with context.
            raise ValueError(
                f"{file_path}: expected exactly one 'OUTPUT:' marker per entry, "
                f"found {len(parts) - 1} in {entry!r}"
            )
        input_part, output_part = parts
        input_json = input_part.replace("INPUT:", "").strip()
        output_json = output_part.strip()
        processed_data.append((input_json, output_json))

    return processed_data

In [None]:
# GENERATE DATASET FOR TRAINING

def _pick_location(role):
    """Sample the map location of a `role` NPC.

    Weights: 0.25 for the plaza, 0.4 for the role's home location
    (ROLE_LOCS[role]) and 0.1 for every other location.
    """
    weights = [
        0.25 if loc == "plaza" else 0.4 if loc == ROLE_LOCS[role] else 0.1
        for loc in ALL_LOCS
    ]
    return random.choices(ALL_LOCS, weights=weights)[0]


def _load_shuffled(path):
    """Read an interaction file and return its (input, output) pairs in random order."""
    pairs = read_inter_data(path)
    random.shuffle(pairs)
    return pairs


def generate_training_data():
    """Build the NPC training set and write it to train-data/npc_training_data.json.

    Two kinds of samples are produced, each shaped
    ``{"input": {"ME": <self avatar>, "OTHER": [<avatars>]}, "output": <reaction>}``:

    1. For every role in ALL_ROLES and every pair in that role's interaction
       file: the speaker from the pair plus 0-4 bystanders. Each bystander
       says a l33tspeak line (30%), shows an emote (30%) or is silent (40%).
       The output is the role's scripted reaction.
    2. For every l33tspeak/emote pair: three samples with a random role and
       silent bystanders, whose outputs are the scripted reaction, a random
       move and a random teleport respectively.

    Reads only the module-level tables and train-data/*.txt files; returns None.
    """
    dataset = []

    l33t_path = 'train-data/l33tspeak_inter.txt'
    emote_path = 'train-data/emote_inter.txt'
    l33t_pool = _load_shuffled(l33t_path)
    emote_pool = _load_shuffled(emote_path)

    with tqdm(total=len(ALL_ROLES), desc="Generating training data") as pbar:
        for role in ALL_ROLES:
            role_data = _load_shuffled(f'train-data/{role}_inter.txt')

            for line_no, (input_json, output_json) in enumerate(role_data, start=1):
                pbar.set_postfix({"role": role, "lines": f"{line_no}/{len(role_data)}"})

                self_avatar = self_avatar_data(role, _pick_location(role))

                # The avatar the NPC is actually reacting to.
                speaker = json.loads(input_json)
                reaction = json.loads(output_json)
                other_avatars = [
                    rand_avatar_data(text=speaker.get("text"), emote=speaker.get("emote"))
                ]

                # Bystanders: 1-5 avatars in total, one of which is the speaker.
                for _ in range(random.randint(1, 5) - 1):
                    # Refill a pool once it has been used up.
                    if not l33t_pool:
                        l33t_pool = _load_shuffled(l33t_path)
                    if not emote_pool:
                        emote_pool = _load_shuffled(emote_path)

                    roll = random.random()
                    if roll < 0.3:
                        bystander = json.loads(l33t_pool.pop()[0])
                    elif roll < 0.6:
                        bystander = json.loads(emote_pool.pop()[0])
                    else:
                        # Silent bystander. Reset explicitly: reusing the previous
                        # iteration's value would duplicate that avatar's text/emote.
                        bystander = {}

                    other_avatars.append(
                        rand_avatar_data(text=bystander.get("text"), emote=bystander.get("emote"))
                    )

                random.shuffle(other_avatars)

                dataset.append({
                    "input": {"ME": self_avatar, "OTHER": other_avatars},
                    "output": reaction
                })

            pbar.update(1)

    # Add every l33tspeak and emote interaction at the end so all of them are used.
    comb = read_inter_data(l33t_path) + read_inter_data(emote_path)
    random.shuffle(comb)

    for input_json, output_json in comb:
        speaker = json.loads(input_json)
        direct_reaction = json.loads(output_json)

        # variant 0: direct response, 1: move, 2: teleport
        for variant in range(3):
            role = random.choices(ALL_ROLES)[0]
            self_avatar = self_avatar_data(role, _pick_location(role))

            other_avatars = [
                rand_avatar_data(text=speaker.get("text"), emote=speaker.get("emote"))
            ]
            # Remaining avatars are silent.
            for _ in range(random.randint(1, 5) - 1):
                other_avatars.append(rand_avatar_data())
            random.shuffle(other_avatars)

            if variant == 1:
                reaction = {
                    "move": f"{random.randint(0,800)},{random.randint(0,400)}"
                }
            elif variant == 2:
                reaction = {
                    "teleport": random.choice(ALL_LOCS)
                }
            else:
                reaction = direct_reaction

            dataset.append({
                "input": {"ME": self_avatar, "OTHER": other_avatars},
                "output": reaction
            })

    with open('train-data/npc_training_data.json', 'w') as f:
        json.dump(dataset, f)

# Build train-data/npc_training_data.json from the role, l33tspeak and emote interaction files.
generate_training_data()

Generating training data: 100%|██████████| 14/14 [00:00<00:00, 47.05it/s, role=wizard, lines=82/82]       


In [None]:
# formatting the JSON list to have each element on a single line with indentation only at the list level
def format_list_shallow(json_path, out_path):
    """Rewrite a JSON list so that each element sits on its own line.

    Elements are minified (no spaces after separators) and indented by two
    spaces; the surrounding brackets get their own lines.

    Args:
        json_path: Path of the JSON file to read (expected to hold a list).
        out_path: Path of the file to write.
    """
    with open(json_path, "r") as src:
        items = json.load(src)

    # One minified element per row, indented at list level only.
    rows = [f"  {json.dumps(item, separators=(',', ':'))}" for item in items]
    # Every row except the last carries a trailing comma.
    rows[:-1] = [row + "," for row in rows[:-1]]

    with open(out_path, "w") as sink:
        sink.write("\n".join(["[", *rows, "]"]))


# Example usage: write a copy of the generated dataset with one sample per line.
format_list_shallow("train-data/npc_training_data.json", "train-data/npc_training_data_pretty.json")


In [34]:
def load_and_prepare_dataset(file_path, out_path='npc_dataset.jsonl'):
    """Convert generated samples into chat-style records and export them as JSONL.

    Each input sample ``{"input": {...}, "output": {...}}`` becomes
    ``{"messages": [system, user, assistant]}`` where the system message is
    the role prompt for ``input["ME"]["role"]``, the user message is the
    ``input`` object and the assistant message is the ``output`` object.

    Args:
        file_path: JSON file holding the list of generated samples.
        out_path: Where to write the JSONL export (one record per line).
            Defaults to 'npc_dataset.jsonl' in the working directory.
            NOTE(review): the training cell reads
            "train-data/npc_dataset.jsonl"; pass that path here if this
            function is meant to produce that file.

    Returns:
        list of the ``{"messages": [...]}`` records that were written.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    dataset = []
    for item in data:
        input_data = item['input']
        output_data = item['output']
        dataset.append({
            "messages":[
                {"role": "system", "content": FULL_INSTRUCTIONS(input_data["ME"]["role"])},
                {"role": "user", "content": input_data},
                {"role": "assistant", "content": output_data}
            ]
        })

    # export to .jsonl file
    with open(out_path, 'w', encoding='utf-8') as f:
        for entry in dataset:
            f.write(json.dumps(entry) + '\n')

    return dataset


def load_raw_dataset(path: str) -> Dataset:
    """Load a chat-style JSONL file into a Dataset with a 'messages' column.

    Blank lines are skipped. Within each record, a message whose content is a
    plain string is wrapped as ``{"text": <string>}``; any other content is
    passed through unchanged.

    Args:
        path: JSONL file with one ``{"messages": [...]}`` object per line.

    Returns:
        Dataset built from the normalized records.
    """
    examples = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            raw = raw.strip()
            if raw == "":
                continue
            record = json.loads(raw)

            examples.append({
                "messages": [
                    {
                        "role": message["role"],
                        # Wrap bare strings; leave structured content as-is.
                        "content": (
                            {"text": message["content"]}
                            if isinstance(message["content"], str)
                            else message["content"]
                        ),
                    }
                    for message in record["messages"]
                ]
            })

    return Dataset.from_list(examples)


In [35]:
def formatting_func(example):
    """
    Convert one example {"messages": [...]} into a single training string.

    Each message is rendered as "[ROLE]\\n<text>" and messages are joined by a
    blank line. The text is:
      * the string itself when content is a plain string (same rule as
        messages_to_text, instead of a JSON-quoted copy of the string),
      * content["text"] when content is a dict whose only key is "text",
      * the JSON serialization of content otherwise.
    """
    parts = []
    for msg in example["messages"]:
        role = msg["role"].upper()
        content = msg["content"]

        if isinstance(content, str):
            # Plain strings are emitted verbatim rather than as a quoted JSON string.
            text = content
        elif isinstance(content, dict) and set(content.keys()) == {"text"}:
            # If it has only "text", show just the text
            text = content["text"]
        else:
            # For structured ME/OTHER etc, dump as JSON string
            text = json.dumps(content, ensure_ascii=False)

        parts.append(f"[{role}]\n{text}")

    return "\n\n".join(parts)


## Alt Formatting for Finetuning
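Fallback that flattens every example into a single `text` string, because the `messages`-based path above failed (either SFTTrainer or the ChatGPT-generated code broke).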

In [36]:
def messages_to_text(messages):
    """
    Flatten one example's list of messages into a single training string.

    Every message becomes "[ROLE]\\n<text>" (role upper-cased) and the pieces
    are joined by a blank line.
    """
    def _render(content):
        # Plain string: use as-is.
        if isinstance(content, str):
            return content
        # {"text": ...} wrapper: unwrap it.
        if isinstance(content, dict) and set(content.keys()) == {"text"}:
            return content["text"]
        # Anything structured (ME/OTHER etc.) is serialized to JSON.
        return json.dumps(content, ensure_ascii=False)

    return "\n\n".join(
        f"[{msg['role'].upper()}]\n{_render(msg['content'])}" for msg in messages
    )


def build_text_dataset(jsonl_path: str) -> Dataset:
    """
    Turn a chat-style JSONL file into a Dataset with a single 'text' column.

    Each non-blank line is parsed, string contents are wrapped as
    {"text": ...}, and the messages are flattened with messages_to_text.
    As a side effect the rows are also written to 'npc_dataset_text.jsonl'
    in the working directory, one JSON object per line.
    """
    def _wrap(content):
        # Bare strings become {"text": ...}; structured content passes through.
        return {"text": content} if isinstance(content, str) else content

    rows = []
    with open(jsonl_path, "r", encoding="utf-8") as src:
        for raw_line in src:
            stripped = raw_line.strip()
            if stripped == "":
                continue
            record = json.loads(stripped)

            normalized = [
                {"role": m["role"], "content": _wrap(m["content"])}
                for m in record["messages"]
            ]
            rows.append({"text": messages_to_text(normalized)})

    with open('npc_dataset_text.jsonl', 'w') as sink:
        sink.writelines(json.dumps(row) + '\n' for row in rows)

    return Dataset.from_list(rows)

In [None]:
# Preview the flattened dataset (this also rewrites npc_dataset_text.jsonl in the working directory).
# NOTE(review): no cell in this notebook creates train-data/npc_dataset.jsonl —
# load_and_prepare_dataset() writes ./npc_dataset.jsonl and is never called. Confirm how this file is produced.
build_text_dataset("train-data/npc_dataset.jsonl")

Dataset({
    features: ['text'],
    num_rows: 2064
})

## Train the model with the data

In [None]:
# Output directories (all local folders)
LORA_OUTPUT_DIR = "./tinyllama-npc-lora"
MERGED_OUTPUT_DIR = "./tinyllama-npc-merged"

MAX_SEQ_LEN = 1024

DATA_FILE = "train-data/npc_dataset.jsonl"

def finetune():
    """Fine-tune the notebook-level `model` with LoRA and save the results.

    Steps: build the plain-text dataset from DATA_FILE, train with SFTTrainer
    using a LoRA adapter, save the adapter and tokenizer to LORA_OUTPUT_DIR,
    then reload BASE_MODEL_ID on CPU, merge the adapter into it and save the
    merged model and tokenizer to MERGED_OUTPUT_DIR.

    Relies on the globals `model`, `tokenizer`, BASE_MODEL_ID, DATA_FILE,
    MAX_SEQ_LEN, LORA_OUTPUT_DIR and MERGED_OUTPUT_DIR. Returns None.
    """
    # 1) Alternative dataset loader, disabled: raw dataset (still has 'messages')
    #train_dataset = load_raw_dataset(DATA_FILE)

    # 1) Dataset: plain text only, no messages→no chat_template
    train_dataset = build_text_dataset(DATA_FILE)


    # 2) LoRA configuration
    peft_config = LoraConfig(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
    )

    # 3) SFTConfig (this is where max_length, packing, lr, etc live now)
    sft_config = SFTConfig(
        output_dir=LORA_OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        logging_steps=50,
        save_steps=500,
        save_total_limit=2,
        report_to="none",
        dataset_text_field="text",

        # sequence / packing settings
        # NOTE(review): the recorded output of this cell warns that packing /
        # padding-free training is only known to work reliably with a flash-attention
        # implementation — confirm the attention backend or disable packing.
        max_length=MAX_SEQ_LEN,   # replaces old max_seq_length
        packing=True,            # pack examples into fixed-length sequences

        # mixed precision: bf16 only when CUDA is available and device 0 reports
        # a major compute capability of at least 8
        bf16=(
            torch.cuda.is_available()
            and torch.cuda.get_device_capability(0)[0] >= 8
        ),
        fp16=False,
    )

    # 4) SFTTrainer (new API for trl 0.25.1)
    trainer = SFTTrainer(
        model=model,
        args=sft_config,
        train_dataset=train_dataset,
        processing_class=tokenizer,  # replaces tokenizer=...
        peft_config=peft_config,
        # formatting_func=formatting_func,  # takes raw example -> string
    )


    # 5) Train
    trainer.train()

    # 6) Save LoRA adapter + tokenizer locally
    os.makedirs(LORA_OUTPUT_DIR, exist_ok=True)
    trainer.model.save_pretrained(LORA_OUTPUT_DIR)
    tokenizer.save_pretrained(LORA_OUTPUT_DIR)
    print(f"Saved LoRA adapter to {LORA_OUTPUT_DIR}")

    # 7) Merge LoRA weights into base model and save a standalone model
    print("Merging LoRA adapter into base model...")

    # Reload base model on CPU (or cuda if you want)
    # NOTE(review): the dtype is float16 whenever CUDA is available even though the
    # merge itself runs on CPU, while training loaded the model with torch_dtype="auto" —
    # confirm the merged checkpoint's precision is the intended one.
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        device_map="cpu",   # merge on CPU to avoid GPU OOM
    )

    lora_model = PeftModel.from_pretrained(base_model, LORA_OUTPUT_DIR)
    merged_model = lora_model.merge_and_unload()  # apply LoRA weights into base

    os.makedirs(MERGED_OUTPUT_DIR, exist_ok=True)
    merged_model.save_pretrained(MERGED_OUTPUT_DIR)
    tokenizer.save_pretrained(MERGED_OUTPUT_DIR)
    print(f"Saved merged full model to {MERGED_OUTPUT_DIR}")

# Run training and the merge; needs `model` and `tokenizer` from the loading cell above.
# NOTE(review): the recorded output ends in KeyboardInterrupt, so this run did not finish.
finetune()

Padding-free training is enabled, but the attention implementation is not set to a supported flash attention variant. Padding-free training flattens batches into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-flash-attn3. Using other implementations may lead to unexpected behavior. To ensure compatibility, set `attn_implementation` in the model configuration to one of these supported options or verify that your attention mechanism can handle flattened sequences.
You are using packing, but the attention implementation is not set to a supported flash attention variant. Packing gathers multiple samples into a single sequence, and only the following implementations are known to reliably support this: flash_attention_2, flash_attention_3, kernels-community/flash-attn, kernels-community/flash-attn3, kernels-community/vllm-fla

KeyboardInterrupt: 