In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Chat-style prompt: a list of {"role", "content"} dicts holding a single user turn.
messages = [
    {"role": "user", "content": "Who are you?"},
]
# Build a text-generation pipeline for the given Hub model id.
# NOTE(review): the recorded cell output shows four safetensors shards
# (about 16 GB in total) being fetched, so the first run of this cell is slow.
pipe = pipeline("text-generation", model="TheDrummer/Llama-3SOME-8B-v2")
# Bare last expression: the pipeline's return value becomes the cell output.
pipe(messages)

config.json:   0%|          | 0.00/722 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json:   0%|          | 0.00/22.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
import json
import os

def reformat_conversation_file(input_filepath, output_filepath, assistant_speaker="Herta"):
    """
    Convert speaker/text conversation logs into role/content chat messages.

    The input JSON is read as a dictionary that maps a conversation key
    (e.g. a URL or conversation identifier) to a list of message dicts.
    Each message is rewritten as ``{"role": ..., "content": ...}``:

      - speaker equals ``assistant_speaker`` (case-insensitive, surrounding
        whitespace ignored)            -> role "assistant"
      - any other speaker              -> role "user"
      - no "speaker" key, or a null speaker (e.g. a description message)
                                        -> role "system"

    "content" is the message's "text" value; a missing or null text becomes
    an empty string so that every output message carries string content.

    Args:
        input_filepath: Path of the JSON file to read.
        output_filepath: Path the reformatted JSON is written to
            (pretty-printed with a 4-space indent, UTF-8 encoded).
        assistant_speaker: Speaker name that is mapped to the "assistant"
            role. Defaults to "Herta", the previously hard-coded value.

    Raises:
        FileNotFoundError: If ``input_filepath`` does not exist.
    """
    if not os.path.exists(input_filepath):
        raise FileNotFoundError(f"Input file not found: {input_filepath}")

    with open(input_filepath, 'r', encoding='utf-8') as infile:
        data = json.load(infile)

    # Normalise the target name once instead of on every comparison.
    assistant_name = str(assistant_speaker).strip().lower()

    reformatted = {}
    for key, messages in data.items():
        new_messages = []
        for message in messages:
            speaker = message.get("speaker")
            if speaker is None:
                # Absent or null speaker: treated as narration/description.
                role = "system"
            elif str(speaker).strip().lower() == assistant_name:
                # str() keeps non-string speaker values from raising on .strip().
                role = "assistant"
            else:
                role = "user"

            text = message.get("text")
            new_messages.append({
                "role": role,
                "content": "" if text is None else text
            })
        reformatted[key] = new_messages

    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        json.dump(reformatted, outfile, indent=4)
    print(f"Reformatted conversation data saved to: {output_filepath}")


if __name__ == "__main__":
    # Raw conversation dump to read, and where the role-tagged copy is written.
    input_file, output_file = "all_conversations.json", "formatted_conversation.json"
    reformat_conversation_file(input_filepath=input_file, output_filepath=output_file)


Reformatted conversation data saved to: formatted_conversation.json


In [3]:
# Prints a fixed string; nothing else in the notebook reads from this cell.
print("hello world")

hello world


In [8]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import Dataset

def load_and_preprocess_data(json_file):
    """
    Read a conversation JSON file and flatten every conversation to one string.

    The file is read as a dictionary whose values are lists of message dicts.
    Each message contributes one line of the form ``<Role>: <content>``, where
    the role is the message's "role" value (default "system") passed through
    ``str.capitalize`` and the content is its "content" value (default "").
    For example:

        System: <content>
        User: <content>
        Assistant: <content>

    Lines are joined with newlines and the result is stripped of leading and
    trailing whitespace.

    Returns:
        A list with one string per conversation, in the file's key order.
    """
    with open(json_file, "r", encoding="utf-8") as handle:
        conversations = json.load(handle)

    def render(turns):
        # One "<Role>: <content>" line per turn.
        lines = [
            f"{turn.get('role', 'system').capitalize()}: {turn.get('content', '')}"
            for turn in turns
        ]
        return "\n".join(lines).strip()

    # The conversation ids (dictionary keys) are not needed for the output.
    return [render(turns) for turns in conversations.values()]

def tokenize_function(example, tokenizer, max_length=1024):
    """
    Run ``tokenizer`` over ``example["text"]`` with truncation enabled.

    Args:
        example: Mapping with a "text" entry to tokenize.
        tokenizer: Callable accepting the text plus ``truncation`` and
            ``max_length`` keyword arguments.
        max_length: Token limit forwarded to the tokenizer (default 1024).

    Returns:
        Whatever the tokenizer call returns.
    """
    text = example["text"]
    encoded = tokenizer(text, truncation=True, max_length=max_length)
    return encoded

def main(json_file="formatted_conversation.json",
         model_id="Sao10K/L3-8B-Lunaris-v1",
         output_dir="./lunaris_finetuned"):
    """
    Fine-tune a causal language model on the role-tagged conversation file.

    Steps: load tokenizer and model, flatten each conversation to text via
    ``load_and_preprocess_data``, tokenize, train with the Hugging Face
    ``Trainer``, and save the result to ``output_dir``.

    Args:
        json_file: JSON file whose messages carry "role"/"content" fields.
            The default is the file written by the reformatting cell; the raw
            "all_conversations.json" uses "speaker"/"text" keys, which would
            make every turn render as an empty "System:" line.
        model_id: Hub identifier of the base model (Lunaris-v1 model merge
            based on Llama-3).
        output_dir: Directory for checkpoints and the final saved model.

    Raises:
        ValueError: If ``json_file`` contains no conversations.
    """
    # Function-scope import so the cell's import line does not need to change.
    from transformers import DataCollatorForLanguageModeling

    # Load tokenizer and model.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    if tokenizer.pad_token is None:
        # Batches of unequal length must be padded; tokenizers without a pad
        # token (common for Llama-3 checkpoints) would otherwise raise.
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_id)

    # Load and preprocess the conversation data.
    conversation_texts = load_and_preprocess_data(json_file)
    if not conversation_texts:
        raise ValueError(f"No conversations found in: {json_file}")

    # Each training example is stored under the key "text".
    dataset = Dataset.from_dict({"text": conversation_texts})

    # Tokenize in batches and drop the raw text column afterwards.
    tokenized_dataset = dataset.map(
        lambda examples: tokenize_function(examples, tokenizer),
        batched=True,
        remove_columns=["text"]
    )

    # Pads every batch and copies input_ids into `labels` (pad positions are
    # masked out), which is what gives the Trainer a loss to optimise.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    # Evaluation stays disabled: that is the default, and omitting the keyword
    # avoids the `evaluation_strategy` -> `eval_strategy` rename in newer
    # transformers releases.
    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=8,   # effective batch of 16 per device
        learning_rate=2e-5,
        weight_decay=0.01,
        fp16=True,               # Enable mixed precision if available
        logging_steps=10,
        save_steps=100,
        save_total_limit=2
    )

    # NOTE(review): newer transformers releases name the `tokenizer` argument
    # `processing_class`; switch when upgrading.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    # Start training.
    trainer.train()

    # Save the fine-tuned model.
    trainer.save_model(output_dir)
    print(f"Training complete. Model saved to '{output_dir}'.")

# Start the fine-tuning run when this code executes as the top-level module.
# That condition also holds inside a notebook kernel, so running this cell
# launches the model download and training immediately.
if __name__ == "__main__":
    main()


Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   6%|5         | 273M/4.95G [00:00<?, ?B/s]

KeyboardInterrupt: 