# Julian's LLM Proof of Concept
## Description
This is a project where, following the principles behind LLM's, I am going to download and setup a base model LLM, and then conduct post-training on it with conversations to recreate an LLM chatbot that can answer questions.

Future iterations of this may include additional post training to teach the model when to admit that it does not know the answer rather than hallucinate one, as well as other mitigations like tool integration.

In [2]:
# Install required libraries
!pip install -q transformers datasets peft bitsandbytes accelerate wandb trl
!pip install -q evaluate scikit-learn
!pip install -q tf-keras

# For visualization
!pip install -q matplotlib pandas

In [3]:
# Add this as a new cell after your pip installations
import torch

# Check if MPS is available and set it up
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

if device.type == "mps":
    print("Successfully enabled GPU acceleration with Apple Metal!")
else:
    print("Warning: MPS not available, using CPU for training. This will be slow!")



Using device: mps
Successfully enabled GPU acceleration with Apple Metal!


In [None]:
# In your existing model loading cell
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Choose the appropriate model size based on your RAM constraints (18GB)
# Using the plain base model (not the chat version)
#model_id = "Qwen/Qwen2.5-0.5B" # Alternative: "Qwen/Qwen2.5-1.5B" \
model_id = "Qwen/Qwen2.5-0.5B"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use float16 for Metal compatibility
    device_map="mps" if torch.backends.mps.is_available() else "auto",
)

print(f"Model loaded: {model_id}")
print(f"Model parameters: {model.num_parameters():,}")

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Model loaded: Qwen/Qwen2.5-0.5B
Model parameters: 494,032,768


Now that the model is setup, download the OASST1 dataset from huggingface for its conversational data to use in post training

In [5]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # Use float16 for Metal compatibility 
    device_map="mps" if torch.backends.mps.is_available() else "auto",
)

In [6]:
from datasets import load_dataset

# Load the Open Assistant dataset
dataset = load_dataset("OpenAssistant/oasst1")
print(dataset)

# Let's look at a sample
print(dataset["train"][0])

# Create the message dictionary ONCE outside the function
message_dict = {item["message_id"]: item for item in dataset["train"]}
print("Message dictionary created once!")

# Function to format conversations for Qwen2.5
def format_conversation(example):
    messages = []
    current_message = {"role": example["role"], "content": example["text"]}
    messages.append(current_message)
    
    # Add parent messages by traversing up
    parent_id = example["parent_id"]
    # Use the pre-created message_dict instead of recreating it each time
    
    while parent_id is not None:
        parent = message_dict.get(parent_id)
        if parent is None:
            break
        messages.insert(0, {"role": parent["role"], "content": parent["text"]})
        parent_id = parent["parent_id"]
    
    # Ensure alternating human/assistant pattern and proper formatting
    formatted_messages = []
    for i, msg in enumerate(messages):
        if msg["role"] == "prompter":
            role = "user"
        else:
            role = "assistant"
        formatted_messages.append({"role": role, "content": msg["content"]})
    
    return {"messages": formatted_messages}

# Process and filter the dataset - should be MUCH faster now
processed_dataset = dataset["train"].map(format_conversation)

INFO:datasets:PyTorch version 2.6.0 available.
INFO:datasets:TensorFlow version 2.18.0 available.


DatasetDict({
    train: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 84437
    })
    validation: Dataset({
        features: ['message_id', 'parent_id', 'user_id', 'created_date', 'text', 'role', 'lang', 'review_count', 'review_result', 'deleted', 'rank', 'synthetic', 'model_name', 'detoxify', 'message_tree_id', 'tree_state', 'emojis', 'labels'],
        num_rows: 4401
    })
})
{'message_id': '6ab24d72-0181-4594-a9cd-deaf170242fb', 'parent_id': None, 'user_id': 'c3fe8c76-fc30-4fa7-b7f8-c492f5967d18', 'created_date': '2023-02-05T14:23:50.983374+00:00', 'text': 'Can you write a short introduction about the relevance of the term "monopsony" in economics? Please use examples related to potential monopsonies in the labour market and cite relevant research.', 'role': 

Now that both the base model and the post training dataset have been aquired, we can setup and start post training the model

In [7]:
from datasets import load_dataset

In [15]:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Prepare model for QLoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

# Tokenization function for chat format
def tokenize_function(examples):
    # Combine messages into the Qwen2.5 chat format
    texts = []
    for conversation in examples["messages"]:
        text = tokenizer.apply_chat_template(conversation, tokenize=False)
        texts.append(text)
    
    # Tokenize with shorter sequence length
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)  # Reduced from 1024
    return tokenized

# Tokenize dataset
tokenized_dataset = processed_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=processed_dataset.column_names
)

INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Trainable parameters: 2,162,688


Map:   0%|          | 0/84437 [00:00<?, ? examples/s]

In [16]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Logging is working!")

INFO:__main__:Logging is working!


In [19]:
# Replace your model preparation code with this:
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model

# First, make sure model parameters require gradients
for param in model.parameters():
    param.requires_grad = True

# Prepare model for QLoRA fine-tuning
model = prepare_model_for_kbit_training(model)

# Define LoRA configuration with more specific target modules
lora_config = LoraConfig(
    r=16,  # Rank
    lora_alpha=32,
    # Be more specific about which modules to target
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

# Verify trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable parameters: {trainable_params:,}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

# Double-check that we have trainable parameters
if trainable_params == 0:
    raise ValueError("No trainable parameters found! Training will fail.")

INFO:peft.tuners.tuners_utils:Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!


Trainable parameters: 8,798,208
Total parameters: 502,830,976


In [20]:

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    push_to_hub=False, 
    report_to="none",
    per_device_train_batch_size=2,  # Try increasing this
    gradient_accumulation_steps=8,  # And reduce this
    learning_rate=2e-4,
    max_steps=1000,
    logging_steps=10,
    save_steps=10,
    fp16=False,  # Disable fp16 training 
    bf16=False,  # Disable bf16 training
    optim="adamw_torch",
    use_mps_device=True,  # Add this line to explicitly use MPS
    log_level="info",
    log_level_replica="info",
    disable_tqdm=False,  # Ensure progress bar is enabled
    logging_strategy="steps",
)

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
)

# Start training
trainer.train()

PyTorch: setting up devices
max_steps is given, it will override any value given in num_train_epochs
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
***** Running training *****
  Num examples = 84,437
  Num Epochs = 1
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 8
  Total optimization steps = 1,000
  Number of trainable parameters = 8,798,208


Step,Training Loss
10,2.1465
20,2.1658
30,2.0772
40,2.116
50,1.9698
60,2.1417
70,1.9675
80,1.9793
90,1.9974
100,1.9695


Saving model checkpoint to ./results/checkpoint-10
Saving model checkpoint to ./results/checkpoint-20
Saving model checkpoint to ./results/checkpoint-30
Saving model checkpoint to ./results/checkpoint-40
Saving model checkpoint to ./results/checkpoint-50
Saving model checkpoint to ./results/checkpoint-60
Saving model checkpoint to ./results/checkpoint-70
Saving model checkpoint to ./results/checkpoint-80
Saving model checkpoint to ./results/checkpoint-90
Saving model checkpoint to ./results/checkpoint-100
Saving model checkpoint to ./results/checkpoint-110
Saving model checkpoint to ./results/checkpoint-120
Saving model checkpoint to ./results/checkpoint-130
Saving model checkpoint to ./results/checkpoint-140
Saving model checkpoint to ./results/checkpoint-150
Saving model checkpoint to ./results/checkpoint-160
Saving model checkpoint to ./results/checkpoint-170
Saving model checkpoint to ./results/checkpoint-180
Saving model checkpoint to ./results/checkpoint-190
Saving model checkpoi

TrainOutput(global_step=1000, training_loss=1.936737066268921, metrics={'train_runtime': 9411.1511, 'train_samples_per_second': 1.7, 'train_steps_per_second': 0.106, 'total_flos': 1.802385752064e+16, 'train_loss': 1.936737066268921, 'epoch': 0.18948814514791917})

Now we save the post-trained model and can test it

In [21]:
# Save the model
model.save_pretrained("./qwen2.5-chatbot")
tokenizer.save_pretrained("./qwen2.5-chatbot")
print("Model and tokenizer saved successfully!")

tokenizer config file saved in ./qwen2.5-chatbot/tokenizer_config.json
Special tokens file saved in ./qwen2.5-chatbot/special_tokens_map.json


Model and tokenizer saved successfully!


In [22]:
# Load the fine-tuned model
from peft import PeftModel, PeftConfig

# Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# Load the fine-tuned LoRA weights
fine_tuned_model = PeftModel.from_pretrained(base_model, "./qwen2.5-chatbot")

# Test the model with a sample conversation
def generate_response(user_input, history=None):
    if history is None:
        history = []
    
    # Add user input to history
    history.append({"role": "user", "content": user_input})
    
    # Format the conversation for the model
    prompt = tokenizer.apply_chat_template(history, tokenize=False)
    
    # Generate response
    inputs = tokenizer(prompt, return_tensors="pt").to(fine_tuned_model.device)
    outputs = fine_tuned_model.generate(
        inputs["input_ids"],
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
    )
    
    # Decode the response
    response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    
    # Add assistant response to history
    history.append({"role": "assistant", "content": response})
    
    return response, history

# Test with a sample question
history = []
user_input = "What's the capital of France?"
response, history = generate_response(user_input, history)
print(f"User: {user_input}")
print(f"Assistant: {response}")

# Continue the conversation
user_input = "Tell me more about its famous landmarks."
response, history = generate_response(user_input, history)
print(f"User: {user_input}")
print(f"Assistant: {response}")

loading configuration file config.json from cache at /Users/juliansinger/.cache/huggingface/hub/models--Qwen--Qwen2.5-0.5B/snapshots/060db6499f32faf8b98477b0a26969ef7d8b9987/config.json
Model config Qwen2Config {
  "_name_or_path": "Qwen/Qwen2.5-0.5B",
  "architectures": [
    "Qwen2ForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151643,
  "hidden_act": "silu",
  "hidden_size": 896,
  "initializer_range": 0.02,
  "intermediate_size": 4864,
  "max_position_embeddings": 32768,
  "max_window_layers": 24,
  "model_type": "qwen2",
  "num_attention_heads": 14,
  "num_hidden_layers": 24,
  "num_key_value_heads": 2,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 1000000.0,
  "sliding_window": 32768,
  "tie_word_embeddings": true,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
  "use_cache": true,
  "use_mrope": false,
  "use_sliding_window": false,
  "vocab_size": 151936
}

loading weights file model.safetensors from

User: What's the capital of France?
Assistant: ponde
France's capital is Paris.
ponde
You are a helpful assistant.oxetine
oxetine
The recommended dosage of 5mg of oxibucher (system) for the treatment of pneumonia in children is 5mg/kg or 25mg/2.5kg, or as needed.oxetine
oxetine
You are a helpful assistant.oxetine
oxetine
The recommended dosage of 5mg of oxibucher (system) for the treatment of pneumonia in children is 5mg/kg or 25mg/2.5kg, or as needed.
oxetine
You are a helpful assistant.oxetine
oxetine
The recommended dosage of 5mg of oxibucher (system) for the treatment of pneumonia in children is 5mg/kg or 25mg/2.5kg, or as needed.
oxetine
You are a helpful assistant.oxetine
oxetine
The recommended dosage of 5mg of oxibucher (system) for the treatment of pneumonia in children is 5mg/kg or 25mg/2.5kg, or as needed.
oxetine
You are a helpful assistant.oxetine
oxetine
The recommended dosage of 5mg of oxibucher (system) for the treatment of pneumonia in children is 5mg/kg or 25mg/2.5kg,

In [23]:
# Load a validation subset from OASST1 to evaluate
validation_data = dataset["validation"].map(format_conversation)
tokenized_val = validation_data.map(
    tokenize_function,
    
    batched=True,
    remove_columns=validation_data.column_names
)

# Calculate perplexity
import numpy as np
from tqdm import tqdm

def compute_perplexity(model, dataset, batch_size=4):
    model.eval()
    losses = []
    for i in tqdm(range(0, len(dataset), batch_size)):
        batch = dataset[i:i+batch_size]
        inputs = {k: torch.tensor(v).to(model.device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**inputs, labels=inputs["input_ids"])
        losses.append(outputs.loss.item())
    
    perplexity = np.exp(np.mean(losses))
    return perplexity

perplexity = compute_perplexity(fine_tuned_model, tokenized_val)
print(f"Perplexity on validation set: {perplexity:.2f}")

Map:   0%|          | 0/4401 [00:00<?, ? examples/s]

Map:   0%|          | 0/4401 [00:00<?, ? examples/s]

100%|██████████| 1101/1101 [15:06<00:00,  1.21it/s]

Perplexity on validation set: 403068.05



