In [None]:
import random
import datasets
from src import data_loading
from src.data_models.data_models import TrainingData

# Fetch the LMArena human-preference dataset and hand its "train" split, as a
# pandas DataFrame, to the project loader. The result exposes `.entries`.
dataset = datasets.load_dataset("lmarena-ai/arena-human-preference-140k")
train_split = dataset["train"]
data = data_loading.load_training_data_lmarena(train_split.to_pandas())

# Alternative source (disabled): the Chatbot Arena conversations dataset.
#dataset = datasets.load_dataset("lmsys/chatbot_arena_conversations")
#data = data_loading.load_training_data_chatbot_arena(dataset["train"].to_pandas())

# Optional (disabled): keep only a random sample of 10000 entries.
#data = TrainingData(random.sample(data.entries, 10000))

print(f"Successfully loaded {len(data.entries)} entries")

In [None]:
from src.models.embedding_specs.attention_embedding_spec import AttentionEmbeddingSpec
from src.models.gradient_boosting_model import GradientBoostingModel
from src.models.optimizers.adamw_spec import AdamWSpec
from src.utils.data_split import ValidationSplit


# Gradient-boosting model whose only input feature is "model_embedding".
# No embedding spec is passed here; instead `load_embedding_model_from` names
# another model (presumably a previously saved one -- confirm against
# GradientBoostingModel).
prompt_invariant_model = GradientBoostingModel(
    # --- inputs -----------------------------------------------------------
    input_features=["model_embedding"],
    load_embedding_model_from="gradient-boosting-prompt-categories-no-embeddings",
    # --- boosting hyper-parameters ------------------------------------------
    max_depth=4,
    learning_rate=1e-5,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    # --- sampling / reproducibility / logging -------------------------------
    balance_model_samples=True,
    min_model_comparisons=1000,
    seed=42,
    print_every=5,
    # --- disabled: train a fresh embedding model instead of loading one -----
    # embedding_model_epochs=250,
    # embedding_spec=AttentionEmbeddingSpec(
    #     encoder_model_name="all-MiniLM-L6-v2",
    #     optimizer=AdamWSpec(
    #         learning_rate=2e-5,
    #         lr_decay_gamma=0.995,
    #     ),
    #     h_emb=256,
    #     h_scalar=64,
    #     h_pair=256,
    #     d_out=128,
    #     pair_mlp_layers=8,
    #     num_attention_heads=8,
    #     dropout=0.1,
    #     temperature=0.07,
    #     pairs_per_model=128,
    #     models_per_batch=8,
    # ),
)

# Hold out 20% of the data for validation, with a fixed seed.
prompt_invariant_split = ValidationSplit(val_fraction=0.2, seed=42)
prompt_invariant_model.train(
    data,
    validation_split=prompt_invariant_split,
    epochs=180,
    batch_size=128,
)

# Persist the trained model under the name the next cell uses as its base model.
prompt_invariant_save_name = "gradient-boosting-model-embeddings-only"
prompt_invariant_model.save(prompt_invariant_save_name)

In [None]:
from src.models.embedding_specs.attention_embedding_spec import AttentionEmbeddingSpec
from src.models.gradient_boosting_model import GradientBoostingModel
from src.models.optimizers.adamw_spec import AdamWSpec
from src.utils.data_split import ValidationSplit


# Gradient-boosting model over all four input features (model embedding,
# prompt embedding, prompt categories, prompt features). It trains its own
# attention embedding model and builds on the embeddings-only model named by
# `base_model_name`.
gradient_boosting_model = GradientBoostingModel(
    max_depth=4,
    learning_rate=1e-5,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    input_features=["model_embedding", "prompt_embedding", "prompt_categories", "prompt_features"],
    balance_model_samples=True,
    min_model_comparisons=1000,
    seed=42,
    print_every=1,

    # Embedding model configuration.
    embedding_model_epochs=250,
    embedding_spec=AttentionEmbeddingSpec(
        encoder_model_name="all-MiniLM-L6-v2",
        optimizer=AdamWSpec(
            learning_rate=2e-5,
            lr_decay_gamma=0.995,
        ),
        h_emb=256,
        h_scalar=64,
        h_pair=256,
        d_out=128,
        pair_mlp_layers=8,
        num_attention_heads=8,
        dropout=0.1,
        temperature=0.07,
        pairs_per_model=128,
        models_per_batch=8,
    ),
    # Same name the embeddings-only model is saved under in the previous cell.
    base_model_name="gradient_boosting/gradient-boosting-model-embeddings-only",
)

gradient_boosting_model.train(data, validation_split=ValidationSplit(val_fraction=0.2, seed=42), epochs=500, batch_size=128)

# Save under a distinct name. The original saved to
# "gradient-boosting-model-embeddings-only", i.e. the same name as the base
# model referenced above, which would replace the base model with this
# all-features model and make a re-run of this cell build on itself.
# NOTE(review): assumes save() writes GradientBoostingModel artifacts under
# "gradient_boosting/<name>" -- confirm, and update any consumer that expected
# the old name to contain the all-features model.
gradient_boosting_model.save("gradient-boosting-model-all-features")

In [None]:
from src.models.dn_embedding_model import DnEmbeddingModel
from src.models.finetuning_specs.lora_spec import LoraSpec
from src.models.finetuning_specs.qlora_spec import QLoraSpec
from src.models.optimizers.muon_spec import MuonSpec
from src.models.transformer_embedding_model import TransformerEmbeddingModel
from src.models.embedding_specs.attention_embedding_spec import AttentionEmbeddingSpec
from src.models.optimizers.adamw_spec import AdamWSpec
from src.utils.data_split import ValidationSplit

from src.utils.timer import Timer
#Timer.default_verbosity = None


# Name used both for periodic checkpoints and for the final save.
model_name = "transformer-embedding-lora-mpnet-base-chatbot-arena"

# Optimizer settings for the transformer model.
transformer_optimizer_spec = MuonSpec(
    learning_rate=2e-2,
    adamw_lr=1e-4,
    lr_decay_gamma=0.99,
    weight_decay=0.001,
)

# LoRA fine-tuning settings applied to the "q" and "v" target modules.
transformer_lora_spec = LoraSpec(
    rank=16,
    alpha=32,
    dropout=0.05,
    target_modules=["q", "v"],
)

# Optimizer and architecture settings for the attention embedding model.
attention_optimizer_spec = AdamWSpec(
    learning_rate=2e-5,
    lr_decay_gamma=0.995,
)
attention_embedding_spec = AttentionEmbeddingSpec(
    encoder_model_name="all-MiniLM-L6-v2",
    optimizer=attention_optimizer_spec,
    h_emb=256,
    h_scalar=64,
    h_pair=256,
    d_out=128,
    pair_mlp_layers=8,
    num_attention_heads=8,
    dropout=0.1,
    temperature=0.07,
    pairs_per_model=128,
    models_per_batch=8,
)

model = TransformerEmbeddingModel(
    # --- transformer and head ------------------------------------------------
    transformer_model_name="sentence-transformers/all-mpnet-base-v2",
    hidden_dims=[256, 128, 64],
    dropout=0.2,
    max_length=256,
    optimizer_spec=transformer_optimizer_spec,
    finetuning_spec=transformer_lora_spec,
    # --- sampling / reproducibility ------------------------------------------
    balance_model_samples=True,
    min_model_comparisons=1000,
    seed=42,
    # --- logging and checkpointing -------------------------------------------
    print_every=1,
    save_every=4,
    checkpoint_name=model_name,
    # --- embedding model -----------------------------------------------------
    embedding_model_epochs=250,
    embedding_spec=attention_embedding_spec,
)

# Train with a seeded 20% validation hold-out, then save under `model_name`.
transformer_validation_split = ValidationSplit(val_fraction=0.2, seed=42)
model.train(
    data,
    validation_split=transformer_validation_split,
    epochs=40,
    batch_size=8,
)
model.save(model_name)

# Blank line, then inspect the timer recorded by the model.
print()
model.last_timer.inspect(1)
# Disabled: also inspect the embedding model's timer.
#print()
#model.embedding_model.last_timer.inspect(1)

In [None]:
import matplotlib.pyplot as plt

from src.plotting_utils import plot_loss, plot_accuracy

# Training curves of the embeddings-only gradient-boosting model on a 2x2 grid:
# losses on the top row, accuracies on the bottom row.
history = prompt_invariant_model.get_history()
_, axes = plt.subplots(2, 2, figsize=(15, 10))

# One row per panel: (plot function, grid position, series, title).
# Panels are drawn in the order listed.
history_panels = [
    (plot_loss, (0, 0), history.total_loss, "Training loss"),
    (plot_loss, (0, 1), history.val_loss, "Validation loss"),
    (plot_accuracy, (1, 0), history.train_accuracy, "Training accuracy"),
    (plot_accuracy, (1, 1), history.val_accuracy, "Validation accuracy"),
]
for panel_fn, panel_pos, panel_series, panel_title in history_panels:
    panel_fn(axes[panel_pos], panel_series, panel_title)

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

from src.plotting_utils import plot_loss, plot_accuracy

# Per-epoch logs of the embedding model attached to `model` (the transformer
# model from the training cell above).
# NOTE(review): `_epoch_logs` is a private attribute; prefer a public history
# accessor if the embedding model exposes one -- confirm.
embedding_history = model.embedding_model._epoch_logs

# 3x2 grid: losses, triplet accuracies, universal accuracies
# (training on the left, validation on the right).
_, axes = plt.subplots(3, 2, figsize=(15, 15))

plot_loss(axes[0, 0], [e.train_loss for e in embedding_history], "Training loss")
plot_loss(axes[0, 1], [e.val_loss for e in embedding_history], "Validation loss")
plot_accuracy(axes[1, 0], [e.triplet_accuracy for e in embedding_history], "Training triplet accuracy")
plot_accuracy(axes[1, 1], [e.val_triplet_accuracy for e in embedding_history], "Validation triplet accuracy")
# Disabled alternative for the bottom row: nearest-neighbor accuracy.
# plot_accuracy(axes[2, 0], [e.nearest_neighbor_accuracy for e in embedding_history], "Training nearest neighbor accuracy")
# plot_accuracy(axes[2, 1], [e.val_nearest_neighbor_accuracy for e in embedding_history], "Validation nearest neighbor accuracy")
plot_accuracy(axes[2, 0], [e.train_universal_accuracy for e in embedding_history], "Training universal accuracy")
plot_accuracy(axes[2, 1], [e.val_universal_accuracy for e in embedding_history], "Validation universal accuracy")

# Disabled alternative for the bottom-left panel: loss components overlaid.
# axes[2, 0].plot([e.train_loss for e in embedding_history], label="Training loss")
# axes[2, 0].plot([e.train_triplet_loss for e in embedding_history], label="Training triplet loss")
# axes[2, 0].plot([e.train_reg_loss for e in embedding_history], label="Training regularization loss")
# axes[2, 0].legend()

# Lay out and render the figure explicitly, matching the other plotting cell.
plt.tight_layout()
plt.show()