In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:

import os
import sys
import copy
import torch
import wandb
import re
import numpy as np
import seaborn as sns
from dataclasses import replace
from functools import partial
from matplotlib import pyplot as plt
from huggingface_hub import login
from tempfile import TemporaryDirectory
from pytorch_lightning import seed_everything
from huggingface_hub import create_repo, login, HfApi

from projects.wiki_experts.train_experts_main import get_datamodule
from projects.wiki_experts.src.evolution.utils import (
    get_loss,
    init_wandb_logger,
    TableLogger,
    get_svd_embedding
)

from projects.wiki_experts.src.expert_trainer import ExpertTrainer
from mttl.models.modifiers.expert_containers.expert_library import (
    get_best_expert_for_task,
    get_best_expert_for_score,
    LocalExpertLibrary,
    HFExpertLibrary,
    ExpertLibrary,
    Score,
)
from projects.wiki_experts.src.evolution.evaluators import (
    Evaluator,
    prepare_evaluator,
    EvalCallback,
)


from mttl.models.modifiers.expert_containers.module_graph import Expert

from projects.wiki_experts.src.evolution.config import (
    EvolExpertConfig,
    increase_version,
)
from projects.wiki_experts.src.evolution.nevergrad_opt import NGRoutingOptimizer
from mttl.utils import setup_logging, logger
from projects.wiki_experts.src.expert_model import MultiExpertModel
from projects.wiki_experts.src.evolution.experiment_state import ExperimentState
from mttl.vllm_engines.engines import free_memory
from projects.wiki_experts.src.evolution.transfer_matrix import (
    eval_all_experts_on_task,
    eval_expert_on_task,
)
from mttl.datamodule.base import DefaultDataModule
from mttl.models.modifiers.expert_containers.library_transforms import (
    SVDEmbeddingTransform,
    SVDEmbeddingTransformConfig,
)


import os
import itertools
import pandas as pd
from huggingface_hub import login, HfApi, logout
import numpy as np
from scipy.cluster.hierarchy import linkage, fcluster, dendrogram
from scipy.spatial.distance import squareform
from mttl.models.modifiers.expert_containers.module_graph import load_expert

  warn(


Registering modifier...lora
Registering modifier...poly
Registering modifier...skilled
Registering modifier...kv_adapter
Registering modifier...poly_kv_adapter
Registering modifier...prompt_tuning
Registering modifier...poly_prompt_tuning
Registering modifier...hard_prompt
Registering multi-expert selector...poly_router
Registering multi-expert selector...moe_rkhs_router
Registering multi-expert selector...poly_router_dir
Registering multi-expert selector...info_selector
Registering multi-expert selector...task_selector
Registering multi-expert selector...kv_task_selector
Registering multi-expert selector...kv_concat_selector
Registering multi-expert selector...kv_norm_selector
Registering multi-expert selector...kv_concat_norm_selector
Registering multi-expert selector...kv_task_norm_selector


In [12]:
# Hugging Face authentication.
# The access token is read from the HF_TOKEN environment variable instead of
# being pasted into the notebook, so it cannot leak through version control or
# shared copies of this file.
hf_api_key = os.environ.get("HF_TOKEN", "").strip()
if not hf_api_key:
    raise RuntimeError(
        "HF_TOKEN is not set; export a Hugging Face access token before running this cell."
    )
login(token=hf_api_key)
user = HfApi(token=hf_api_key).whoami()
# Re-export the (whitespace-stripped) token for code that reads it from the environment.
os.environ["HF_TOKEN"] = hf_api_key

# Expert library to analyse.
# hf_repo_id = "sordonia/library-phi_2-v3"
hf_repo_id = "sordonia/library-phi_2-v3-2epc"

# The notebook works on a local copy of the library stored under /tmp.
# On a fresh machine the copy has to be created from the remote library first
# (see the two commented-out lines below); otherwise the local library is empty
# and the size check at the end of this cell fails.
# expert_lib =  HFExpertLibrary(hf_repo_id)
local_lib_location = f"/tmp/{hf_repo_id}"
os.makedirs(local_lib_location, exist_ok=True)
# expert_lib:LocalExpertLibrary = LocalExpertLibrary.create_from_remote(expert_lib, local_lib_location)
expert_lib: LocalExpertLibrary = LocalExpertLibrary(local_lib_location)

# Experts excluded from the analysis (presumably held-out evaluation tasks -- TODO confirm).
# The membership test makes this cell safe to re-run after the experts are gone.
experts_to_remove = ["bool_q_1_0_0", "ai2_arc_ARC_Easy_1_0_0", "openbookqa_0_1_0", "ai2_arc_ARC_Challenge_1_0_0", "hellaswag_1_1_0", "piqa_1_0_0", "winogrande_1_1_0"]
for expert_name in experts_to_remove:
    if expert_name in expert_lib:
        expert_lib.remove_expert(expert_name)

# Sanity check: the rest of the notebook expects exactly 256 experts.
assert len(expert_lib) == 256, (
    f"expected 256 experts in {local_lib_location} after filtering, found {len(expert_lib)}"
)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/v-oostapenko/.cache/huggingface/token
Login successful


In [13]:

def create_embeddings():
    """Run the SVD embedding transform over the notebook-level ``expert_lib``.

    Builds an ``SVDEmbeddingTransform`` configured with
    ``sparsity_threshold=0.5`` and applies it to ``expert_lib`` with
    ``upload_to_hf=True``. The value returned by ``transform`` is discarded;
    this function returns ``None``.
    """
    config = SVDEmbeddingTransformConfig(sparsity_threshold=0.5)
    # The transform object is only needed for this single call, so it is not
    # bound to a name and is released as soon as the call returns.
    SVDEmbeddingTransform(config).transform(expert_lib, upload_to_hf=True)


# Map every expert name in the library to its SVD embedding.
#
# If an embedding is missing, the embeddings are (re)generated with
# create_embeddings(). That call processes the whole library and uploads the
# result, so it is executed at most once per run of this cell instead of once
# per missing expert. An embedding that is still missing afterwards is reported
# immediately rather than being stored as None, which would only fail later
# when the embeddings are stacked into an array.
module2embed = {}
embeddings_regenerated = False
for n, m in expert_lib.items():
    embedding = get_svd_embedding(expert_lib, n)
    if embedding is None and not embeddings_regenerated:
        create_embeddings()
        embeddings_regenerated = True
        embedding = get_svd_embedding(expert_lib, n)
    if embedding is None:
        raise RuntimeError(
            f"No SVD embedding available for expert '{n}' even after regenerating embeddings."
        )
    module2embed[n] = embedding

In [22]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
# Number of clusters to form; adjust to taste.
K = 25

# Stack the per-expert embeddings into one array; row order follows the
# insertion order of module2embed.
embeddings = np.array([module2embed[name] for name in module2embed])

# Pairwise cosine similarity between all experts.
cosine_sim_matrix = cosine_similarity(embeddings, embeddings)

# NOTE: KMeans itself has no metric option. What is clustered here are the rows
# of the cosine-similarity matrix, i.e. each expert is represented by its
# similarity profile against all experts.
# The fixed random_state keeps the cluster assignment reproducible.
kmeans = KMeans(
    n_clusters=K,
    init="k-means++",
    n_init=10,
    random_state=42,
).fit(cosine_sim_matrix)

# One cluster index per expert, aligned with the rows of `embeddings`.
cluster_labels = kmeans.labels_


In [23]:
from collections import defaultdict
# Group expert names by the cluster they were assigned to, reporting each
# assignment as it is recorded. cluster_labels is aligned with the key order
# of module2embed.
clusters = defaultdict(list)
for key, label in zip(module2embed, cluster_labels):
    clusters[label].append(key)
    print(f"Embedding {key} belongs to Cluster {label}")

Embedding glue_sst2_2_0_0 belongs to Cluster 2
Embedding dream_read_the_following_conversation_and_answer_the_question belongs to Cluster 21
Embedding race_middle_Read_the_article_and_answer_the_question_no_option_ belongs to Cluster 8
Embedding adversarial_qa_droberta_generate_question belongs to Cluster 23
Embedding adversarial_qa_dbidaf_question_context_answer belongs to Cluster 19
Embedding app_reviews_convert_to_star_rating belongs to Cluster 9
Embedding race_high_Select_the_best_answer belongs to Cluster 8
Embedding super_glue_rte_1_0_2 belongs to Cluster 24
Embedding true_case belongs to Cluster 2
Embedding wiqa_what_might_be_the_first_step_of_the_process belongs to Cluster 3
Embedding quail_description_context_question_answer_id belongs to Cluster 8
Embedding quail_context_question_description_text belongs to Cluster 8
Embedding stream_qed belongs to Cluster 10
Embedding huggingface_xsum belongs to Cluster 10
Embedding cos_e_v1_11_question_option_description_text belongs to Clu

In [24]:
# Emit one assignment-style line per cluster, named c<cluster>o<K>_2e,
# listing the experts in that cluster.
for c in clusters:
    l = clusters[c]
    print(f"c{c}o{K}_2e = {l}")

c2o25_2e = ['glue_sst2_2_0_0', 'true_case', 'cot_esnli', 'trec_1_0_0', 'yelp_polarity_reviews_0_2_0', 'glue_cola_2_0_0', 'ag_news_subset_1_0_0', 'math_dataset_algebra__linear_1d_1_0_0', 'fix_punct', 'imdb_reviews_plain_text_1_0_0', 'word_segment', 'anli_r2_0_1_0', 'anli_r1_0_1_0']
c21o25_2e = ['dream_read_the_following_conversation_and_answer_the_question', 'race_middle_Is_this_the_right_answer', 'race_high_Is_this_the_right_answer', 'quarel_do_not_use', 'dream_baseline', 'quarel_heres_a_story', 'quarel_choose_between', 'social_i_qa_Show_choices_and_generate_index', 'quarel_testing_students', 'wiqa_effect_with_label_answer', 'cot_qasc', 'quarel_logic_test', 'stream_aqua_ii']
c8o25_2e = ['race_middle_Read_the_article_and_answer_the_question_no_option_', 'race_high_Select_the_best_answer', 'quail_description_context_question_answer_id', 'quail_context_question_description_text', 'race_high_Read_the_article_and_answer_the_question_no_option_', 'race_high_Select_the_best_answer_no_instruct

In [32]:
# Report the size of every cluster followed by its member experts.
for c in clusters:
    l = clusters[c]
    print(f"Cluster {c} has {len(l)} elements", l, sep="\n")

!!!!!!!!!!!!!!!!!!!!!! DEBUG MODE
Cluster 8 has 22 elements
['glue_sst2_2_0_0', 'adversarial_qa_droberta_generate_question', 'stream_qed', 'super_glue_record_1_0_2', 'yelp_polarity_reviews_0_2_0', 'lambada_1_0_0', 'dream_generate_last_utterance', 'ag_news_subset_1_0_0', 'duorc_ParaphraseRC_title_generation', 'adversarial_qa_dbidaf_generate_question', 'quoref_Guess_Title_For_Context', 'fix_punct', 'imdb_reviews_plain_text_1_0_0', 'race_middle_Write_a_multi_choice_question_for_the_following_article', 'duorc_SelfRC_title_generation', 'anli_r2_0_1_0', 'race_high_Write_a_multi_choice_question_for_the_following_article', 'anli_r1_0_1_0', 'cot_ecqa', 'race_high_Write_a_multi_choice_question_options_given_', 'adversarial_qa_dbert_generate_question', 'race_middle_Write_a_multi_choice_question_options_given_']
Cluster 3 has 32 elements
['dream_read_the_following_conversation_and_answer_the_question', 'cos_e_v1_11_question_option_description_text', 'social_i_qa_Show_choices_and_generate_answer', 