# 构造niid和iid的jsonl，看看其对应的广度和质量

In [1]:
from FlagEmbedding import FlagModel
import numpy as np
from sklearn.manifold import TSNE
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
from pprint import pprint as pp
import time
import umap
import os
import random
import time
from contextlib import contextmanager
from datasets import load_dataset, load_from_disk
from datasets import load_dataset, concatenate_datasets, load_from_disk
import pandas as pd
import datasets
from datasets import Dataset
from pprint import pprint as pp
from datasets import Dataset
from sklearn.cluster import KMeans
from tqdm import tqdm
import torch
import heapq
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig

In [2]:
code_data = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
fin_data = load_dataset("FinGPT/fingpt-sentiment-train")["train"]
med_data = load_dataset("medalpaca/medical_meadow_medical_flashcards")["train"]
general_data = load_dataset("tatsu-lab/alpaca")["train"]
math_data = load_dataset("TIGER-Lab/MathInstruct")["train"]

def alpaca_format(example):
    if example['input'] == "":
        example["instruction"] = example["instruction"]
    else:
        example["instruction"] = example["instruction"] + " " + example['input']
    example["response"] = example['output']
    return example

def process_sft_dataset(dataset_name, dataset, dataset_sample=None)->datasets.Dataset:
    if dataset_name in ["lucasmccabe-lmi/CodeAlpaca-20k", "yahma/alpaca-cleaned", "FinGPT/fingpt-sentiment-train"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output'], desc=f"Preprocessing {dataset_name} for unified format.")
        # if "fin" in dataset_name: dataset = dataset.shuffle(seed=42).select(range(51000))
    elif dataset_name in ["WizardLM/WizardLM_evol_instruct_70k"]:
        dataset = dataset.rename_column("output", "response")
    elif dataset_name in ["tatsu-lab/alpaca", "vicgalle/alpaca-gpt4", "gbharti/finance-alpaca"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output', 'text'], desc=f"Preprocessing {dataset_name} for unified format.")
    elif dataset_name in ["TIGER-Lab/MathInstruct"]:
        df = pd.DataFrame(dataset)
        df = df.drop_duplicates(subset=['instruction'])
        dataset = datasets.Dataset.from_pandas(df)
        # dataset = dataset.shuffle(seed=42).select(range(51000))
        dataset = dataset.rename_column("output", "response")
        dataset = dataset.remove_columns(['source'])
    elif dataset_name in ["lighteval/MATH"]:
        dataset = dataset.rename_column("solution", "response")
        dataset = dataset.rename_column("problem", "instruction")
        dataset = dataset.remove_columns(['level', 'type'])
    elif dataset_name in ['gsm8k']:
        dataset = dataset.rename_column("question", "instruction")
        dataset = dataset.rename_column("answer", "response")
    elif dataset_name in ['medalpaca/medical_meadow_medical_flashcards']:       # TODO: 'lavita/ChatDoctor-HealthCareMagic-100k'. not sure whether to discard the instruction.
        dataset = dataset.remove_columns(['instruction'])
        dataset = dataset.rename_column("input", "instruction")
        dataset = dataset.rename_column("output", "response")
    elif "math" in dataset_name:
        dataset = dataset.remove_columns(['source'])
        dataset = dataset.rename_column("output", "response")
    else:
        raise NotImplementedError(f"Dataset {dataset_name} is not supported.")
    dataset = dataset.shuffle(seed=42)
    if dataset_sample:
        num_sample = min(len(dataset), dataset_sample)
        dataset = dataset.select(range(num_sample))
    print(f">> ===== After processing, Dataset {dataset_name} has {len(dataset)} examples. =====")
    return dataset

processed_data = []
for name, dataset in zip(["lucasmccabe-lmi/CodeAlpaca-20k","FinGPT/fingpt-sentiment-train","medalpaca/medical_meadow_medical_flashcards","tatsu-lab/alpaca","TIGER-Lab/MathInstruct"],[code_data,fin_data,med_data,general_data,math_data]):
# for name, dataset in zip(["lucasmccabe-lmi/CodeAlpaca-20k","FinGPT/fingpt-sentiment-train","medalpaca/medical_meadow_medical_flashcards", "TIGER-Lab/MathInstruct"],[code_data,fin_data,med_data,math_data]):
    tmp:datasets.Dataset = process_sft_dataset(name,dataset)
    print(tmp.column_names)
    processed_data.append(tmp)
    
data_concated = concatenate_datasets(processed_data)

>> ===== After processing, Dataset lucasmccabe-lmi/CodeAlpaca-20k has 20022 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset FinGPT/fingpt-sentiment-train has 76772 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset medalpaca/medical_meadow_medical_flashcards has 33955 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset tatsu-lab/alpaca has 52002 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset TIGER-Lab/MathInstruct has 224567 examples. =====
['response', 'instruction', '__index_level_0__']


In [3]:
model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval="",
                  use_fp16=True)

----------using 3*GPUs----------


In [4]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', model_kwargs={"torch_dtype":torch.bfloat16})



In [4]:
def matmul_block(A,B,block_size=50000):
    C = torch.zeros(A.size(0), B.size(1))
    # 进行分块矩阵乘法
    for i in range(0, A.size(0), block_size):
        for j in range(0, B.size(1), block_size):
            for k in range(0, A.size(1), block_size):
                # 计算分块索引
                i_end = min(i + block_size, A.size(0))
                j_end = min(j + block_size, B.size(1))
                k_end = min(k + block_size, A.size(1))
                # 执行子块乘法并累加到结果矩阵中
                C[i:i_end, j:j_end] += torch.mm(A[i:i_end, k:k_end].cuda(), B[k:k_end, j:j_end].cuda()).cpu()
    return C


In [4]:
import numpy as np
import torch
from torch.nn.functional import cosine_similarity as cosine_similarity

def coverage(A, V):
    # 将输入转换为张量
    A_tensor = torch.tensor(A, dtype=torch.float32)
    V_tensor = torch.tensor(V, dtype=torch.float32)
    # 计算集合A的覆盖广度
    # similarities = matmul_block(V_tensor,A_tensor.T)
    similarities = torch.matmul(V_tensor, A_tensor.T)
    # 计算每个v的最大相似度
    max_similarities = torch.max(similarities, dim=1).values
    # 计算总相似度
    total_similarity = torch.sum(max_similarities).item()/len(max_similarities)
    return total_similarity

In [5]:
data_concated_embeddings = model.encode(data_concated["instruction"])

Inference Embeddings:   0%|          | 0/531 [00:00<?, ?it/s]

dc61-p1a-t455-n036:904320:904320 [0] NCCL INFO cudaDriverVersion 12020
dc61-p1a-t455-n036:904320:904320 [0] NCCL INFO NCCL_SOCKET_FAMILY set by environment to AF_INET6
dc61-p1a-t455-n036:904320:904320 [0] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
dc61-p1a-t455-n036:904320:904320 [0] NCCL INFO Bootstrap : Using eth0:fdbd:dc61:1a:455::36<0>
dc61-p1a-t455-n036:904320:904320 [0] NCCL INFO NET/Plugin : dlerror=libnccl-net.so: cannot open shared object file: No such file or directory No plugin found (libnccl-net.so), using internal implementation
NCCL version 2.20.5+cuda12.4
dc61-p1a-t455-n036:904320:1095493 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
dc61-p1a-t455-n036:904320:1095493 [1] NCCL INFO NCCL_SOCKET_FAMILY set by environment to AF_INET6
dc61-p1a-t455-n036:904320:1095493 [1] NCCL INFO NCCL_SOCKET_IFNAME set by environment to eth0
dc61-p1a-t455-n036:904320:1095493 [1] NCCL INFO NCCL_IB_HCA set to mlx5
dc61-p1a-t455-n036:904320:1095493 [1] NCCL INFO NET/IB : 

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
Inference Embeddings: 100%|██████████| 531/531 [04:36<00:00,  1.92it/s]


## niid的广度

In [56]:
import numpy as np
import umap
from scipy.spatial import ConvexHull
from cuml.manifold import UMAP, TSNE

def umap_embeddings_volume(embeddings):
    # 初始化 UMAP 降维模型
    reducer = umap.UMAP(n_components=2, metric='cosine')
    # 进行 UMAP 降维
    low_dim_embeddings = reducer.fit_transform(embeddings)
    # 将降维后的数据转换为 NumPy 数组
    points = np.array(low_dim_embeddings)
    # 计算凸包
    hull = ConvexHull(points)
    return hull.volume

def cuml_tsne_embeddings_volume(embeddings):
    reducer = TSNE(n_components=2, metric='cosine')
    # 进行 UMAP 降维
    low_dim_embeddings = reducer.fit_transform(embeddings)
    # 将降维后的数据转换为 NumPy 数组
    points = np.array(low_dim_embeddings)
    # 计算凸包
    hull = ConvexHull(points)
    return hull.volume

def cuml_umap_embeddings_volume(embeddings):
    reducer = UMAP(n_components=2, metric='cosine')
    # 进行 UMAP 降维
    low_dim_embeddings = reducer.fit_transform(embeddings)
    # 将降维后的数据转换为 NumPy 数组
    points = np.array(low_dim_embeddings)
    # 计算凸包
    hull = ConvexHull(points)
    return hull.volume

## 计算数据平均质量

In [51]:
reward_name = "/mnt/bn/data-tns-live-llm/leon/datasets/reward-model-deberta-v3-large-v2/"
device = "cuda"
rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(reward_name, torch_dtype=torch.bfloat16).to(device), AutoTokenizer.from_pretrained(reward_name)
# rank_model, tokenizer = AutoModelForSequenceClassification.from_pretrained(reward_name, load_in_4bit=True), AutoTokenizer.from_pretrained(reward_name)
question, answer = "Explain nuclear fusion like I am five", "Nuclear fusion is the process by which two or more protons and neutrons combine to form a single nucleus. It is a very important process in the universe, as it is the source of energy for stars and galaxies. Nuclear fusion is also a key process in the production of energy for nuclear power plants."
inputs = tokenizer(question, answer, return_tensors='pt').to(device)
score = rank_model(**inputs).logits[0].detach()
print(float(score))

def quality_evaluation(datas):
    score = 0
    cnt = 0
    result_list = []
    for element in tqdm(datas):
        instruction = element['instruction']
        _input = ''
        if 'input' in element.keys():
            _input = element['input']
        _output = element['response']
        question = ''
        if _input == '':
            question = instruction
        else:
            question = instruction + '\n' +_input
        
        answer = _output
        
        try:
            inputs = tokenizer(question, answer, return_tensors='pt').to(device)
            score += rank_model(**inputs).logits[0].detach()
            cnt +=1
        except:
            print(instruction)
            print(_output)
            continue
        
    print(score/cnt)
        # final_result = {'instruction':instruction,'input':_input,'response':_output,'reward_score':float(score)}
        # result_list.append(final_result)



2.265625


In [None]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
settings = ["pos", "random", "iid2niid_code", "iid2niid_code_public"]
for setting in settings:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    datas = datas.select(random.sample(range(len(datas)), 1000))
    quality_evaluation(datas)

In [None]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
settings = ["niid_0.01", "niid_0.1", "niid_1", "niid_10", "niid_med_0.01", "niid_med_0.1", "niid_med_1", "niid_med_10"]
for setting in settings:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    datas = datas.select(random.sample(range(len(datas)), 1000))
    quality_evaluation(datas)

## 使用gpt评估

In [96]:
import requests
import json

messages_template =[
    {
        "role": "system",
        "content": """We would like to request your feedback on the performance of AI assistant in response to the instruction and the given input displayed following. Instruction: {instruction} Input: None Response: {response}"""
    },
    {
        "role": "user",
        "content": """Please rate according to the accuracy of the response to the instruction and the input. Each assistant receives a score on a scale of 0 to 5, where a higher score indicates higher level of the accuracy. Please first output a single line containing the value indicating the scores. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias."""
    },
    ]

messages_template[0]["content"] = messages_template[0]["content"].format_map({"instruction":"write a joke about biden","response":"oh biden, sleepy joe!"})
print(messages_template)
MODEL = "gpt-4o-2024-05-13"
MODEL = "gpt-35-turbo"
data = {
    "model": MODEL,
    "messages": messages_template,
    "temperature": 0.1,
    "top_p": 1,
    "n": 1,
    "stream": False,
    "max_tokens": 256,
}
headers = {'Content-Type': 'application/json', 'Caller': 'leon.kepler'}
# data = {k: v for k, v in data.items() if v is not None}
# data = json.dumps(data)
# url = f"https://swzkkd0h.us-east-fn.bytedance.net/gpt/openapi/online/v2/crawl"
# response = requests.post(url, data=data, headers=headers)
# print(response.content)
# score = int(response.json()["choices"][0]["message"]["content"][0])
# print(score)

[{'role': 'system', 'content': 'We would like to request your feedback on the performance of AI assistant in response to the instruction and the given input displayed following. Instruction: write a joke about biden Input: None Response: oh biden, sleepy joe!'}, {'role': 'user', 'content': 'Please rate according to the accuracy of the response to the instruction and the input. Each assistant receives a score on a scale of 0 to 5, where a higher score indicates higher level of the accuracy. Please first output a single line containing the value indicating the scores. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias.'}]


In [18]:
from volcenginesdkarkruntime import Ark
client = Ark(base_url="https://ark.cn-beijing.volces.com/api/v3",api_key="")

# Non-streaming:
print("----- standard request -----")
completion = client.chat.completions.create(
    model="",
    messages = [
        {"role": "system", "content": "We would like to request your feedback on the performance of AI assistant in response to the instruction and the given input displayed following. Please rate according to the accuracy of the response to the instruction and the input. Each assistant receives a score on a scale of 0 to 5, where a higher score indicates higher level of the accuracy. Please first output a single line containing the value indicating the scores. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias."},
        {"role": "user", "content": """Instruction: Remove all the punctuation from a given string "Welcome to the world of computers!", Response: 'import string\n\ndef remove_punctuation(text):\n    punctuations = string.punctuation\n    no_punct = ""\n    for char in text:\n        if char not in punctuations:\n            no_punct += char\n    return no_punct\n\nif __name__ == \'__main__\':\n    text = \'Welcome to the world of computers!\'\n    print(remove_punctuation(text))'"""},
    ],
    # temperature=0.1,
    # max_tokens=256
)
print(completion.choices[0].message.content)

----- standard request -----
4


In [7]:
import re
def request_doubao(messages):
    response = client.chat.completions.create(
        model="ep-20240804150426-2vkvx",
        messages = messages,
        temperature=0.1,
        max_tokens=256
    )
    try:
        response = response.choices[0].message.content
        match = re.search(r'\d+', response)
        score = int(match.group(0))
    except Exception as e: 
        print("request",response)
        score = 3
    return score

In [None]:
import multiprocessing as mp
from tqdm import tqdm
import re
import copy

MODEL = "gpt-35-turbo"

messages_template =[
{
    "role": "system",
    "content": """We would like to request your feedback on the performance of AI assistant in response to the instruction and the given input displayed following. Please rate according to the accuracy of the response to the instruction and the input. Each assistant receives a score on a scale of 0 to 5, where a higher score indicates higher level of the accuracy. Please first output a single line containing the value indicating the scores. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias."""
},
{
    "role": "user",
    "content": """Instruciton: {instruction} Response: {response}"""
},
]

def request_gpt(messages):
    data = {
        "model": MODEL,
        "messages": messages,
        "temperature": 0.1,
        "top_p": 1,
        "n": 1,
        "stream": False,
        "max_tokens": 256,
    }
    headers = {'Content-Type': 'application/json', 'Caller': 'leon.kepler'}
    data = {k: v for k, v in data.items() if v is not None}
    data = json.dumps(data)
    url = f"https://swzkkd0h.us-east-fn.bytedance.net/gpt/openapi/online/v2/crawl"
    # response = requests.post(url, data=data, headers=headers)
    # if "choices" not in response.json().keys(): response = requests.post(url, data=data, headers=headers)
    try: 
        response = requests.post(url, data=data, headers=headers)
        response = response.json()["choices"][0]["message"]["content"]
        match = re.search(r'\d+', response)
        score = int(match.group(0))
        # score = int(response.json()["choices"][0]["message"]["content"][0])
    except Exception as e: 
        print("request",response)
        score = 3
    return score

# Example scoring function
def calculate_score(data_item):
    try:
        messages = copy.deepcopy(messages_template)
        messages[1]["content"] = messages[1]["content"].format_map({"instruction":data_item["instruction"],"response":data_item["response"]})
        # score = request_gpt(messages)
        score = request_doubao(messages)
    except Exception as e:
        print("calculate", e)
        score = 3
    return score

def process_item(data_item):
    return calculate_score(data_item)

if __name__ == '__main__':
    # Number of processes to use
    num_processes = mp.cpu_count()
    root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
    datas = []
    settings = ["pos", "random", "iid2niid_code", "iid2niid_code_public"]
    for setting in settings:
        datas = []
        for i in range(10):
            datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
        datas = concatenate_datasets(datas)
        datas = datas.select(random.sample(range(len(datas)), 1000))
        # Use the pool to map the function to the dataset
        with mp.Pool(processes=5) as pool:
            scores = pool.map(process_item, tqdm(datas))
        
        # scores=[]
        # for data in tqdm(datas):
        #     scores.append(process_item(data))

        # Calculate the average score
        average_score = sum(scores) / len(scores)
        print(f'Average Quality Score: {average_score:.2f}')


## 相对于 public 数据集的覆盖度

In [11]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
# settings = ["pos", "random", "iid2niid_code", "iid2niid_code_public"] [0.75,0.88,0.72,0.8095]
# settings = ["iid2niid_code_public_filter"] # 0.8095
# settings = ["pos_med", "random_med", "iid2niid_med"]
# settings = ["public_10", "public_1", "public_0.1","public_0.01"]
# settings = ["niid_pos_public", "iid_pos_public"]
# settings = ["iid2niid_math","iid2niid_math_filter"]
settings = ["iid2niid_fin","iid2niid_fin_filter"]
# settings = ["pos_math"]
# settings = ["fed_code_only"]
for setting in settings:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    if len(datas) > 55000: datas = datas.select(random.sample(range(len(datas)), 55000))
    datas_embeddings = model.encode(datas["instruction"])
    volume = coverage(datas_embeddings, data_concated_embeddings)
    # volume = umap_embeddings_volume(datas_embeddings)
    print(f"{setting} The volume of the UMAP-reduced embeddings is:", volume)

Inference Embeddings: 100%|██████████| 25/25 [00:14<00:00,  1.68it/s]


iid2niid_fin The volume of the UMAP-reduced embeddings is: 0.654271125386062


Inference Embeddings: 100%|██████████| 25/25 [00:13<00:00,  1.85it/s]


iid2niid_fin_filter The volume of the UMAP-reduced embeddings is: 0.7089580499756947


## 相对于各 domain 数据集的覆盖度

In [6]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
# settings = ["pos", "random", "iid2niid_code_public"] [0.9396, 0.8512, 0.9531, 0.9766]
# settings = ["iid2niid_code_public_filter"] # 0.9766
# settings = ["niid_0.01", "niid_0.1", "niid_1", "niid_10"]
# settings = ["pos_med", "random_med", "iid2niid_med"]
# settings = ["niid_med_0.01", "niid_med_0.1", "niid_med_1", "niid_med_10"]
# settings = ["public_10", "public_1", "public_0.1","public_0.01"]
# settings = ["niid_pos_public", "iid_pos_public"]
# settings = ["random_math", "iid2niid_math_filter"]
# settings = ["random_fin","pos_math"]
# settings = ["iid2niid_math","iid2niid_math_filter"]
# settings = ["pos_nodup_math"]
# settings = ["iid2niid_fin","iid2niid_fin_filter"]
settings = [
    "iid2niid_code_4000","iid2niid_code_3000","iid2niid_code_2000","iid2niid_code_1000",
    "iid2niid_med_4000","iid2niid_med_3000","iid2niid_med_2000","iid2niid_med_1000",
    "iid2niid_fin_4000","iid2niid_fin_3000","iid2niid_fin_2000","iid2niid_fin_1000",
    "iid2niid_math_4000","iid2niid_math_3000","iid2niid_math_2000","iid2niid_math_1000",
]

# partial data
# code_embeddings = data_concated_embeddings[:20022]
# med_embeddings = data_concated_embeddings[71022:104977]
# fin_embeddings = data_concated_embeddings[20022:71022]
# math_embeddings = data_concated_embeddings[156979:207979]

# full data
code_embeddings = data_concated_embeddings[:20022]
med_embeddings = data_concated_embeddings[96794:130749]
fin_embeddings = data_concated_embeddings[20022:96794]
math_embeddings = data_concated_embeddings[182751:]

# names = ["code","med","fin","math"]
# domain_embeddings = [code_embeddings,med_embeddings,fin_embeddings,math_embeddings]

# 下两行都要改！
# names = ["math"]
# domain_embedding_list = [math_embeddings] #这要根据 domain 进行修改

for setting in tqdm(settings):
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    if len(datas) > 55000: datas = datas.select(random.sample(range(len(datas)), 55000))
    datas_embeddings = model.encode(datas["instruction"])

    if "code" in setting: domain_embeddings = code_embeddings
    elif "med" in setting: domain_embeddings = med_embeddings
    elif "fin" in setting: domain_embeddings = fin_embeddings
    else: domain_embeddings = math_embeddings

    volume = coverage(datas_embeddings, domain_embeddings)
    print(f"{setting}'s coverage is:", volume)

    # for name, domain_embeddings in zip(names, domain_embedding_list):
    #     volume = coverage(datas_embeddings, domain_embeddings)
    #     # volume = umap_embeddings_volume(datas_embeddings)
    #     print(f"{setting}'s coverage is:", volume)

Inference Embeddings: 100%|██████████| 54/54 [00:22<00:00,  2.41it/s]
  6%|▋         | 1/16 [00:23<05:55, 23.69s/it]

iid2niid_code_4000's coverage is: 0.9767145788070623


Inference Embeddings: 100%|██████████| 41/41 [00:15<00:00,  2.73it/s]
 12%|█▎        | 2/16 [00:45<05:12, 22.34s/it]

iid2niid_code_3000's coverage is: 0.9647033770664769


Inference Embeddings: 100%|██████████| 28/28 [00:09<00:00,  2.95it/s]
 19%|█▉        | 3/16 [00:55<03:38, 16.82s/it]

iid2niid_code_2000's coverage is: 0.9443467654330236


Inference Embeddings: 100%|██████████| 15/15 [00:04<00:00,  3.46it/s]
 25%|██▌       | 4/16 [01:00<02:25, 12.13s/it]

iid2niid_code_1000's coverage is: 0.9083204763822296


Inference Embeddings: 100%|██████████| 54/54 [00:26<00:00,  2.07it/s]
 31%|███▏      | 5/16 [01:33<03:38, 19.86s/it]

iid2niid_med_4000's coverage is: 0.8828922816043293


Inference Embeddings: 100%|██████████| 41/41 [00:17<00:00,  2.30it/s]
 38%|███▊      | 6/16 [01:54<03:22, 20.26s/it]

iid2niid_med_3000's coverage is: 0.859376495545575


Inference Embeddings: 100%|██████████| 28/28 [00:11<00:00,  2.40it/s]
 44%|████▍     | 7/16 [02:15<03:04, 20.51s/it]

iid2niid_med_2000's coverage is: 0.8287683146811957


Inference Embeddings: 100%|██████████| 15/15 [00:05<00:00,  2.61it/s]
 50%|█████     | 8/16 [02:22<02:08, 16.00s/it]

iid2niid_med_1000's coverage is: 0.7876623242158739


Inference Embeddings: 100%|██████████| 54/54 [00:16<00:00,  3.22it/s]
 56%|█████▋    | 9/16 [02:43<02:02, 17.56s/it]

iid2niid_fin_4000's coverage is: 0.9736504682696816


Inference Embeddings: 100%|██████████| 41/41 [00:12<00:00,  3.37it/s]
 62%|██████▎   | 10/16 [02:58<01:41, 16.92s/it]

iid2niid_fin_3000's coverage is: 0.9667212655655708


Inference Embeddings: 100%|██████████| 28/28 [00:08<00:00,  3.43it/s]
 69%|██████▉   | 11/16 [03:09<01:14, 14.92s/it]

iid2niid_fin_2000's coverage is: 0.956135537696035


Inference Embeddings: 100%|██████████| 15/15 [00:04<00:00,  3.37it/s]
 75%|███████▌  | 12/16 [03:14<00:48, 12.11s/it]

iid2niid_fin_1000's coverage is: 0.9344662604855937


Inference Embeddings: 100%|██████████| 54/54 [00:37<00:00,  1.43it/s]
 81%|████████▏ | 13/16 [04:04<01:10, 23.51s/it]

iid2niid_math_4000's coverage is: 0.9152439750720275


Inference Embeddings: 100%|██████████| 41/41 [00:28<00:00,  1.45it/s]
 88%|████████▊ | 14/16 [04:50<01:00, 30.40s/it]

iid2niid_math_3000's coverage is: 0.9036248090770238


Inference Embeddings: 100%|██████████| 28/28 [00:19<00:00,  1.41it/s]
 94%|█████████▍| 15/16 [05:19<00:29, 29.73s/it]

iid2niid_math_2000's coverage is: 0.887758233066301


Inference Embeddings: 100%|██████████| 15/15 [00:10<00:00,  1.45it/s]
100%|██████████| 16/16 [05:33<00:00, 20.86s/it]

iid2niid_math_1000's coverage is: 0.862585553531908





In [None]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
# settings = ["niid_0.01", "niid_0.1", "niid_1", "niid_10", "niid_med_0.01", "niid_med_0.1", "niid_med_1", "niid_med_10"]
settings = ["niid_pos_public", "iid_pos_public"]
for setting in settings:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/{setting}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    if len(datas) > 55000: datas = datas.select(random.sample(range(len(datas)), 55000))
    datas_embeddings = model.encode(datas["instruction"])
    volume = coverage(datas_embeddings, data_concated_embeddings)
    # volume = umap_embeddings_volume(datas_embeddings)
    print(f"{setting} The volume of the UMAP-reduced embeddings is:", volume)

# 降维到同一平面

In [7]:
import numpy as np
import umap
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA
import os
os.environ["OPENBLAS_NUM_THREADS"] = "64"

def umap_embeddings_volume(embeddings, umap_params=None):
    if umap_params is None:
        umap_params = {'n_components': 2, 'metric': 'cosine'}
    
    # 初始化 UMAP 降维模型
    reducer = umap.UMAP(**umap_params)
    # 进行 UMAP 降维
    low_dim_embeddings = reducer.fit_transform(embeddings)

    # 将降维后的数据按原分割点分开
    split_indices = np.cumsum([len(e) for e in all_datas_embeddings])
    embeddings_split = np.split(low_dim_embeddings, split_indices[:-1])

    volumes = []
    for embeddings in embeddings_split:
        # 计算凸包
        hull = ConvexHull(embeddings)
        volumes.append(hull.volume)
    
    for alpha, volume in zip(alpha_values, volumes):
        print(f"The volume of the UMAP-reduced embeddings for alpha={alpha} is: {volume}")

root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
alpha_values = [0.01, 0.1, 1, 10]
all_datas_embeddings = []

import random
for alpha in alpha_values:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/niid_{alpha}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    datas = datas.select(random.sample(range(len(datas)), 5000))
    datas_embeddings = model.encode(datas["instruction"])
    all_datas_embeddings.append(datas_embeddings)

# 将所有嵌入向量合并
all_embeddings = np.vstack(all_datas_embeddings)

umap_embeddings_volume(all_embeddings)

Inference Embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.37it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.99it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.32it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00,  3.33it/s]


The volume of the UMAP-reduced embeddings for alpha=0.01 is: 420.6180865435744
The volume of the UMAP-reduced embeddings for alpha=0.1 is: 420.784624893828
The volume of the UMAP-reduced embeddings for alpha=1 is: 420.7824662411095
The volume of the UMAP-reduced embeddings for alpha=10 is: 420.4096929098317


## tsne

In [11]:
import numpy as np
# from sklearn.manifold import TSNE
from scipy.spatial import ConvexHull
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import os
import random
from cuml.manifold import TSNE

os.environ["OPENBLAS_NUM_THREADS"] = "64"

def tsne_embeddings_volume(embeddings, split_indices):
    # 初始化 t-SNE 降维模型
    resolver = TSNE(n_components=2, metric='cosine', perplexity=50, n_neighbors=300)
    print(resolver.__dict__)
    # 进行 t-SNE 降维
    low_dim_embeddings = resolver.fit_transform(embeddings)

    # 将降维后的数据按原分割点分开
    embeddings_split = np.split(low_dim_embeddings, split_indices[:-1])

    volumes = []
    for embeddings in embeddings_split:
        # 计算凸包
        hull = ConvexHull(embeddings)
        volumes.append(hull.volume)
    
    return volumes

root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
alpha_values = [0.01, 0.1, 1, 10]
all_datas_embeddings = []

for alpha in alpha_values:
    datas = []
    for i in range(10):
        datas.append(load_from_disk(f"{root}/niid_{alpha}_{i}.parquet"))
    datas = concatenate_datasets(datas)
    datas = datas.select(random.sample(range(len(datas)), 5000))
    datas_embeddings = model.encode(datas["instruction"])
    all_datas_embeddings.append(datas_embeddings)

# 将所有嵌入向量合并
all_embeddings = np.vstack(all_datas_embeddings)

# 数据标准化
scaler = StandardScaler()
all_embeddings = scaler.fit_transform(all_embeddings)

# 计算分割索引
split_indices = np.cumsum([len(e) for e in all_datas_embeddings])

volumes = tsne_embeddings_volume(all_embeddings, split_indices)

# 打印不同 niid程度数据集的凸包体积
for alpha, volume in zip(alpha_values, volumes):
    print(f"The volume of the t-SNE-reduced embeddings for alpha={alpha} is: {volume}")

Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00,  3.67it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.31it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:00<00:00,  3.52it/s]
Inference Embeddings: 100%|██████████| 3/3 [00:01<00:00,  2.83it/s]
  return func(**kwargs)


{'handle': <pylibraft.common.handle.Handle object at 0x7fe060f2e180>, 'verbose': 4, 'output_type': 'input', 'output_mem_type': <MemoryType.device: 1>, '_input_type': None, '_input_mem_type': None, 'target_dtype': None, 'n_features_in_': None, 'n_components': 2, 'perplexity': 50, 'early_exaggeration': 12.0, 'late_exaggeration': 1.0, 'learning_rate': 200.0, 'n_iter': 1000, 'n_iter_without_progress': 300, 'min_grad_norm': 1e-07, 'metric': 'cosine', 'metric_params': None, 'init': 'random', 'random_state': None, 'method': 'fft', 'angle': 0.5, 'n_neighbors': 300, 'perplexity_max_iter': 100, 'exaggeration_iter': 250, 'pre_momentum': 0.5, 'post_momentum': 0.8, 'learning_rate_method': 'adaptive', 'epssq': 0.0025, 'perplexity_tol': 1e-05, 'min_gain': 0.01, 'pre_learning_rate': 200.0, 'post_learning_rate': 400.0, 'square_distances': True, 'X_m': CumlArrayDescriptorMeta(input_type=None, values={None: None}), 'embedding_': CumlArrayDescriptorMeta(input_type=None, values={None: None}), 'sparse_fit':

## iid 的广度

In [11]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
for i in range(10):
    datas.append(load_from_disk(f"{root}/niid_anchor_nodup_public_{i}.parquet"))
datas = concatenate_datasets(datas)
print(len(datas))
datas_embeddings = model.encode(datas["instruction"])

51000


Inference Embeddings: 100%|██████████| 25/25 [00:13<00:00,  1.87it/s]


In [12]:
# 调用函数计算 UMAP 降维后凸包的体积
volume = umap_embeddings_volume(datas_embeddings)
print("The volume of the UMAP-reduced embeddings is:", volume)

The volume of the UMAP-reduced embeddings is: 1171.5674569446574


# prototype based niid 的广度

In [21]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
for i in range(10):
    datas.append(load_from_disk(f"{root}/niid_prototype_public_{i}.parquet"))
datas = concatenate_datasets(datas)
print(len(datas))
datas_embeddings = model.encode(datas["instruction"])
# 调用函数计算 UMAP 降维后凸包的体积
volume = umap_embeddings_volume(datas_embeddings)
print("The volume of the UMAP-reduced embeddings is:", volume)

51000


Inference Embeddings: 100%|██████████| 25/25 [00:18<00:00,  1.38it/s]


The volume of the UMAP-reduced embeddings is: 604.2912879650961


# anchor

In [27]:
root = "/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/"
datas = []
for i in range(10):
    datas.append(load_from_disk(f"{root}/niid_anchor_public_{i}.parquet"))
datas = concatenate_datasets(datas)
print(len(datas))
datas_embeddings = model.encode(datas["instruction"])
# 调用函数计算 UMAP 降维后凸包的体积
volume = umap_embeddings_volume(datas_embeddings)
print("The volume of the UMAP-reduced embeddings is:", volume)

51000


Inference Embeddings: 100%|██████████| 25/25 [01:30<00:00,  3.64s/it]


The volume of the UMAP-reduced embeddings is: 1882.2693424905829
