In [1]:
from datasets import load_dataset, concatenate_datasets, load_from_disk
import pandas as pd
import datasets
from datasets import Dataset
from pprint import pprint as pp
from datasets import Dataset
from sklearn.cluster import KMeans
from tqdm import tqdm
import torch
import heapq
from functools import partial
import os

In [2]:
code_data = load_dataset("sahil2801/CodeAlpaca-20k")["train"]
fin_data = load_dataset("FinGPT/fingpt-sentiment-train")["train"]
med_data = load_dataset("medalpaca/medical_meadow_medical_flashcards")["train"]
general_data = load_dataset("tatsu-lab/alpaca")["train"]
math_data = load_dataset("TIGER-Lab/MathInstruct")["train"]

In [4]:
def alpaca_format(example):
    if example['input'] == "":
        example["instruction"] = example["instruction"]
    else:
        example["instruction"] = example["instruction"] + " " + example['input']
    example["response"] = example['output']
    return example
    
def labeling(example, label):
    example["label"] = label
    return example

In [5]:
def process_sft_dataset(dataset_name, dataset, dataset_sample=None)->datasets.Dataset:
    if dataset_name in ["lucasmccabe-lmi/CodeAlpaca-20k", "yahma/alpaca-cleaned", "FinGPT/fingpt-sentiment-train"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output'], desc=f"Preprocessing {dataset_name} for unified format.")
    elif dataset_name in ["WizardLM/WizardLM_evol_instruct_70k"]:
        dataset = dataset.rename_column("output", "response")
    elif dataset_name in ["tatsu-lab/alpaca", "vicgalle/alpaca-gpt4", "gbharti/finance-alpaca"]:
        dataset = dataset.map(alpaca_format, remove_columns=['input', 'output', 'text'], desc=f"Preprocessing {dataset_name} for unified format.")
    elif dataset_name in ["TIGER-Lab/MathInstruct"]:
        df = pd.DataFrame(dataset)
        df = df.drop_duplicates(subset=['instruction'])
        dataset = datasets.Dataset.from_pandas(df)
        dataset = dataset.shuffle(seed=42).select(range(51000))
        dataset = dataset.rename_column("output", "response")
        dataset = dataset.remove_columns(['source'])
    elif dataset_name in ["lighteval/MATH"]:
        dataset = dataset.rename_column("solution", "response")
        dataset = dataset.rename_column("problem", "instruction")
        dataset = dataset.remove_columns(['level', 'type'])
    elif dataset_name in ['gsm8k']:
        dataset = dataset.rename_column("question", "instruction")
        dataset = dataset.rename_column("answer", "response")
    elif dataset_name in ['medalpaca/medical_meadow_medical_flashcards']:       # TODO: 'lavita/ChatDoctor-HealthCareMagic-100k'. not sure whether to discard the instruction.
        dataset = dataset.remove_columns(['instruction'])
        dataset = dataset.rename_column("input", "instruction")
        dataset = dataset.rename_column("output", "response")
    elif "math" in dataset_name:
        dataset = dataset.remove_columns(['source'])
        dataset = dataset.rename_column("output", "response")
    else:
        raise NotImplementedError(f"Dataset {dataset_name} is not supported.")
    dataset = dataset.shuffle(seed=42)
    if dataset_sample:
        num_sample = min(len(dataset), dataset_sample)
        dataset = dataset.select(range(num_sample))
    print(f">> ===== After processing, Dataset {dataset_name} has {len(dataset)} examples. =====")
    return dataset

In [6]:
processed_data = []
# 这里前后都要改
for name, dataset in zip(["lucasmccabe-lmi/CodeAlpaca-20k", "TIGER-Lab/MathInstruct","FinGPT/fingpt-sentiment-train", "medalpaca/medical_meadow_medical_flashcards","tatsu-lab/alpaca"],[code_data, math_data,fin_data,med_data,general_data]): # 这里前后都要改
    tmp:datasets.Dataset = process_sft_dataset(name,dataset)
    if "fin" in name: 
        tmp = tmp.shuffle(seed=42).select(range(51000))
    print(tmp.column_names)
    processed_data.append(tmp)

>> ===== After processing, Dataset lucasmccabe-lmi/CodeAlpaca-20k has 20022 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset TIGER-Lab/MathInstruct has 51000 examples. =====
['response', 'instruction', '__index_level_0__']
>> ===== After processing, Dataset FinGPT/fingpt-sentiment-train has 76772 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset medalpaca/medical_meadow_medical_flashcards has 33955 examples. =====
['instruction', 'response']
>> ===== After processing, Dataset tatsu-lab/alpaca has 52002 examples. =====
['instruction', 'response']


In [7]:
label = ["code","math","fin","med","gen",]

for i, data in enumerate(processed_data):
    data = data.map(lambda example: labeling(example, label[i]), batched=False)
    processed_data[i] = data

Map:   0%|          | 0/51000 [00:00<?, ? examples/s]

In [8]:
data_concated = concatenate_datasets(processed_data)

# 构造base数据集

In [9]:
import numpy as np
import random
random.seed(42)
i=0
sampled_indices = np.array(random.sample(range(len(processed_data[i])), 1000))
sampled_data = processed_data[i].select(sampled_indices)
sampled_set = set(sampled_indices)
base_set = set(range(len(data_concated)))
# 计算差集，即在 idx_set 中但不在 sampled_set 中的元素
remaining_idx = list(base_set - sampled_set)
print(len(remaining_idx))
data_concated = data_concated.select(remaining_idx)

206979


# 将base数据集随机拆成十份

In [13]:
sampled_data = sampled_data.shuffle()
local_datasets = []
for i in range(10):
    local_datasets.append(sampled_data.shard(10, i))

In [None]:
for i, dataset in enumerate(local_datasets):
    dataset.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/privacy_data/base_code_{i}.parquet")

In [14]:
local_datasets = []
for i in range(10):
    local_datasets.append(load_from_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/privacy_data/base_code_{i}.parquet"))

In [3]:
tmp = load_from_disk("/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/base_code_0.parquet")
print(len(tmp))

100


In [23]:
print(len(local_datasets[0]))

100


## 将公共数据集也随机拆成10份

In [11]:
data_concated = data_concated.shuffle(seed=42)
public_datasets = []
for i in range(10):
    public_datasets.append(data_concated.shard(10,i))

# 构造随机采样数据集

In [None]:
client_random_datasets = []
dataset: Dataset
for i, dataset in enumerate(public_datasets):
    random.seed(i)
    idxs = random.sample(range(len(dataset)), 5000)
    client_random_datasets.append(dataset.select(idxs))
print(len(client_random_datasets[0]))

# 公共数据集直接随机采样

In [11]:
client_random_datasets = []
for i in range(10):
    idxs = random.sample(range(len(data_concated)), 5000)
    client_random_datasets.append(data_concated.select(idxs))

In [12]:
for i, dataset in enumerate(client_random_datasets):
    # dataset = concatenate_datasets([dataset,local_datasets[i]]).shuffle(seed=42)
    dataset.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/random_math_{i}.parquet")

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

# 对每一个客户端的数据集进行检索，构造 pos 和 neg 数据集

In [18]:
from FlagEmbedding import FlagModel
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"
model = FlagModel('BAAI/bge-large-en-v1.5', 
                  query_instruction_for_retrieval="",
                  use_fp16=True,
                  )

cls
----------using 2*GPUs----------


In [10]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', model_kwargs={"torch_dtype":torch.bfloat16})



# 非去重pos 检索

In [15]:
client_pos_datasets, client_neg_datasets = [], []
k = 10
retrieve_num = 100

for i, sampled_data in enumerate(local_datasets):
    print(i)
    sampled_embeddings = model.encode(sampled_data["instruction"])
    # 假设 embeddings 是你的嵌入数据
    kmeans = KMeans(n_clusters=k, random_state=0).fit(sampled_embeddings)
    concated_embeddings = model.encode(public_datasets[i]["instruction"])
    clusters = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    concated_embeddings = torch.tensor(concated_embeddings, dtype=torch.float32)
    similarity_scores = clusters @ concated_embeddings.T
    top_idxs = []
    bot_idxs = []
    for j in range(similarity_scores.shape[0]):
        tmp = similarity_scores[j]
        top_idxs.append(heapq.nlargest(retrieve_num, range(len(tmp)-1), key=lambda x: tmp[x]))
        bot_idxs.append(heapq.nsmallest(retrieve_num, range(len(tmp)-1), key=lambda x: tmp[x]))
        
    pos_datasets: Dataset = []
    neg_datasets: Dataset = []
    top_idxs=np.concatenate(top_idxs,axis=None)
    bot_idxs=np.concatenate(bot_idxs,axis=None)
    pos_datasets = public_datasets[i].select(top_idxs)
    neg_datasets = public_datasets[i].select(bot_idxs)
    pos_datasets = concatenate_datasets([pos_datasets, sampled_data])
    neg_datasets = concatenate_datasets([neg_datasets, sampled_data])
    pos_datasets = pos_datasets.shuffle()
    neg_datasets = neg_datasets.shuffle()
    client_pos_datasets.append(pos_datasets)
    client_neg_datasets.append(neg_datasets)

0


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


1
2
3
4
5
6
7
8
9


In [16]:
for i, (pos_data, neg_data) in enumerate(zip(client_pos_datasets, client_neg_datasets)):
    pos_data.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/privacy_data/pos_code_1000_{i}.parquet")
    # neg_data.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/neg_math_{i}.parquet")

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1100 [00:00<?, ? examples/s]

# 去重pos检索

In [None]:
client_pos_datasets, client_neg_datasets = [], []
k = 10
for i, sampled_data in enumerate(local_datasets):
    sampled_embeddings = model.encode(sampled_data["instruction"])
    # 假设 embeddings 是你的嵌入数据
    kmeans = KMeans(n_clusters=k, random_state=0).fit(sampled_embeddings)
    concated_embeddings = model.encode(public_datasets[i]["instruction"])
    clusters = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    concated_embeddings = torch.tensor(concated_embeddings, dtype=torch.float32)
    similarity_scores = clusters @ concated_embeddings.T
    already_retrieved = set()
    top_idxs = []
    bot_idxs = []
    # 遍历每一个 cluster center 的相似度分数
    for j in range(similarity_scores.shape[0]):
        tmp = similarity_scores[j]
        # 过滤掉已经检索过的索引
        filtered_scores = [(score, idx) for idx, score in enumerate(tmp) if idx not in already_retrieved]
        # 将分数和索引分开，分别进行排序
        filtered_scores.sort(key=lambda x: x[0], reverse=True)
        # 提取前 500 个最高的和最低的索引
        top_500 = filtered_scores[:500]
        bot_500 = filtered_scores[-500:]
        # 从剩余的分数中提取索引并更新已检索集合
        top_indices = [idx for _, idx in top_500]
        bot_indices = [idx for _, idx in bot_500]
        top_idxs.extend(top_indices)
        bot_idxs.extend(bot_indices)
    
    pos_datasets: Dataset = []
    neg_datasets: Dataset = []
    top_idxs=np.concatenate(top_idxs,axis=None)
    bot_idxs=np.concatenate(bot_idxs,axis=None)
    pos_datasets = public_datasets[i].select(top_idxs)
    neg_datasets = public_datasets[i].select(bot_idxs)
    pos_datasets = concatenate_datasets([pos_datasets, sampled_data])
    neg_datasets = concatenate_datasets([neg_datasets, sampled_data])
    pos_datasets = pos_datasets.shuffle(seed=42)
    neg_datasets = neg_datasets.shuffle(seed=42)
    client_pos_datasets.append(pos_datasets)
    client_neg_datasets.append(neg_datasets)

# 构造pos+diversity 数据集，一半 pos，一半 diversity

In [None]:
from datasets import Dataset
import torch
import heapq
from tqdm import tqdm
from sklearn.cluster import KMeans

client_pos_datasets=[]
for i, sampled_data in enumerate(local_datasets):
    sampled_embeddings = model.encode(sampled_data["instruction"])
    # 假设 embeddings 是你的嵌入数据
    k = 10
    kmeans = KMeans(n_clusters=k, random_state=0).fit(sampled_embeddings)
    concated_embeddings = model.encode(public_datasets[i]["instruction"])
    clusters = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    concated_embeddings = torch.tensor(concated_embeddings, dtype=torch.float32)
    similarity_scores = clusters @ concated_embeddings.T
    top_idxs = []
    for i in range(similarity_scores.shape[0]):
        tmp = similarity_scores[i]
        top_idxs.append(heapq.nlargest(250, range(len(tmp)), key=tmp.__getitem__))
    pos_datasets: Dataset = []
    # top_idxs去重，其余作为 diversity
    top_idxs = set(np.concatenate(top_idxs,axis=0))
    try: top_idxs.remove(len(public_datasets[i]))
    except: pass
    pos_datasets = public_datasets[i].select(top_idxs)
    print(len(top_idxs))
    # 从public_datasets[i]中去掉 top_idxs
    all_idxs = set(range(len(public_datasets[i])))
    remain_idxs = list(all_idxs-top_idxs)
    random_idxs = random.sample(remain_idxs, 5000-len(top_idxs))
    diversity_datasets = public_datasets[i].select(random_idxs)
    pos_datasets = concatenate_datasets([pos_datasets, diversity_datasets, sampled_data])
    pos_datasets = pos_datasets.shuffle(seed=42)
    client_pos_datasets.append(pos_datasets)

In [None]:
for i, pos_data in enumerate(client_pos_datasets):
    pos_data.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/T_{i}.parquet")

# 构造去重 pos 数据集

In [31]:
from ordered_set import OrderedSet
tmp = OrderedSet([4,5,3,7,1])
tmp1 = OrderedSet([4,5])
for t in tmp1: tmp.discard(t)
print(tmp)

OrderedSet([3, 7, 1])


In [None]:
from datasets import Dataset
import torch
import heapq
from tqdm import tqdm
from sklearn.cluster import KMeans
from ordered_set import OrderedSet

client_pos_datasets=[]
for i, sampled_data in enumerate(local_datasets):
    sampled_embeddings = model.encode(sampled_data["instruction"])
    # 假设 embeddings 是你的嵌入数据
    k = 10
    kmeans = KMeans(n_clusters=k, random_state=0).fit(sampled_embeddings)
    concated_embeddings = model.encode(public_datasets[i]["instruction"])
    clusters = torch.tensor(kmeans.cluster_centers_, dtype=torch.float32)
    concated_embeddings = torch.tensor(concated_embeddings, dtype=torch.float32)
    top_idxs:OrderedSet = OrderedSet()
    remain_idxs = OrderedSet(range(len(public_datasets[i])))
    for j in range(k):
        similarity_scores = clusters[j] @ concated_embeddings.T
        top_idx = list(OrderedSet(heapq.nlargest(5000, range(len(similarity_scores)-1), key=lambda x: similarity_scores[x]))-top_idxs)[:500]
        top_idxs.update(top_idx)
        print("top_idxs", len(top_idxs))
        remain_idxs.difference_update(top_idx)
        print("remain_idxs", len(remain_idxs))

    pos_datasets = public_datasets[i].select(list(top_idxs))
    pos_datasets = concatenate_datasets([pos_datasets, sampled_data])
    pos_datasets = pos_datasets.shuffle(seed=42)
    client_pos_datasets.append(pos_datasets)

In [49]:
for i, pos_data in enumerate(client_pos_datasets):
    pos_data.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/pos_nodup_{i}.parquet")

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5100 [00:00<?, ? examples/s]

# 构造 code only fed 训练

In [9]:
code_tmp:Dataset = processed_data[3]
client_pos_datasets = []

In [12]:
for i in range(10):
    client_pos_datasets.append(code_tmp.shuffle(seed=i).select(range(5000)))

In [13]:
for i, (pos_data) in enumerate(client_pos_datasets):
    pos_data.save_to_disk(f"/mnt/bn/data-tns-live-llm/leon/datasets/fed_data/fed_code_only_{i}.parquet")

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/5000 [00:00<?, ? examples/s]