# Group all different queries

In [6]:

from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import re
import json
import pandas as pd

# Desensitization helper
def desensitize_query(query):
    """Mask demographic descriptors in a query string.

    Two case-insensitive substitutions are applied in order:

    1. Each whole-word occurrence of Black, Asian, White, Hispanic,
       intersex, male, female, woman or man becomes 'individual'.
    2. Each whole-word occurrence of "<digits> year old" (single spaces
       only) becomes 'adult'.

    Args:
        query: The text to mask.

    Returns:
        The masked text with leading and trailing whitespace removed.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\b(Black|Asian|White|Hispanic|intersex|male|female|woman|man)\b', 'individual'),
        (r'\b(\d+ year old)\b', 'adult'),
    )
    masked = query
    for pattern, replacement in substitutions:
        masked = re.sub(pattern, replacement, masked, flags=re.IGNORECASE)
    return masked.strip()

# Names passed as the second argument to load_dataset for katielink/EquityMedQA
sub_list = [
    "cc_llm", "cc_manual", "ehai", "fbrt_llm", "trinds", "omiye_et_al",
    "fbrt_llm_661_sampled", "fbrt_manual", "multimedqa", "mixed_mmqa_omaq", "omaq",
]

all_queries = []

# Extract the query text from the "train" split of each subset
for sub in sub_list:
    ds = load_dataset("katielink/EquityMedQA", sub)
    for row in ds["train"]:
        # A bare-string row is taken as one query; a dict row contributes
        # each of its string-valued fields; anything else is ignored.
        if isinstance(row, str):
            texts = [row]
        elif isinstance(row, dict):
            texts = [field for field in row.values() if isinstance(field, str)]
        else:
            texts = []
        all_queries.extend(desensitize_query(text) for text in texts)

print("length of  all_queries", len(all_queries))
# Query length is measured as the number of pieces after splitting on single spaces.
sum_ = sum(map(lambda q: len(q.split(" ")), all_queries))
print("average length of all_queries", int(sum_ / len(all_queries)))


length of  all_queries 6949
average length of all_queries 16


# Data instruction MedMCQA

In [11]:

from datasets import load_dataset
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import re
import json
import pandas as pd

def desensitize_query(query):
    """Mask demographic descriptors in a query string.

    Two case-insensitive substitutions are applied in order:

    1. Each whole-word occurrence of Black, Asian, White, Hispanic,
       intersex, male, female, woman or man becomes 'individual'.
    2. Each whole-word occurrence of "<digits> year old" (single spaces
       only) becomes 'adult'.

    Args:
        query: The text to mask.

    Returns:
        The masked text with leading and trailing whitespace removed.
    """
    # (pattern, replacement) pairs, applied top to bottom.
    substitutions = (
        (r'\b(Black|Asian|White|Hispanic|intersex|male|female|woman|man)\b', 'individual'),
        (r'\b(\d+ year old)\b', 'adult'),
    )
    masked = query
    for pattern, replacement in substitutions:
        masked = re.sub(pattern, replacement, masked, flags=re.IGNORECASE)
    return masked.strip()

# Names passed as the second argument to load_dataset for katielink/EquityMedQA.
sub_list = ["cc_llm", "cc_manual", "ehai", "fbrt_llm", "trinds", "omiye_et_al", "fbrt_llm_661_sampled",
            "fbrt_manual", "multimedqa", "mixed_mmqa_omaq", "omaq"]

all_queries = []

# Collect the desensitized text of every string found in each subset's "train" split.
for sub in sub_list:
    ds = load_dataset("katielink/EquityMedQA", sub)
    for row in ds["train"]:
        if isinstance(row, str):
            all_queries.append(desensitize_query(row))
        elif isinstance(row, dict):
            # Every string-valued field of the row is kept as its own query.
            for value in row.values():
                if isinstance(value, str):
                    all_queries.append(desensitize_query(value))
print("length of  all_queries", len(all_queries))
# Fixed: iterate over the queries themselves (range() of a list raises TypeError)
# and divide by the number of queries, not by the list object.
sum_ = sum(len(q.split(" ")) for q in all_queries)
print("average length of all_queries", int(sum_ / len(all_queries)))


# Embed every query and compute the full pairwise cosine-similarity matrix.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(all_queries)

similarity_matrix = cosine_similarity(embeddings)

# Greedy grouping: each not-yet-grouped query seeds a group and absorbs every
# later ungrouped query whose similarity to the seed exceeds the threshold.
threshold = 0.7
groups = []
visited = set()

n_queries = len(all_queries)
for i, query in enumerate(all_queries):
    if i in visited:
        continue
    # A dict serves as an insertion-ordered set: duplicate query strings are
    # still collapsed, but member order is now the same on every run.
    group = {query: None}
    visited.add(i)
    # Every index <= i is already in `visited` at this point, so scanning
    # from i + 1 gives the same groups as scanning the whole range.
    for j in range(i + 1, n_queries):
        if j not in visited and similarity_matrix[i, j] > threshold:
            group[all_queries[j]] = None
            visited.add(j)
    groups.append(list(group))


# Persist the groups as a JSON list of lists.
output_file = "query_groups.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(groups, f, indent=4, ensure_ascii=False)

# Flatten to one CSV row per (group, query); group IDs start at 1.
csv_output = [
    {"Group ID": idx, "Query": query}
    for idx, group in enumerate(groups, 1)
    for query in group
]

df = pd.DataFrame(csv_output)
df.to_csv("query_groups.csv", index=False, encoding="utf-8")



分组结果已保存到文件：query_groups.json 和 query_groups.csv
