In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import spearmanr

# Load the MTEB leaderboard export (the file is assumed to be named
# mteb_rankings.csv and to sit in the notebook's working directory).
mteb_df = pd.read_csv(filepath_or_buffer="mteb_rankings.csv")

# Sticky-token counts transcribed by hand from the tables in the paper.
# Each entry is (model name, value of the paper's "Validated" column).
# Keeping the name and the count on the same line guarantees the two columns
# always have the same length; append further models here, one pair per line.
# NOTE: do not leave a literal `...` placeholder in this list -- in Python it
# is the Ellipsis object, which becomes a real (non-numeric) element and makes
# the DataFrame constructor fail or the column unusable for statistics.
sticky_token_counts = [
    ("all-MiniLM-L6-v2", 21),
    ("all-mpnet-base-v2", 24),
    ("sup-simcse-bert-base-uncased", 22),
    ("e5-mistral-7b-instruct", 31),
    ("bge-large-en-v1.5", 15),
    ("gte-Qwen2-7B-instruct", 103),
]

# Column-oriented view with the same keys as before ("Model", "Sticky_Tokens").
sticky_tokens_data = {
    "Model": [model_name for model_name, _ in sticky_token_counts],
    "Sticky_Tokens": [token_count for _, token_count in sticky_token_counts],
}
sticky_df = pd.DataFrame(sticky_tokens_data)

# Join the leaderboard with the sticky-token counts on the model name.
# An inner join keeps only the models that appear in both tables.
merged_df = mteb_df.merge(sticky_df, how="inner", on="Model")

In [None]:
# Spearman rank correlation between the sticky-token count and the MTEB mean
# task score (rank-based, so suitable for data that is not normally distributed).
corr, p_value = spearmanr(
    a=merged_df["Sticky_Tokens"],
    b=merged_df["Mean (Task)"],
)
print(f"Spearman Correlation: {corr:.3f}, p-value: {p_value:.3g}")

# Example of the printed output format:
# Spearman Correlation: -0.452, p-value: 0.0037

In [None]:
# Scatter plot with a fitted regression line: sticky-token count (x) against
# the MTEB mean task score (y), drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    x="Sticky_Tokens",
    y="Mean (Task)",
    data=merged_df,
    scatter_kws={"s": 100, "alpha": 0.6},  # large, semi-transparent markers
    line_kws={"color": "red"},             # regression line in red
    ax=ax,
)
ax.set(
    title="Sticky Tokens vs. Model Performance (MTEB Mean Score)",
    xlabel="Number of Sticky Tokens",
    ylabel="MTEB Mean Score",
)
ax.grid(True)
plt.show()