In [1]:
# %load_ext gradio

In [3]:
import os
# Expose only GPU 0 to this process. The variable is set before `import torch`
# below so it is already in the environment when torch is loaded.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
import sys
# Make the StickyToken checkout importable for the `stickytoken.*` imports below.
sys.path.append('/root/StickyToken')
import torch
import gradio as gr
import json
import pandas as pd
import random
import numpy as np
from sentence_transformers import SentenceTransformer
from datasets import load_dataset
from stickytoken.utils import load_verification_results
from collections import namedtuple
# Tuple of three distance values.
# NOTE(review): the namedtuple typename is "Metrics" while the variable is
# `DistanceMetrics` — confirm which name is intended.
DistanceMetrics = namedtuple("Metrics", ["cosine_distance", "euclidean_distance", "manhattan_distance"])
from stickytoken.utils import distance_metrics, random_insert
from stickytoken.sentence_pair import output_dataset_name

In [4]:
# Load the experiment records from the JSON file at `path` into a DataFrame and
# derive the list of model names from its "model_name" column.
path = "/root/StickyToken/results/experiment_information.json"
try:
    with open(path, 'r', encoding='utf-8') as f:
        experiement_record = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Best-effort fallback: a missing or malformed file yields an empty record list.
    experiement_record = []
experiement_record_df = pd.DataFrame(experiement_record)
# An empty record list gives a frame without any columns. Fall back to an empty
# model list so the fallback above does not end in a KeyError on "model_name".
if "model_name" in experiement_record_df.columns:
    model_list = experiement_record_df["model_name"].tolist()
else:
    model_list = []
experiement_record_df.head(3)


Unnamed: 0,record_time,model_name,vocab_size,num_parameters,dataset,insert_num,model,sent_pair_num,verification_sent_pair_num,ok_tokens_num,...,caculate_vocab_token_magic_score_time,final_verification_time,vocab_embeddings_is_on_unit_sphere,wte_is_on_unit_sphere,vocab_embeddings_is_anisotropic,wte_is_anisotropic,vocab_embeddings_mean_cosine_similarity,candidates_for_verification_percentile,candidates_for_verification_threshold,candidates_for_verification_num
0,2024-10-27 16:08:44,sentence-t5-base,32100,110218368,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/sentence-t5-base,5,250,32097,...,7564.841289,7328.973902,True,False,True,True,0.795921,2.0,56.085167,642
1,2024-10-28 08:00:50,sentence-t5-large,32100,335726080,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/sentence-t5-large,5,250,32097,...,18250.417344,18864.12946,True,False,True,True,0.763351,2.0,49.076265,642
2,2024-10-28 12:59:15,all-MiniLM-L6-v2,30522,22713216,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/all-MiniLM-L6-v2,5,250,23699,...,5054.268309,4728.382494,True,False,True,True,0.199846,2.0,49.204199,474


In [5]:
# Display the model names taken from the experiment record table.
model_list

['sentence-t5-base',
 'sentence-t5-large',
 'all-MiniLM-L6-v2',
 'bge-base-en-v1.5',
 'bge-small-en-v1.5',
 'all-mpnet-base-v2',
 'sentence-t5-xl',
 'e5-small',
 'e5-base',
 'e5-large',
 'bge-large-en-v1.5',
 'gtr-t5-base',
 'gte-base',
 'gte-small',
 'gte-base-en-v1.5',
 'gtr-t5-large',
 'gte-large',
 'gte-large-en-v1.5',
 'gtr-t5-xl',
 'GritLM-7B',
 'sentence-t5-xxl',
 'sup-simcse-bert-base-uncased',
 'sup-simcse-bert-large-uncased',
 'sup-simcse-roberta-base',
 'sup-simcse-roberta-large',
 'gtr-t5-xxl',
 'e5-mistral-7b-instruct',
 'nomic-embed-text-v1.5',
 'nomic-embed-text-v1',
 'SFR-Embedding-Mistral',
 'SFR-Embedding-2_R',
 'instructor-base',
 'gte-Qwen2-7B-instruct',
 'gte-Qwen2-1.5B-instruct',
 'UAE-Large-V1',
 'instructor-xl',
 'instructor-large']

In [6]:
# For every model, load its verification results, keep the rows whose
# 'verification' entry is populated, tag them with the model name, and stack
# everything into one frame.
per_model_frames = []

for model_name in model_list:
    print(model_name)
    verification_results = load_verification_results(model_name)
    verification_results_df = pd.DataFrame(verification_results).transpose()
    # Keep only the rows with a non-null 'verification' value and add a 'model'
    # column holding model_name (`assign` returns a new frame).
    verifications_df = (
        verification_results_df[verification_results_df['verification'].notnull()]
        .assign(model=model_name)
    )
    per_model_frames.append(verifications_df)

# Concatenate once after the loop instead of re-copying the accumulated frame on
# every iteration. An empty model list still yields an empty DataFrame.
all_verifications_df = (
    pd.concat(per_model_frames, ignore_index=True) if per_model_frames else pd.DataFrame()
)


sentence-t5-base
sentence-t5-large
all-MiniLM-L6-v2
bge-base-en-v1.5
bge-small-en-v1.5
all-mpnet-base-v2
sentence-t5-xl
e5-small
e5-base
e5-large
bge-large-en-v1.5
gtr-t5-base
gte-base
gte-small
gte-base-en-v1.5
gtr-t5-large
gte-large
gte-large-en-v1.5
gtr-t5-xl
GritLM-7B
sentence-t5-xxl
sup-simcse-bert-base-uncased
sup-simcse-bert-large-uncased
sup-simcse-roberta-base
sup-simcse-roberta-large
gtr-t5-xxl
e5-mistral-7b-instruct
nomic-embed-text-v1.5
nomic-embed-text-v1
SFR-Embedding-Mistral
SFR-Embedding-2_R
instructor-base
gte-Qwen2-7B-instruct
gte-Qwen2-1.5B-instruct
UAE-Large-V1
instructor-xl
instructor-large


In [7]:
# Write the combined verification rows to a CSV in the current working directory
# (the frame's index is not written).
all_verifications_df.to_csv("all_model_verification_values.csv", index=False)

In [8]:
# Preview the first three combined verification rows.
all_verifications_df.head(3)

Unnamed: 0,i,raw_vocab,category,decoded,metrics,main_metric,metric_names,main_metric_name,verification,max_prob,magic,reencoded_ids,reencoded,model
0,1,</s>,OK_SPECIAL,</s>,"[97.830365, 106.041251, 137.680227]",97.830365,,,"[1.0000000000000122, 0.9985454545454666, 0.998...",1.0,strong_verified,,,sentence-t5-base
1,317,▁think,OK,think,"[56.713909, 59.961645, 62.589155]",56.713909,,,"[0.8920000000000091, 0.8894545454545545, 0.878...",0.892,strong_verified,,,sentence-t5-base
2,344,▁between,OK,between,"[56.294466, 59.531261, 63.816119]",56.294466,,,"[0.8803636363636451, 0.8789090909090996, 0.858...",0.880364,strong_verified,,,sentence-t5-base


In [9]:
#all_verifications_df只保留model列和max_prob列,并且max_prob列的值转化为float类型
# Reduce all_verifications_df to the 'model' and 'max_prob' columns, with
# max_prob cast to float. Selecting and casting in a single expression builds a
# new frame, so nothing is assigned into a slice of the previous frame (the
# pattern that raises SettingWithCopyWarning).
all_verifications_df = all_verifications_df[['model', 'max_prob']].astype({'max_prob': float})

In [10]:
# Preview the reduced (model, max_prob) frame.
all_verifications_df.head(3)

Unnamed: 0,model,max_prob
0,sentence-t5-base,1.0
1,sentence-t5-base,0.892
2,sentence-t5-base,0.880364


In [None]:
# Raincloud-style figure of max_prob per model: a half violin, jittered scatter
# points and a horizontal box plot are drawn on the same axes.
import ptitprince as pt
import matplotlib.pyplot as plt
import numpy as np

# Line styles shared by the box plot's median line, whiskers and boxes.
medianprops = {"linewidth": 1.5, "color": "#a9a9a9", "solid_capstyle": "butt"}
boxprops = {"linewidth": 1.5, "color": "#a9a9a9"}

# Models to plot, in model_list order.
MODELS = model_list
# One colour per model: the same 30 colours are written out twice (60 entries)
# and the list is truncated to len(MODELS), so colours repeat from the 31st
# model onwards.
COLORS = [
    "#DE7833", "#912C2C", "#F2BB6B", "#C2ABC8", "#329845", "#AED185", 
    "#276C9E", "#A3C9D5", "#FF5733", "#33FF57", "#3357FF", "#FF33A1", 
    "#33FFA1", "#A133FF", "#FF8C33", "#33FF8C", "#8C33FF", "#FF338C",
    "#FFB533", "#33FFB5", "#B533FF", "#FF33B5", "#B5FF33", "#33B5FF",
    "#FF6633", "#33FF66", "#6633FF", "#FF3366", "#66FF33", "#3366FF",
        "#DE7833", "#912C2C", "#F2BB6B", "#C2ABC8", "#329845", "#AED185", 
    "#276C9E", "#A3C9D5", "#FF5733", "#33FF57", "#3357FF", "#FF33A1", 
    "#33FFA1", "#A133FF", "#FF8C33", "#33FF8C", "#8C33FF", "#FF338C",
    "#FFB533", "#33FFB5", "#B533FF", "#FF33B5", "#B5FF33", "#33B5FF",
    "#FF6633", "#33FF66", "#6633FF", "#FF3366", "#66FF33", "#3366FF"
][:len(MODELS)]

# Create the figure with a custom size.
fig, ax = plt.subplots(figsize=(16, 24))

# Draw the half violins (one per model on the y axis, max_prob on the x axis).
pt.half_violinplot(
    x='max_prob', y='model', scale='area', palette=COLORS, 
    inner=None, data=all_verifications_df, width=1, ax=ax
)

# Add the scatter "rain" for each model.
# NOTE(review): y = i + jitter assumes the violin of MODELS[i] sits at y = i,
# i.e. that half_violinplot orders the categories like MODELS — confirm.
for i, model in enumerate(MODELS):
    # Rows belonging to this model.
    data = all_verifications_df[all_verifications_df["model"] == model]
    # Jitter the points along the vertical axis by a uniform offset in [0, 0.2).
    # NOTE(review): no random seed is set in this cell, so the jitter differs
    # between runs.
    y = i + np.random.uniform(high=0.2, size=len(data))
    # Values for the horizontal axis.
    x = data["max_prob"]
    # Draw the points small and mostly transparent.
    ax.scatter(x, y, color=COLORS[i], alpha=0.1, s=15)

# One array of max_prob values per model, in MODELS order.
boxplot_data = [
    all_verifications_df[all_verifications_df["model"] == model]["max_prob"].values 
    for model in MODELS
]

# Shift each box plot by SHIFT relative to the model's integer position.
SHIFT = 0.1
POSITIONS = [i + SHIFT for i in range(len(MODELS))]

ax.boxplot(
    boxplot_data, 
    vert=False, 
    positions=POSITIONS, 
    manage_ticks=False,
    showfliers=True,  # outliers ARE drawn. NOTE(review): the previous comment said they are hidden — confirm intent
    showcaps=False,    # do not draw the caps at the whisker ends
    medianprops=medianprops,
    whiskerprops=boxprops,
    boxprops=boxprops,
    widths=0.25
)

# Axis labels.
ax.set_xlabel("Max Probability Distribution", fontsize=12)
ax.set_ylabel("Model", fontsize=12)

# Tick label size.
ax.tick_params(labelsize=13)
# fig.savefig(r'G:\juchiyun2024-11-14\ckx_ws\StickyToken\fig\max_prob_distribution.pdf', format='pdf', bbox_inches='tight')


In [13]:
# Show the first experiment record.
experiement_record_df.head(1)

Unnamed: 0,record_time,model_name,vocab_size,num_parameters,dataset,insert_num,model,sent_pair_num,verification_sent_pair_num,ok_tokens_num,...,caculate_vocab_token_magic_score_time,final_verification_time,vocab_embeddings_is_on_unit_sphere,wte_is_on_unit_sphere,vocab_embeddings_is_anisotropic,wte_is_anisotropic,vocab_embeddings_mean_cosine_similarity,candidates_for_verification_percentile,candidates_for_verification_threshold,candidates_for_verification_num
0,2024-10-27 16:08:44,sentence-t5-base,32100,110218368,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/sentence-t5-base,5,250,32097,...,7564.841289,7328.973902,True,False,True,True,0.795921,2.0,56.085167,642


In [14]:
# For every model: take the first element of each 'verification' list, compute
# its quartiles, and flag as sticky tokens the rows whose value lies above
# Q3 + UPPER_BOUND_IQR_FACTOR * IQR. The flagged rows of all models are stacked
# into all_verifications_statics_df, and the per-model statistics are written
# into experiement_record_df.
UPPER_BOUND_IQR_FACTOR = 0.7  # multiplier applied to the IQR for the upper bound

outlier_frames = []

for model_name in model_list:
    verification_results = load_verification_results(model_name)
    verification_results_df = pd.DataFrame(verification_results).transpose()
    # Keep only the rows whose 'verification' column is populated.
    verifications_df = verification_results_df[verification_results_df['verification'].notnull()].copy()
    # Median and quartiles of the first element of each 'verification' list.
    first_elements = verifications_df['verification'].apply(lambda x: x[0])
    Q1 = first_elements.quantile(0.25)
    median = first_elements.median()
    Q3 = first_elements.quantile(0.75)
    IQR = Q3 - Q1

    # Only an upper bound is used; values below the distribution are not flagged.
    upper_bound = Q3 + UPPER_BOUND_IQR_FACTOR * IQR

    # Compute the outlier mask once and reuse it for the count and the rows.
    is_outlier = first_elements > upper_bound
    outliers = first_elements[is_outlier]
    outliers_count = outliers.count()

    # Rows of verifications_df above the upper bound.
    outliers_df = verifications_df[is_outlier].copy()
    if not outliers_df.empty:
        # Tag the flagged rows with the model they belong to.
        outliers_df.loc[:, 'model'] = model_name
    outlier_frames.append(outliers_df)

    # Store the statistics on this model's row of experiement_record_df.
    experiement_record_df.loc[experiement_record_df['model_name'] == model_name, ['Q1', 'Median', 'Q3', 'Upper Bound', 'sticky token count']] = [
        Q1, median, Q3, upper_bound, outliers_count
    ]

# Concatenate once after the loop instead of re-copying the accumulated frame on
# every iteration. An empty model list still yields an empty DataFrame.
all_verifications_statics_df = (
    pd.concat(outlier_frames, ignore_index=True) if outlier_frames else pd.DataFrame()
)

all_verifications_statics_df.head(3)

Unnamed: 0,i,raw_vocab,category,decoded,metrics,main_metric,metric_names,main_metric_name,verification,max_prob,magic,reencoded_ids,reencoded,model
0,1,</s>,OK_SPECIAL,</s>,"[97.830365, 106.041251, 137.680227]",97.830365,,,"[1.0000000000000122, 0.9985454545454666, 0.998...",1.0,strong_verified,,,sentence-t5-base
1,2226,▁tip,OK,tip,"[56.475278, 59.811531, 63.237172]",56.475278,,,"[0.9418181818181923, 0.9392727272727377, 0.918...",0.941818,strong_verified,,,sentence-t5-base
2,6397,▁br,OK,br,"[61.626297, 65.118991, 69.23426]",61.626297,,,"[0.9352727272727376, 0.932727272727283, 0.9123...",0.935273,strong_verified,,,sentence-t5-base


In [15]:
# Write the flagged rows of all models to the results CSV (index not written).
# get_sticky_token_list and the token-count cell read this same file path.
all_verifications_statics_df.to_csv("../results/final_all_models_sticky_tokens.csv", index=False)

In [17]:
# Preview the record table, which at this point also holds the
# Q1 / Median / Q3 / Upper Bound / sticky token count columns.
experiement_record_df.head(3)

Unnamed: 0,model_name,record_time,vocab_size,num_parameters,dataset,insert_num,model,sent_pair_num,verification_sent_pair_num,ok_tokens_num,...,wte_is_anisotropic,vocab_embeddings_mean_cosine_similarity,candidates_for_verification_percentile,candidates_for_verification_threshold,candidates_for_verification_num,Q1,Median,Q3,Upper Bound,sticky token count
0,all-MiniLM-L6-v2,2024-10-28 12:59:15,30522,22713216,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/all-MiniLM-L6-v2,5,250,23699,...,True,0.199846,2.0,49.204199,474,0.748727,0.775273,0.796727,0.830327,21.0
1,all-mpnet-base-v2,2024-10-28 14:40:12,30527,109486464,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/sentence-transformers/all-mpnet-base-v2,5,250,23700,...,True,0.187637,2.0,22.803801,474,0.714364,0.735091,0.756364,0.785764,24.0
2,sup-simcse-bert-base-uncased,2024-11-02 12:11:26,30522,109482240,[G:/juchiyun2024-11-14/hf_dataset/mteb/sts13-s...,8,G:/hf/princeton-nlp/sup-simcse-bert-base-uncased,5,250,23699,...,True,0.586649,2.0,36.052514,474,0.687818,0.717818,0.745,0.785027,22.0


In [None]:
# Per-model summary: model name, vocabulary size and sticky-token count.
final_sticky_token_df = experiement_record_df[['model_name','vocab_size' ,'sticky token count']]
final_sticky_token_df

In [30]:
def get_sticky_token_list(model_name, csv_path="../results/final_all_models_sticky_tokens.csv"):
    """Return the sticky tokens recorded for ``model_name``, highest ``main_metric`` first.

    Args:
        model_name: Value matched against the ``model`` column of the CSV.
        csv_path: CSV file with at least the columns ``model``, ``raw_vocab``
            and ``main_metric``. Defaults to the file the notebook writes its
            sticky tokens to.

    Returns:
        list[str]: The ``raw_vocab`` values of the matching rows, sorted by
        ``main_metric`` in descending order. Empty if no row matches.
    """
    # Tokens are arbitrary vocabulary strings, so pandas' default NA parsing is
    # switched off: otherwise a token literally spelled "null", "nan" or "NA"
    # would be read as float NaN instead of text. Only an empty main_metric
    # cell is still treated as missing, which keeps that column numeric.
    sticky_tokens_of_all_models_df = pd.read_csv(
        csv_path,
        dtype={"raw_vocab": str},
        keep_default_na=False,
        na_values={"main_metric": [""]},
    )
    # Select the rows of the requested model.
    sticky_token_column = sticky_tokens_of_all_models_df[sticky_tokens_of_all_models_df['model'] == model_name]
    # Sort by main_metric from high to low.
    sticky_token_column = sticky_token_column.sort_values(by='main_metric', ascending=False)
    # Return the raw tokens in that order.
    sticky_token_list = sticky_token_column['raw_vocab'].to_list()
    return sticky_token_list

In [40]:
# Example: sticky tokens of one model. The name is written literally because
# `model_names` is only defined in a later cell, so indexing it here raises
# NameError on a fresh top-to-bottom run. "all-MiniLM-L6-v2" is the first entry
# of that list.
get_sticky_token_list("all-MiniLM-L6-v2")

['（',
 'textbook',
 'h₂o',
 'satisfy',
 'trajectory',
 'julio',
 'functioning',
 '[CLS]',
 '₂',
 'gambia',
 'defendant',
 '？',
 '{',
 'functioned',
 'imaginative',
 'cultivated',
 'う',
 'intelligent',
 'oskar',
 'whereupon',
 'intended']

In [52]:
# Row order of the final table, grouped by model family.
model_names = [
    "all-MiniLM-L6-v2",
    "all-mpnet-base-v2",
    "sup-simcse-bert-base-uncased",
    "sup-simcse-bert-large-uncased",
    "sup-simcse-roberta-base",
    "sup-simcse-roberta-large",
    "sentence-t5-base",
    "sentence-t5-large",
    "sentence-t5-xl",
    "sentence-t5-xxl",
    "gtr-t5-base",
    "gtr-t5-large",
    "gtr-t5-xl",
    "gtr-t5-xxl",
    "instructor-base",
    "instructor-large",
    "instructor-xl",
    "e5-small",
    "e5-base",
    "e5-large",
    "e5-mistral-7b-instruct",
    "bge-small-en-v1.5",
    "bge-base-en-v1.5",
    "bge-large-en-v1.5",
    "UAE-Large-V1",
    "nomic-embed-text-v1",
    "nomic-embed-text-v1.5",
    "gte-small",
    "gte-base",
    "gte-large",
    "gte-base-en-v1.5",
    "gte-large-en-v1.5",
    "gte-Qwen2-1.5B-instruct",
    "gte-Qwen2-7B-instruct",
    "GritLM-7B",
    "SFR-Embedding-2_R",
    "SFR-Embedding-Mistral",
]

# Reorder the record table to follow model_names; model_name is used as the
# index for the lookup and then restored as a regular column.
records_by_model = experiement_record_df.set_index('model_name')
experiement_record_df = records_by_model.loc[model_names].reset_index()

# Columns that feed the summary table.
table_source = experiement_record_df[
    [
        'model_name',
        'num_parameters',
        'vocab_size',
        'ok_tokens_num',
        'candidates_for_verification_num',
        'sticky token count',
    ]
]

# Parameter count as a rounded number of millions with an 'M' suffix.
params_in_millions = (table_source['num_parameters'] / 1e6).round().astype(int).astype(str) + 'M'
# Sticky-token count as an integer.
sticky_counts = table_source['sticky token count'].astype(int)
# Comma-separated sticky tokens of each model.
example_tokens = table_source['model_name'].apply(lambda name: ', '.join(get_sticky_token_list(name)))

my_df = table_source.assign(
    num_parameters=params_in_millions,
    sticky_token_count=sticky_counts,
    example=example_tokens,
).drop(columns=['sticky token count'])  # the integer column above replaces the original one
my_df.to_latex(buf="../results/final_sticky_token_table.tex", index=False)
my_df

Unnamed: 0,model_name,num_parameters,vocab_size,ok_tokens_num,candidates_for_verification_num,sticky_token_count,example
0,all-MiniLM-L6-v2,23M,30522,23699,474,21,"（, textbook, h₂o, satisfy, trajectory, julio, ..."
1,all-mpnet-base-v2,109M,30527,23700,474,24,"00, adversary, intended, ambiguous, cooked, た,..."
2,sup-simcse-bert-base-uncased,109M,30522,23699,474,22,"203, ?, [SEP], ロ, game, 640, り, victories, cal..."
3,sup-simcse-bert-large-uncased,335M,30522,23699,474,11,"', ;, contestants, accidental, ɔ, continents, ..."
4,sup-simcse-roberta-base,125M,50265,49894,998,27,"ĠThere, There, Ġthere, </s>, there, ĠTHERE, ĠE..."
5,sup-simcse-roberta-large,355M,50265,49894,998,25,"Discussion, ĠâĢĭ, ĠSubjects, Topic, Ġ?, .-, Ġs..."
6,sentence-t5-base,110M,32100,32097,642,21,"</s>, lucrarea, ▁grains, ▁photographed, ▁sport..."
7,sentence-t5-large,336M,32100,32097,642,30,"</s>, ▁»., <extra_id_27>, ▁Comment, ▁Ribbon, c..."
8,sentence-t5-xl,1242M,32100,32097,642,34,"</s>, <extra_id_0>, <extra_id_27>, ▁velvet, ▁c..."
9,sentence-t5-xxl,4866M,32100,32097,642,22,"</s>, ▁consacré, <extra_id_27>, ▁hashtag, ▁hel..."


In [53]:
# Write the summary table to a CSV in the current working directory (index not written).
my_df.to_csv("sticky_tokens_cross_model.csv", index=False)

In [51]:
from collections import Counter
# Count how many rows of the sticky-token CSV (all models together) carry each
# raw token. Default NA parsing is switched off and raw_vocab is read as text,
# so tokens literally spelled "null", "nan" or "NA" are counted as strings
# instead of being turned into NaN.
all_verified_tokens = pd.read_csv(
    "../results/final_all_models_sticky_tokens.csv",
    dtype={"raw_vocab": str},
    keep_default_na=False,
)["raw_vocab"].to_list()
token_counts = Counter(all_verified_tokens)
# (token, count) pairs ordered from most to least frequent.
sorted_token_counts = token_counts.most_common()
sorted_token_counts

[('</s>', 12),
 ('[SEP]', 9),
 ('<extra_id_27>', 8),
 ('[CLS]', 7),
 ('<extra_id_19>', 5),
 ('<extra_id_0>', 4),
 ('[MASK]', 4),
 ('lucrarea', 3),
 ('<extra_id_18>', 3),
 ('▁».', 3),
 ('intended', 3),
 ('[PAD]', 3),
 ('<extra_id_12>', 3),
 ('<extra_id_9>', 3),
 ('？', 2),
 ('having', 2),
 ('deposited', 2),
 ('▁hashtag', 2),
 ('occurring', 2),
 ('behaved', 2),
 ('⺩', 2),
 ('勝', 2),
 ('扌', 2),
 ('▁somebody', 2),
 ('▁Someone', 2),
 ('<extra_id_26>', 2),
 ('momentarily', 2),
 ('<extra_id_25>', 2),
 ('<extra_id_13>', 2),
 ('▁indeed', 2),
 ('▁consacré', 2),
 ('ɔ', 2),
 ('▁которы', 2),
 ('ющи', 2),
 ('▁voegen', 2),
 ('▁най', 2),
 ('▁отри', 2),
 ('▁tip', 1),
 ('▁br', 1),
 ('▁Gas', 1),
 ('▁blanc', 1),
 ('▁organism', 1),
 ('▁brake', 1),
 ('▁joke', 1),
 ('▁Gate', 1),
 ('durant', 1),
 ('▁Tablet', 1),
 ('▁grains', 1),
 ('▁sportive', 1),
 ('▁Portable', 1),
 ('▁Patio', 1),
 ('▁pastel', 1),
 ('▁meme', 1),
 ('▁photographed', 1),
 ('▁Hose', 1),
 ('▁cum', 1),
 ('▁chance', 1),
 ('▁sharing', 1),
 ('▁Comment