# 对分词的探索

In [18]:
from transformers import AutoModel,AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('BAAI/bge-large-zh-v1.5')
model = AutoModel.from_pretrained('BAAI/bge-large-zh-v1.5')

In [19]:
import jieba
import re
sentence = "近日，有媒体报道称，中国科学家在南极发现了一种新型细菌，这种细菌能够在零下20度的极端环境下生存。"
# 分词，然后去掉标点
tokens = jieba.lcut(sentence)
tokens1 = tokens
tokens = [re.sub(r'[^\w\s]', '', token) for token in tokens if re.sub(r'[^\w\s]', '', token)]
tokens

['近日',
 '有',
 '媒体报道',
 '称',
 '中国',
 '科学家',
 '在',
 '南极',
 '发现',
 '了',
 '一种',
 '新型',
 '细菌',
 '这种',
 '细菌',
 '能够',
 '在',
 '零下',
 '20',
 '度',
 '的',
 '极端',
 '环境',
 '下',
 '生存']

###  分词去标点结果

In [20]:
import torch
encoded_input  = tokenizer(tokens,padding=True, return_tensors="pt")
with torch.no_grad():
    output = model(**encoded_input)[0][:,0]
output

tensor([[-0.9940, -0.0095, -0.0290,  ..., -1.1730,  0.8795, -0.1893],
        [ 0.5184, -0.4749,  0.0987,  ..., -0.4425,  0.0887, -0.0732],
        [-0.3612, -0.1135, -0.4985,  ..., -0.7243,  0.3804, -0.2077],
        ...,
        [-0.3579,  0.6526,  0.0548,  ..., -0.1603,  0.5569, -1.0110],
        [-0.0661,  0.1508, -0.0335,  ..., -1.0574, -0.0148, -0.8802],
        [-0.6681, -0.3047, -0.1448,  ...,  0.3437,  0.3852,  0.0301]])

#### 试试评估模式省略的影响

In [21]:
model.eval()
with torch.no_grad():
    output1 = model(**encoded_input)[0][:,0]
output1

tensor([[-0.9940, -0.0095, -0.0290,  ..., -1.1730,  0.8795, -0.1893],
        [ 0.5184, -0.4749,  0.0987,  ..., -0.4425,  0.0887, -0.0732],
        [-0.3612, -0.1135, -0.4985,  ..., -0.7243,  0.3804, -0.2077],
        ...,
        [-0.3579,  0.6526,  0.0548,  ..., -0.1603,  0.5569, -1.0110],
        [-0.0661,  0.1508, -0.0335,  ..., -1.0574, -0.0148, -0.8802],
        [-0.6681, -0.3047, -0.1448,  ...,  0.3437,  0.3852,  0.0301]])

In [22]:
output.all()==output1.all(),output1.shape

(tensor(True), torch.Size([25, 1024]))

<caption>说明没影响</caption>

### 将分词去标点的结果作平均，从[p,embed_size]到[1,embed_size].
注: 这里是对于一个完整caption的处理。
p表示分词个数，对第0维度取平均
但是如果传入的是列表 分析：batch_size个数据，目的得到[batch_size,1024]的张量。实际上每一个[1,1024]张量都是通过[pi,1024]平均得到的。而且很有可能pi≠pj,即不能扩充一维用于存放分词数量（作为维数）从而不能指定dim=(分词所在维数)以平均化。
综上，此种情况个人认为的是一个caption一个caption处理。

In [23]:
# output取平均值,后面均归一化也就是output[num]_1这个变量
output1 = output1.mean(dim=0).unsqueeze(0)
output1_1 = torch.nn.functional.normalize(output1, p=2, dim=1)

output1_1,output1_1.shape

(tensor([[-0.0103,  0.0002, -0.0074,  ..., -0.0407,  0.0163, -0.0162]]),
 torch.Size([1, 1024]))

不做任何处理的一句话，通过模型后再归一化

In [24]:
encoded_input_raw =tokenizer(sentence,padding=True, return_tensors="pt")
with torch.no_grad():
    output2 = model(**encoded_input_raw)[0][:,0]
output2_1 = torch.nn.functional.normalize(output2, p=2, dim=1)
output2_1

tensor([[-0.0364, -0.0192,  0.0009,  ...,  0.0050,  0.0238,  0.0379]])

### 分词后不去标点，通过模型后再归一化

In [35]:
encoded_input1  = tokenizer(tokens1,padding=True, return_tensors="pt")
with torch.no_grad():
    output3 = model(**encoded_input1)[0][:,0]
output3 = output3.mean(dim=0).unsqueeze(0)
output3_1 = torch.nn.functional.normalize(output3, p=2, dim=1)
output3_1,output3_1.shape

(tensor([[-0.0087,  0.0044, -0.0003,  ..., -0.0461,  0.0128, -0.0220]]),
 torch.Size([1, 1024]))

In [38]:
# 检测是否归一化
import numpy as np
result3 = np.linalg.norm(output3_1,axis=1)
result3

array([1.], dtype=float32)

### 引入一个完全无关的负样本(自认为，人工归类的)


In [56]:
# 这两个负样本是相似的
input_F = "在美国亚利桑那州，疯狂戴夫发明了一种金刚不坏的土豆，据说能够抵挡僵尸的攻击，向日葵开心极了"
input_F1 = "在美国纽约，僵王博士发明了一种金刚不坏的土豆僵尸，据说能够抵挡植物的攻击，它开心极了"

encoded_input_F  = tokenizer(input_F,padding=True, return_tensors="pt")
encoded_input_F1  = tokenizer(input_F1,padding=True, return_tensors="pt")
with torch.no_grad():
    output_F = model(**encoded_input_F)[0][:,0]
    output_F1 = model(**encoded_input_F1)[0][:,0]
output_F_1 = torch.nn.functional.normalize(output_F, p=2, dim=1)
output_F1_1 = torch.nn.functional.normalize(output_F1, p=2, dim=1)
output_F_1,output_F1_1

(tensor([[-0.0158, -0.0100, -0.0096,  ..., -0.0030,  0.0028,  0.0485]]),
 tensor([[ 0.0018, -0.0377, -0.0137,  ...,  0.0244, -0.0115,  0.0668]]))

### 计算3+1个向量的距离（余弦相似度以及欧氏距离）

In [63]:
import torch
import pandas as pd
import numpy as np

def calculate_similarity_and_distance(tensors:list, names:list):
    num_tensors = len(tensors)
    # 初始化余弦相似度和欧几里得距离矩阵
    cosine_similarity_matrix = np.zeros((num_tensors, num_tensors))
    euclidean_distance_matrix = np.zeros((num_tensors, num_tensors))

    # 计算每对张量之间的余弦相似度和欧几里得距离
    for i in range(num_tensors):
        for j in range(num_tensors):
            if i != j:  # 排除自身与自身的比较
                cosine_similarity_matrix[i, j] = torch.nn.functional.cosine_similarity(tensors[i], tensors[j])
                euclidean_distance_matrix[i, j] = torch.norm(tensors[i] - tensors[j])
            else:  # 自身与自身的相似度设置为1，距离设置为0
                cosine_similarity_matrix[i, j] = 1.0
                euclidean_distance_matrix[i, j] = 0.0

    # 创建余弦相似度表格
    cosine_similarity_df = pd.DataFrame(cosine_similarity_matrix, index=names, columns=names)
    # 创建欧几里得距离表格
    euclidean_distance_df = pd.DataFrame(euclidean_distance_matrix, index=names, columns=names)

    return cosine_similarity_df, euclidean_distance_df

def find_nearest_tensors(cosine_similarity_df, euclidean_distance_df):
    # 设置对角线（自己和自己）为nan
    cosine_similarity_df.values[np.diag_indices_from(cosine_similarity_df)] = np.nan
    euclidean_distance_df.values[np.diag_indices_from(euclidean_distance_df)] = np.nan
    # 余弦相似度：找到每个张量最相似的张量（自身除外）
    cosine_nearest = cosine_similarity_df.idxmax().rename("Cosine Nearest Tensor")
    # 欧几里得距离：找到每个张量最近的张量（自身除外）
    euclidean_nearest = euclidean_distance_df.idxmin().rename("Euclidean Nearest Tensor")

    # 创建一个新的 DataFrame 来保存结果
    nearest_tensors_df = pd.DataFrame({
        'Cosine Nearest Tensor': cosine_nearest,
        'Euclidean Nearest Tensor': euclidean_nearest
    })
    
    # 计算排序，基于余弦相似度和欧几里得距离
    cosine_similarity_ranking = cosine_similarity_df.rank(1, ascending=False)  # 余弦相似度越高，排名越前
    euclidean_distance_ranking = euclidean_distance_df.rank(1)  # 欧几里得距离越小，排名越前

    return nearest_tensors_df, cosine_similarity_ranking, euclidean_distance_ranking


tensors = [output2_1,output1_1,output3_1,output_F_1,output_F1_1]
names = ["不做处理","分词去标点","分词不去标点","负样本","负样本的相似样本"]
c_df,e_df = calculate_similarity_and_distance(tensors,names)
near_df,cs_df,es_df = find_nearest_tensors(c_df,e_df)

In [65]:
# 余弦相似度比较 越近越靠近1
c_df

Unnamed: 0,不做处理,分词去标点,分词不去标点,负样本,负样本的相似样本
不做处理,,0.469554,0.433262,0.360374,0.368029
分词去标点,0.469554,,0.990061,0.393847,0.401334
分词不去标点,0.433262,0.990061,,0.380646,0.388923
负样本,0.360374,0.393847,0.380646,,0.787223
负样本的相似样本,0.368029,0.401334,0.388923,0.787223,


In [66]:
# 欧氏距离比较，越近越靠近0
e_df

Unnamed: 0,不做处理,分词去标点,分词不去标点,负样本,负样本的相似样本
不做处理,,1.029996,1.064648,1.13104,1.124252
分词去标点,1.029996,,0.140988,1.101048,1.094226
分词不去标点,1.064648,0.140988,,1.112972,1.105511
负样本,1.13104,1.101048,1.112972,,0.652345
负样本的相似样本,1.124252,1.094226,1.105511,0.652345,


In [67]:
# 直观排序表示（最近的向量）
near_df

Unnamed: 0,Cosine Nearest Tensor,Euclidean Nearest Tensor
不做处理,分词去标点,分词去标点
分词去标点,分词不去标点,分词不去标点
分词不去标点,分词去标点,分词去标点
负样本,负样本的相似样本,负样本的相似样本
负样本的相似样本,负样本,负样本


In [68]:
# 余弦相似度比较排序
cs_df

Unnamed: 0,不做处理,分词去标点,分词不去标点,负样本,负样本的相似样本
不做处理,,1.0,2.0,4.0,3.0
分词去标点,2.0,,1.0,4.0,3.0
分词不去标点,2.0,1.0,,4.0,3.0
负样本,4.0,2.0,3.0,,1.0
负样本的相似样本,4.0,2.0,3.0,1.0,


In [69]:
# 欧氏距离比较排序
es_df

Unnamed: 0,不做处理,分词去标点,分词不去标点,负样本,负样本的相似样本
不做处理,,1.0,2.0,4.0,3.0
分词去标点,2.0,,1.0,4.0,3.0
分词不去标点,2.0,1.0,,4.0,3.0
负样本,4.0,2.0,3.0,,1.0
负样本的相似样本,4.0,2.0,3.0,1.0,
