In [None]:
from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import torch.nn as nn

# --- Load the raw corpus: one sentence per line. ---
input_file = 'data/wiki1m_for_simcse.txt'
with open(input_file, 'r', encoding='utf-8') as f:
    sent_list = f.read().splitlines()

# --- Model / tokenizer setup (GPU when available). ---
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = 'princeton-nlp/unsup-simcse-bert-base-uncased'
# model_name = 'bert-base-uncased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# --- Build one forward batch made of four groups of `bs` sentences. ---
bs = 64

list1 = sent_list[:bs]                # the first `bs` sentences of the file
list2 = random.sample(sent_list, bs)  # `bs` sentences drawn at random

# Group layout: list1 | list1 | list2 | list2.
# Note that this rebinds `sent_list` from the full corpus to the 4*bs batch.
sent_list = list1 + list1 + list2 + list2

print(len(sent_list))

# --- Encode everything in a single pass without tracking gradients. ---
inputs = tokenizer(sent_list, padding=True, truncation=True, return_tensors='pt')
inputs = inputs.to(device)

with torch.no_grad():
    outputs = model(**inputs)

last_hidden_states = outputs.last_hidden_state  # (bs*4, seq_len, hidden_size)

# Split the pooled vectors back into the four groups laid out above.
pooler_output = outputs.pooler_output            # (bs*4, hidden_size)
pooler_output = pooler_output.view(4, bs, -1)    # (4, bs, hidden_size)

z1, z2, z3, z4 = pooler_output.unbind(dim=0)

class Similarity(nn.Module):
    """Cosine similarity over the last dimension, divided by a temperature."""

    def __init__(self, temp):
        """Build the module.

        Args:
            temp: Divisor applied to every cosine similarity value.
        """
        super().__init__()
        self.cos = nn.CosineSimilarity(dim=-1)
        self.temp = temp

    def forward(self, x, y):
        """Return ``cos(x, y) / temp``.

        Args:
            x: First tensor.
            y: Second tensor; it is compared with ``x`` along the last
                dimension.

        Returns:
            Tensor of temperature-scaled cosine similarities.
        """
        cosine = self.cos(x, y)
        return cosine / self.temp

# sim = Similarity(1)

# cos_sim_12 = sim(z1.unsqueeze(1), z2.unsqueeze(0))

# 打印相似度
# print(cos_sim_12)

# cos_sim_34 = sim(z3.unsqueeze(1), z4.unsqueeze(0))

# loss_fct = nn.CrossEntropyLoss()

# labels = torch.arange(cos_sim_12.size(0)).long().to(device)

# loss1 = loss_fct(cos_sim_12, labels)
# loss2 = loss_fct(cos_sim_34, labels)

# print(loss1, loss2)


from sklearn.manifold import TSNE
import numpy as np
import matplotlib.pyplot as plt

# Visualize the embeddings held in z3 (the first copy of the randomly
# sampled group).
# NOTE(review): the original comment said "Z1", but the code plots z3 —
# confirm which group is intended.
# Reduce the vectors to two dimensions with t-SNE (fixed random_state).
z3_np = z3.cpu().numpy()
tsne = TSNE(n_components=2, random_state=42)
reduced_embeddings = tsne.fit_transform(z3_np)

# Scatter plot of the 2-D projection.
fig, ax = plt.subplots()
ax.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], s=1, alpha=0.5)
ax.set_title('Uniformity Visualization (t-SNE)')
ax.set_xlabel('Dimension 1')
ax.set_ylabel('Dimension 2')
plt.show()



In [None]:
import numpy as np
import torch
from sklearn.cluster import DBSCAN

from tqdm import tqdm
import random
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
import torch.nn as nn
import numpy as np
import torch
from sklearn.cluster import MiniBatchKMeans


model_name = 'princeton-nlp/unsup-simcse-bert-base-uncased'
# model_name = 'bert-base-uncased'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the corpus: one sentence per line.
input_file = 'data/wiki1m_for_simcse.txt'
with open(input_file, 'r', encoding='utf-8') as f:
    sent_list = f.read().splitlines()

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Uncomment to iterate quickly on a small random subset.
# sent_list = random.sample(sent_list, 1000)

# Encode the corpus chunk by chunk and keep the [CLS] vector (position 0 of
# the last hidden state) of every sentence. Each chunk is moved to the CPU as
# a NumPy array right away so that only one chunk lives on the device.
bs = 1024
embedding_chunks = []
with torch.no_grad():
    for i in tqdm(range(0, len(sent_list), bs)):
        # `batch`, not `list`: binding the builtin's name at module level
        # would break every later `list(...)` call in this kernel.
        batch = sent_list[i:i+bs]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
        outputs = model(**inputs.to(device))
        embedding_chunks.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

# The loop yields a Python list of per-chunk arrays; a list has neither
# `.shape` nor `.cpu()`, so stack the chunks into one (num_sentences, hidden)
# array before using it.
sentence_embeddings = np.concatenate(embedding_chunks, axis=0)

# Target number of clusters: one cluster per `batch_size` sentences.
batch_size = 64
num_samples = sentence_embeddings.shape[0]
num_clusters = max(1, num_samples // batch_size)  # at least one cluster for tiny corpora

# Already a NumPy array; the name is kept for any code that refers to it.
sentence_embeddings_np = sentence_embeddings

# Cluster with MiniBatchKMeans.
# NOTE(review): `batch_size` (64) is also passed as the k-means mini-batch
# size, which is far smaller than the number of clusters on the full corpus —
# confirm this is intended.
minibatch_kmeans = MiniBatchKMeans(n_clusters=num_clusters, random_state=42, batch_size=batch_size, n_init=10)
minibatch_kmeans.fit(sentence_embeddings_np)

# Cluster label assigned to every sentence.
labels = minibatch_kmeans.labels_

# Show the labels of the first 100 sentences.
print("聚类标签:", labels[:100])
# Show how many sentences fell into each cluster.
unique, counts = np.unique(labels, return_counts=True)
print("每个簇的大小:", dict(zip(unique, counts)))

# 获取每个簇的中心（每个簇的代表句子）
# centroids = minibatch_kmeans.cluster_centers_

# 查看每个簇的中心
# print("每个簇的中心（代表句子）:", centroids[:5])  # 只查看前 5 个簇中心

In [6]:
from knowledge.retrieval import retrieval_knowledge_title
import random
from tqdm import tqdm

# Load the corpus: one sentence per line.
input_file = 'data/wiki1m_for_simcse.txt'
with open(input_file, 'r', encoding='utf-8') as f:
    sent_list = f.read().splitlines()

# Uncomment one of these to iterate quickly on a subset.
# sent_list = sent_list[:1000]
# sent_list = random.sample(sent_list, 1000)
bs = 1000  # number of sentences sent to the retrieval helper per call

# Maps a comma-joined title key to the sentences that produced that key.
sent_dict = {}

for i in tqdm(range(0, len(sent_list), bs)):
    # `batch`, not `list`: binding the builtin's name at module level would
    # break every later `list(...)` call in this kernel.
    batch = sent_list[i:i+bs]
    # Presumably returns one iterable of title strings per input sentence,
    # in input order — verify against knowledge.retrieval.
    result = retrieval_knowledge_title(batch)

    for sent, title in zip(batch, result):
        # An empty `title` joins to "", so all such sentences share one key.
        key = ",".join(title)
        sent_dict.setdefault(key, []).append(sent)

# Group-size statistics across all title keys.
length_list = [len(sents) for sents in sent_dict.values()]
print('max:', max(length_list))
print('min:', min(length_list))
print('mean:', sum(length_list) / len(length_list))

100%|██████████| 1001/1001 [00:09<00:00, 109.39it/s]

max: 278998
min: 1
mean: 5.754275422363394





In [8]:
# 生成新的训练数据

from knowledge.retrieval import retrieval_knowledge_title
import random
from tqdm import tqdm

# Load the corpus: one sentence per line.
input_file = 'data/wiki1m_for_simcse.txt'
with open(input_file, 'r', encoding='utf-8') as f:
    sent_list = f.read().splitlines()

# Defined here so the cell does not depend on whichever earlier cell last
# assigned `bs` (and does not raise NameError on a fresh kernel).
bs = 1000  # number of sentences sent to the retrieval helper per call

# Sentences for which the retrieval helper returned a non-empty result.
new_sent_list = []

for i in tqdm(range(0, len(sent_list), bs)):
    # `batch`, not `list`: binding the builtin's name at module level would
    # break every later `list(...)` call in this kernel.
    batch = sent_list[i:i+bs]
    # Presumably returns one entry per input sentence, in input order —
    # verify against knowledge.retrieval.
    result = retrieval_knowledge_title(batch)
    # Keep only the sentences whose retrieval result is truthy.
    new_sent_list.extend(sent for sent, title in zip(batch, result) if title)

print(len(new_sent_list))

# Write the filtered corpus, one sentence per line.
output_file = 'data/wiki1m_for_simcse_remove_empty_title.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(new_sent_list))

100%|██████████| 1001/1001 [00:08<00:00, 115.03it/s]


721003
