In [6]:
from sentence_transformers import SentenceTransformer, InputExample
from torch.utils.data import DataLoader


# Load the base model.
# NOTE: this checkpoint is not a packaged sentence-transformers model; the
# warning printed by this cell reports that the library creates a new model
# with MEAN pooling on top of the transformer.
model = SentenceTransformer('nghuyong/ernie-3.0-nano-zh')

No sentence-transformers model found with name C:\Users\Administrator/.cache\torch\sentence_transformers\nghuyong_ernie-3.0-nano-zh. Creating a new one with MEAN pooling.
Some weights of ErnieModel were not initialized from the model checkpoint at C:\Users\Administrator/.cache\torch\sentence_transformers\nghuyong_ernie-3.0-nano-zh and are newly initialized: ['ernie.pooler.dense.weight', 'ernie.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Load the data and build the training set.

# Read the category file. Each non-empty line is expected to look like
#   <category>：<item>、<item>、...
# The context manager guarantees the file handle is closed.
with open('./cls.txt', 'r', encoding='utf-8') as cls_file:
    raw_lines = [line.strip() for line in cls_file]

# Convert the lines into a {category: [items]} dictionary.
data = {}
for line_no, line in enumerate(raw_lines, start=1):
    if not line:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no data.
        continue
    # Split on the first full-width colon only, so a colon inside an item name
    # cannot break the category/items unpacking.
    category, sep, item_text = line.partition('：')
    if not sep:
        raise ValueError(
            "line {} of ./cls.txt has no '：' separator: {!r}".format(line_no, line)
        )
    items = item_text.split('、')
    data[category] = items
    print(len(items))

# Build the training pairs: a category paired with one of its own items gets
# similarity 1.0, a category paired with an item of any other category gets 0.0.
train_data = []
for category, own_items in data.items():
    train_data.extend(
        InputExample(texts=[category, item], label=1.0) for item in own_items
    )
    for other_category, other_items in data.items():
        if other_category == category:
            continue
        train_data.extend(
            InputExample(texts=[category, item], label=0.0) for item in other_items
        )
print(len(train_data))
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

35
39
35
32
564


In [8]:
from sentence_transformers import losses

# Loss function: cosine similarity, the same measure that is used at query time.
train_loss = losses.CosineSimilarityLoss(model)

# Train the model on the single (dataloader, loss) objective, then save it to disk.
train_objectives = [(train_dataloader, train_loss)]
model.fit(
    train_objectives=train_objectives,
    epochs=50,
    show_progress_bar=False,
)
model.save('./model/')

In [9]:
from sentence_transformers import util
import torch


# The four category labels act as the search corpus; embed them once up front.
corpus = ['干垃圾', '可回收垃圾', '厨余垃圾', '有害垃圾']
corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['电池', '外卖盒', '骨头', '化妆品', '纸箱']


# Rank at most 5 corpus entries per query; the corpus may hold fewer than that,
# so the effective count is top_k and the printed heading reports top_k too.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Cosine similarity of the query against every corpus entry, then the
    # top_k highest scores together with their corpus positions.
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    print("\nTop {} most similar sentences in corpus:".format(top_k))

    # Convert to plain Python floats/ints so indexing and formatting do not
    # depend on implicit tensor-to-scalar conversion.
    scores = top_results.values.tolist()
    indices = top_results.indices.tolist()
    for score, idx in zip(scores, indices):
        print(corpus[idx], "(Score: {:.4f})".format(score))





Query: 电池

Top 5 most similar sentences in corpus:
有害垃圾 (Score: 0.7171)
干垃圾 (Score: 0.4939)
可回收垃圾 (Score: -0.0149)
厨余垃圾 (Score: -0.2701)




Query: 外卖盒

Top 5 most similar sentences in corpus:
可回收垃圾 (Score: 0.7201)
干垃圾 (Score: 0.1896)
有害垃圾 (Score: 0.1494)
厨余垃圾 (Score: -0.1899)




Query: 骨头

Top 5 most similar sentences in corpus:
厨余垃圾 (Score: 0.7113)
干垃圾 (Score: 0.0334)
可回收垃圾 (Score: -0.0730)
有害垃圾 (Score: -0.1449)




Query: 化妆品

Top 5 most similar sentences in corpus:
有害垃圾 (Score: 0.6373)
干垃圾 (Score: 0.5172)
厨余垃圾 (Score: -0.0775)
可回收垃圾 (Score: -0.2346)




Query: 纸箱

Top 5 most similar sentences in corpus:
可回收垃圾 (Score: 0.9293)
干垃圾 (Score: -0.0017)
厨余垃圾 (Score: -0.0717)
有害垃圾 (Score: -0.1058)


In [21]:
import numpy as np
def softmax(x):
    """Return the softmax of ``x`` along axis 0 as a NumPy array.

    Args:
        x: An iterable of scores. Each element may be a tensor-like object
            (anything exposing ``detach().cpu().numpy()``), a plain Python
            number, or a NumPy scalar/array.

    Returns:
        A NumPy array with the same length as ``x`` whose entries are
        non-negative and sum to 1 along axis 0. An empty input yields an
        empty array.
    """
    # Move tensor-like entries to host memory; leave plain numbers untouched.
    values = np.asarray(
        [v.detach().cpu().numpy() if hasattr(v, "detach") else v for v in x]
    )
    if values.size == 0:
        return values.astype(float)
    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing on large scores.
    exps = np.exp(values - np.max(values, axis=0))
    return exps / np.sum(exps, axis=0)
def cls(query, top_k=None):
    """Score ``query`` against every label in ``corpus``.

    The query is embedded with ``model``, compared to ``corpus_embeddings``
    by cosine similarity, and the best-scoring labels are normalised with
    ``softmax``.

    Args:
        query: The text to classify.
        top_k: How many of the best-scoring labels to keep. ``None`` (the
            default) keeps every label in ``corpus``; larger values are
            capped at ``len(corpus)``.

    Returns:
        A dict mapping each kept label to its softmax-normalised score,
        inserted from highest to lowest score. The softmax is taken over the
        kept labels only.
    """
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]

    # Derive k from the corpus instead of hard-coding the label count, so
    # torch.topk cannot be asked for more entries than exist.
    k = len(corpus) if top_k is None else min(top_k, len(corpus))
    top_results = torch.topk(cos_scores, k=k)

    # Iterating the 1-D score tensor yields 0-dim tensors, which is the
    # element type softmax is called with here.
    result_score = softmax(list(top_results.values))
    result_label = [corpus[i] for i in top_results.indices.tolist()]
    return dict(zip(result_label, result_score))

In [38]:
# Example: classify a single item name; the returned label -> score dict is
# displayed as the cell output.
cls('大树')

{'厨余垃圾': 0.44761857, '干垃圾': 0.1962006, '可回收垃圾': 0.17894478, '有害垃圾': 0.17723598}