In [None]:
# Requirements
!pip install transformers
!pip install openai
!pip install openTSNE
!pip install datasets

In [None]:
!git clone https://github.com/LC1332/Luotuo-Text-Embedding.git

In [3]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from argparse import Namespace
# Import our models. The package will take care of downloading the models automatically
tokenizer = AutoTokenizer.from_pretrained("silk-road/luotuo-bert")
model_args = Namespace(do_mlm=None, pooler_type="cls", temp=0.05, mlp_only_train=False, init_embeddings_model=None)
model = AutoModel.from_pretrained("silk-road/luotuo-bert", trust_remote_code=True, model_args=model_args)

Downloading (…)okenizer_config.json:   0%|          | 0.00/539 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/966 [00:00<?, ?B/s]

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)solve/main/models.py:   0%|          | 0.00/21.1k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/414M [00:00<?, ?B/s]

In [None]:
%cd Luotuo-Text-Embedding

In [4]:
import csv
import numpy as np
def get_evalCSV():
    text_left = []
    text_right = []
    with open("./data/sentspair.csv", "r") as csv_file:
        csv_reader = csv.reader(csv_file)
        for row in csv_reader:
            text_left.append(row[0])
            text_right.append(row[1])
    return text_left, text_right

text_left, text_right = get_evalCSV()
inputs = tokenizer(text_left, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    embeddings_left = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
inputs = tokenizer(text_right, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    embeddings_right = model(**inputs, output_hidden_states=True, return_dict=True, sent_emb=True).pooler_output
    
cos_sim_matrix = torch.matmul(embeddings_left, embeddings_right.t())
cos_sim_matrix /= torch.matmul(torch.norm(embeddings_left, dim=1, keepdim=True), torch.norm(embeddings_right, dim=1, keepdim=True).t())
tensor_cpu = cos_sim_matrix.cpu()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [5]:
import sys

from lib.tsne import TSNE_Plot

merged_list = text_left + text_right
merged_embed = torch.cat((embeddings_left, embeddings_right), dim=0)

# if the data have no labels, you can use the following code to cluster the data
tsne_plot = TSNE_Plot(merged_list, merged_embed, n_clusters = 4)
tsne_plot.tsne_plot(n_sentence=40)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.4-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m148.1 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 kB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting yarl<2.0,>=1.0
  Downloading yarl-1.8.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (264 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m264.6/264.6 kB[0m [31m29.2 MB/s[0m 

In [None]:
import pandas as pd
from lib.heatmap import Heatmap
positions = [(i, i) for i in range(0, 20, 2)] + [(1, 5), (2, 3), (15, 9), (5, 13), (17, 7)]
df = pd.DataFrame({ "first":text_left, 
                    "second":text_right, 
                    "first_embed":[np.array(embeddings_left[i]) for i in range(len(embeddings_left))], 
                    "second_embed":[np.array(embeddings_right[i]) for i in range(len(embeddings_right))]})
heatmap = Heatmap(df, positions)
heatmap.create_heatmap(font_path='./lib/arial.ttf')

# TODO:
* 模糊问题搜索
* 文本聚类
* 少样本分类学习