In [1]:
# Requirements

# !pip install transformers
# !pip install torch
# !pip install scipy

In [2]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer
from argparse import Namespace
# Load the tokenizer and the sentence-embedding model from the Hugging Face Hub.
# Both are downloaded automatically the first time they are requested.
CHECKPOINT = "liusiyi641/bert_distilled_openai_mse_cse_cnews"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Options handed to `from_pretrained` through its `model_args` keyword.
model_args = Namespace(
    do_mlm=None,
    pooler_type="cls",
    temp=0.05,
    mlp_only_train=False,
    init_embeddings_model=None,
)

# NOTE(review): trust_remote_code=True runs Python code shipped with the
# checkpoint; consider pinning a `revision` once a vetted commit is known.
model = AutoModel.from_pretrained(
    CHECKPOINT,
    trust_remote_code=True,
    model_args=model_args,
)


Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [3]:
import csv
import numpy as np
def get_evalCSV(path="../data/sentspair.csv", encoding="utf-8"):
    """Read sentence pairs from a two-column CSV file.

    Args:
        path: Location of the CSV file. Defaults to the path that used to be
            hard-coded, so calling the function without arguments still works.
        encoding: Text encoding used to decode the file. Pinned to UTF-8 by
            default so the result does not depend on the platform locale.

    Returns:
        A ``(text_left, text_right)`` tuple of two equally long lists holding
        the first and the second column of every non-blank row, in file order.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    text_left = []
    text_right = []
    # The csv module asks for newline="" so that line breaks embedded in
    # quoted fields are handed to the parser untouched.
    with open(path, "r", encoding=encoding, newline="") as csv_file:
        for row in csv.reader(csv_file):
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing empty
                # line); indexing those would raise IndexError.
                continue
            text_left.append(row[0])
            text_right.append(row[1])
    return text_left, text_right

def embed_sentences(sentences, max_length=None):
    """Encode a list of sentences into sentence embeddings.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        sentences: List of strings to encode as one batch.
        max_length: Maximum number of tokens kept per sentence. When omitted,
            the model config's ``max_position_embeddings`` is used (falling
            back to 512). Without an explicit limit ``truncation=True`` is a
            no-op for this tokenizer, because it declares no maximum length.

    Returns:
        The ``pooler_output`` tensor returned by the model for the batch.
    """
    if max_length is None:
        max_length = getattr(model.config, "max_position_embeddings", 512)
    batch = tokenizer(
        sentences,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(**batch, output_hidden_states=True, return_dict=True, sent_emb=True)
    return outputs.pooler_output


text_left, text_right = get_evalCSV()
embeddings_left = embed_sentences(text_left)
embeddings_right = embed_sentences(text_right)

# Pairwise cosine similarity: entry (i, j) compares left sentence i with right
# sentence j. Normalising first (instead of dividing by the product of norms)
# keeps zero vectors from producing NaN.
left_unit = torch.nn.functional.normalize(embeddings_left, dim=1)
right_unit = torch.nn.functional.normalize(embeddings_right, dim=1)
cos_sim_matrix = torch.matmul(left_unit, right_unit.t())
tensor_cpu = cos_sim_matrix.cpu()

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [4]:
# !pip install openai
# !pip install openTSNE
# !pip install datasets

In [5]:
# Put the parent directory on the import path so the local `lib` package resolves.
import sys
sys.path.append("../")
from lib.tsne import TSNE_Plot

# Stack both sides of every sentence pair so they share one t-SNE plot.
merged_list = [*text_left, *text_right]
merged_embed = torch.cat([embeddings_left, embeddings_right], dim=0)

# Unlabelled data: ask TSNE_Plot to cluster the points itself.
tsne_plot = TSNE_Plot(merged_list, merged_embed, n_clusters=4)
tsne_plot.tsne_plot(n_sentence=40)

# # if the data have labels, you can use the following code to visualize the data
# tsne_plot = TSNE_Plot(merged_list, merged_embed, label=labels)
# tsne_plot.tsne_plot(n_sentence=40)

Perplexity value 30 is too high. Using perplexity 13.00 instead
Perplexity value 30 is too high. Using perplexity 13.00 instead


In [6]:
# plt.imshow(tensor_cpu, cmap='jet', interpolation='nearest')
# plt.colorbar()
# plt.show()
import pandas as pd
from lib.heatmap import Heatmap
# Index pairs handed to Heatmap: the even diagonal entries (0, 0) ... (18, 18)
# plus a few hand-picked off-diagonal pairs.
# NOTE(review): presumably these are (left, right) sentence indices, which
# would require at least 19 sentence pairs -- confirm against lib.heatmap.
positions = [(i, i) for i in range(0, 20, 2)] + [(1, 5), (2, 3), (15, 9), (5, 13), (17, 7)]

# Convert each embedding matrix to NumPy once. detach()/cpu() make this work
# for tensors that track gradients or live on a GPU, and copy() keeps the
# arrays independent of the tensors' memory. list() splits the matrix into one
# 1-D vector per sentence.
left_vectors = list(embeddings_left.detach().cpu().numpy().copy())
right_vectors = list(embeddings_right.detach().cpu().numpy().copy())

df = pd.DataFrame({
    "first": text_left,
    "second": text_right,
    "first_embed": left_vectors,
    "second_embed": right_vectors,
})
heatmap = Heatmap(df, positions)
heatmap.create_heatmap()