In [None]:
from transformers import set_seed, BertTokenizer, BertModel,BertForMaskedLM
from tqdm import tqdm
import argparse, torch
import pandas as pd
import umap.umap_ as umap
import matplotlib.pyplot as plt
from matplotlib import colors
import torch.nn.functional as f
import numpy as np
from sklearn.manifold import TSNE
# Fix the random seed up front so repeated runs of the notebook are comparable.
set_seed(2022)

In [None]:
# Notebook-wide settings, declared through argparse so the same options can be
# reused from a command-line script.
parser = argparse.ArgumentParser(description='search for best template according to dev set')
parser.add_argument('--max_len', default=512, type=int, help="max sequence length")
parser.add_argument('--batch_size', default=2, type=int, help="batch size")
parser.add_argument('--model', default='../models/my_bert/', type=str, help="pretrained model")
parser.add_argument('--tokenizer', default='../models/my_bert/', type=str, help="tokenizer")
parser.add_argument('--task', default='douban', type=str, help="task name")
parser.add_argument('--datasets', default='../datasets_ppl_score/', type=str, help="dataset dir")
parser.add_argument('--template', default='很好。', type=str, help="template")
parser.add_argument('--input_data', default='../datasets/', type=str, help="input data dir")
# Parse an explicit empty list: every option takes its default and the
# kernel's own sys.argv is never consulted.
args = parser.parse_args(args=[])

# Use the first GPU when CUDA is available; otherwise fall back to CPU so the
# notebook still runs on machines without a GPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the tokenizer and the masked-LM checkpoint from the configured paths,
# then move the model onto the configured device.
tokenizer = BertTokenizer.from_pretrained(str(args.tokenizer))
pretrained_model = BertForMaskedLM.from_pretrained(args.model)
pretrained_model = pretrained_model.to(device)

In [None]:
# Read the task's CSV. With header=0 plus explicit names, the file's own header
# row is dropped and the two columns are exposed as `labels` and `text`.
csv_path = f'{args.input_data}{args.task}_output.csv'
pd_all = pd.read_csv(csv_path, names=['labels', 'text'], header=0)

# Plain Python lists for the embedding loop and the plot.
labels = pd_all['labels'].tolist()
texts = pd_all['text'].tolist()

In [None]:
# Build one fixed-size vector per text from the masked-LM output at the
# prompt's [MASK] slot, collecting them in `text_embeddings` for the next cell.
text_embeddings = []
with torch.no_grad():
    # NOTE(review): only the first 10 texts are embedded — confirm this limit is intended.
    for text in tqdm(texts[:10]):
        # Prepend the cloze prompt so the model predicts [MASK] in the context of the text.
        text = '[MASK]好。' + text

        # Truncate to args.max_len so long texts cannot exceed the model's
        # maximum sequence length; the prompt sits at the front and survives truncation.
        inputs = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=args.max_len,
        ).to(device)
        outputs = pretrained_model(**inputs).logits  # (1, seq_len, vocab_size)

        # The prompt's [MASK] is prepended, so the first match is the one we want.
        mask_positions = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
        # NOTE(review): the [MASK]-position logits are used as the text vector —
        # confirm this is the intended representation.
        text_embeddings.append(outputs[0, mask_positions[0]])

        

In [None]:
# `outputs` already holds the logits tensor from the embedding loop, so inspect
# its shape directly (a tensor has no `.logits` attribute).
outputs.shape

In [None]:
# Stack the per-text vectors into one tensor. The guard keeps this cell
# re-runnable: after the first run `text_embeddings` is already a tensor, and
# torch.stack rejects a bare tensor argument.
if not torch.is_tensor(text_embeddings):
    text_embeddings = torch.stack(text_embeddings)

# L2-normalise along dim 1 (one row per text, presumably) and move the result
# to the CPU for the projection step.
norm_text_vectors = f.normalize(text_embeddings, p=2, dim=1).cpu()

In [None]:
## visualize
# Fit UMAP on the normalised vectors with a fixed random_state, then project
# the same vectors through the fitted model.
umap_options = dict(n_neighbors=15, min_dist=1.0, random_state=2022)
manifold = umap.UMAP(**umap_options)
manifold.fit(norm_text_vectors)
X_reduced_2 = manifold.transform(norm_text_vectors)

# Alternative projection kept for reference:
# X_reduced_2 = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3,random_state=0).fit_transform(norm_text_vectors)


In [None]:
# Scatter the first two projected coordinates, coloured by class label (blue / red).
cmap = colors.ListedColormap(['b', 'r'])
fig, ax = plt.subplots(figsize=(20, 20))

# Use only as many labels as there are projected points: the embedding loop may
# cover just a subset of the texts, and scatter raises when `c` and the
# coordinates differ in length.
point_labels = labels[:len(X_reduced_2)]
scatter1 = ax.scatter(X_reduced_2[:, 0], X_reduced_2[:, 1], c=point_labels, s=10, cmap=cmap)
legend1 = ax.legend(*scatter1.legend_elements(), title="Classes")

In [None]:
# Show the distinct label values in the dataset (the scatter colormap above lists two colours).
np.unique(labels)