## SimCSE test

In [1]:
# Show the installed PyTorch version; the bare expression on the last line is
# what the notebook renders as this cell's output.
import torch
torch.__version__

'1.10.0+cu113'

In [2]:
# Load the `princeton-nlp/unsup-simcse-bert-base-uncased` checkpoint through the
# `simcse` wrapper package.
# NOTE(review): `model` is rebound by the next code cell before anything reads it,
# so this object is never used for inference in the visible notebook.
from simcse import SimCSE
model = SimCSE('princeton-nlp/unsup-simcse-bert-base-uncased')

11/04/2021 13:33:25 - INFO - simcse.tool -   Use `cls_before_pooler` for unsupervised models. If you want to use other pooling policy, specify `pooler` argument.


In [3]:
import torch
from scipy.spatial.distance import cosine
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer and encoder for the SimCSE checkpoint.
# The checkpoint is downloaded automatically on first use.
tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-roberta-large")
model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-roberta-large")

# Tokenize input texts
# NOTE(review): the inputs are Korean and the two recorded similarities are nearly
# equal (0.984 vs 0.969) — confirm this checkpoint is meant to handle Korean text.
texts = [
    "저기 스케이트보드를 타는 학생이 있다.",
    "한 남자가 스케이트 보드를 탄다.",
    "할아버지가 식사를 하신다."
]
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Get the embeddings.
# Only `pooler_output` is read below, so the per-layer hidden states are not
# requested (they would be computed and returned without ever being used).
with torch.no_grad():
    embeddings = model(**inputs, return_dict=True).pooler_output

# scipy operates on NumPy arrays: convert explicitly (moving to CPU first) instead
# of relying on implicit tensor-to-array coercion.
embeddings_np = embeddings.cpu().numpy()

# Calculate cosine similarities
# `cosine` returns a distance, so similarity = 1 - distance.
# Cosine similarities are in [-1, 1]. Higher means more similar
cosine_sim_0_1 = 1 - cosine(embeddings_np[0], embeddings_np[1])
cosine_sim_0_2 = 1 - cosine(embeddings_np[0], embeddings_np[2])

print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[1], cosine_sim_0_1))
print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[2], cosine_sim_0_2))

Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "한 남자가 스케이트 보드를 탄다." is: 0.984
Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "할아버지가 식사를 하신다." is: 0.969


## Sentence Transformer

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

# Mean pooling - the attention mask is taken into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings over dimension 1, weighted by the attention mask.

    Args:
        model_output: Indexable model output whose first element holds the
            token embeddings.
        attention_mask: Mask tensor. A trailing axis is appended and it is
            expanded to the shape of the token embeddings before use
            (presumably 1 = real token, 0 = padding — confirm against the tokenizer).

    Returns:
        The mask-weighted sum of the embeddings along dimension 1 divided by
        the mask sum along that dimension; the divisor is clamped to at least
        1e-9 so an all-zero mask does not divide by zero.
    """
    hidden = model_output[0]
    # Broadcast the mask to the embedding shape so it can weight every feature.
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    weighted_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1e-9)
    return weighted_sum / token_count


# Load the tokenizer and encoder from the HuggingFace Hub
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/distiluse-base-multilingual-cased-v1')
model = AutoModel.from_pretrained('sentence-transformers/distiluse-base-multilingual-cased-v1')

# Sentences to embed
sentences = [
    "저기 스케이트보드를 타는 학생이 있다.",
    "한 남자가 스케이트 보드를 탄다.",
    "할아버지가 식사를 하신다."
]

# Tokenize as one padded / truncated batch of PyTorch tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Forward pass without gradient tracking
with torch.no_grad():
    model_output = model(**encoded_input)

# Mask-aware mean pooling followed by L2 normalisation along dim 1
sentence_embeddings = F.normalize(
    mean_pooling(model_output, encoded_input['attention_mask']),
    p=2,
    dim=1,
)

print("Sentence embeddings:", sentence_embeddings, sep="\n")

Sentence embeddings:
tensor([[-0.0342,  0.0079,  0.0563,  ..., -0.0561,  0.0047,  0.0192],
        [-0.0190, -0.0207,  0.0216,  ..., -0.0090, -0.0134,  0.0204],
        [ 0.0161, -0.0826,  0.0426,  ..., -0.0160, -0.0296, -0.0425]])


In [5]:

# Imported in this cell so the section does not depend on an earlier section
# having already brought `cosine` into the kernel namespace.
from scipy.spatial.distance import cosine

# `cosine` returns a distance, so similarity = 1 - distance.
cosine_sim_0_1 = 1 - cosine(sentence_embeddings[0], sentence_embeddings[1])
cosine_sim_0_2 = 1 - cosine(sentence_embeddings[0], sentence_embeddings[2])

# Report the first sentence against each of the other two.
for other_idx, similarity in ((1, cosine_sim_0_1), (2, cosine_sim_0_2)):
    print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (sentences[0], sentences[other_idx], similarity))

Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "한 남자가 스케이트 보드를 탄다." is: 0.758
Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "할아버지가 식사를 하신다." is: 0.134


## KR-SBERT
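- Prerequisite: the files from https://github.com/snunlp/KR-SBERT must be available locally, because the model below is loaded from the relative path `KR-SBERT/KR-SBERT-V40K-klueNLI-augSTS`.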


In [6]:
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
# Load KR-SBERT.
# NOTE(review): the identifier is presumably resolved as a path relative to the
# working directory — confirm the KR-SBERT files are present there.
model = SentenceTransformer('KR-SBERT/KR-SBERT-V40K-klueNLI-augSTS')

# Sentences to compare (the first one is the reference)
texts = [
    "저기 스케이트보드를 타는 학생이 있다.",
    "한 남자가 스케이트보드를 탄다.",
    "할아버지가 식사를 하신다."
]

# Encode every sentence, then build the pairwise cosine-similarity matrix
vectors = model.encode(texts)
similarities = cosine_similarity(vectors)

# Row 0 holds the reference sentence's similarity to each sentence
for other_idx in (1, 2):
    print("Cosine similarity between \"%s\" and \"%s\" is: %.3f" % (texts[0], texts[other_idx], similarities[0][other_idx]))

11/04/2021 13:33:48 - INFO - sentence_transformers.SentenceTransformer -   Load pretrained SentenceTransformer: KR-SBERT/KR-SBERT-V40K-klueNLI-augSTS
11/04/2021 13:33:53 - INFO - sentence_transformers.SentenceTransformer -   Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "한 남자가 스케이트보드를 탄다." is: 0.793
Cosine similarity between "저기 스케이트보드를 타는 학생이 있다." and "할아버지가 식사를 하신다." is: 0.186
