## Embedding

### Sentence Embedding

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load the pretrained sentence-embedding checkpoint identified by the model id below.
smodel = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
# Encode three single-word inputs (Korean for "school", "study", "workout")
# into one dense vector each.
dense_embeddings = smodel.encode(['학교', '공부', '운동'])
# Called with a single matrix, this compares every row against every other row,
# so the displayed result is a 3x3 similarity matrix.
cosine_similarity(dense_embeddings) # cosine similarity

array([[1.        , 0.5950745 , 0.32537562],
       [0.5950745 , 1.        , 0.5459569 ],
       [0.32537562, 0.5459569 , 1.0000001 ]], dtype=float32)

### One-Hot Encoding

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# One-hot vocabulary: each word owns exactly one non-zero position.
# Vectors are written as 2-D row arrays (shape (1, 3)) because they are passed
# straight to cosine_similarity below.
word_dict = {
    'school': np.array([[1, 0, 0]]),
    'study': np.array([[0, 1, 0]]),
    'workout': np.array([[0, 0, 1]]),
}

# Compute the cosine similarity between pairs of words.
# Distinct one-hot vectors share no non-zero position, so their dot product,
# and therefore their cosine similarity, is 0 for every pair.
cosine_school_study = cosine_similarity(word_dict['school'], word_dict['study']) # similarity value is 0
cosine_school_workout = cosine_similarity(word_dict['school'], word_dict['workout']) # similarity value is 0

### Bi-encoder

In [9]:
from sentence_transformers import SentenceTransformer, models

# BERT-style backbone to use; it produces the per-token embeddings.
word_embedding_model = models.Transformer('klue/roberta-base')
# Build the pooling layer, telling it the backbone's token-embedding dimension.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
# Chain the two modules (backbone first, then pooling) into one SentenceTransformer.
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Bare expression: displays the repr of the assembled SentenceTransformer so the
# module stack and its settings can be inspected.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: RobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)

In [11]:
import torch

def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out positions.

    Args:
        model_output: Indexable whose first element holds the token embeddings
            (assumed to be laid out as (batch, tokens, features) -- confirm
            against the model that produced it).
        attention_mask: Tensor with one entry per token; non-zero entries mark
            tokens that take part in the average, zeros mark tokens to ignore.

    Returns:
        Tensor with dimension 1 reduced away: the mask-weighted sum of the
        token embeddings divided by the mask total.
    """
    hidden_states = model_output[0]
    # Broadcast the per-token mask across the feature dimension so it can be
    # multiplied element-wise with the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp the divisor so a fully masked row divides by 1e-9 rather than by 0.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts

In [12]:
def max_pooling(model_output, attention_mask):
    """Take the element-wise maximum of token embeddings along dimension 1,
    ignoring masked-out positions.

    Masked positions are replaced by the most negative finite value of the
    embedding dtype before the reduction, so they can never be selected as the
    maximum -- even when every real token value in a feature is negative.
    The replacement is done on a copy; the tensor inside ``model_output`` is
    left untouched so it can be reused by other pooling functions.

    Args:
        model_output: Indexable whose first element holds the floating-point
            token embeddings (assumed to be laid out as
            (batch, tokens, features) -- confirm against the producing model).
        attention_mask: Tensor with one entry per token; zeros mark tokens
            that must be ignored.

    Returns:
        Tensor with dimension 1 reduced away, holding the per-feature maximum
        over the unmasked tokens. A row with no unmasked tokens yields the
        dtype's most negative finite value in every feature.
    """
    token_embeddings = model_output[0]
    # Broadcast the per-token mask across the feature dimension.
    is_padding = attention_mask.unsqueeze(-1).expand(token_embeddings.size()) == 0
    # Use the dtype's own minimum instead of a hard-coded constant so the fill
    # value is representable for float16/bfloat16 as well as float32.
    fill_value = torch.finfo(token_embeddings.dtype).min
    # masked_fill is out-of-place: the caller's embeddings are not modified.
    masked_embeddings = token_embeddings.masked_fill(is_padding, fill_value)
    return torch.max(masked_embeddings, 1)[0]

### Generate Text & Image Embeddings with Sentence-Transformers

In [13]:
from sentence_transformers import SentenceTransformer, util

# NOTE: rebinds the notebook-level name `model` to this pretrained checkpoint.
model = SentenceTransformer('snunlp/KR-SBERT-V40K-klueNLI-augSTS')
# Encode three Korean sentences that all end with the same verb but differ in
# meaning (rough glosses on each line).
embs = model.encode([
    "잠이 안 옵니다.",  # "I can't fall asleep."
    "졸음이 옵니다.",  # "I'm getting drowsy."
    "기차가 옵니다."  # "The train is coming."
])
# Compare the embedding set with itself: entry [i][j] is the cosine similarity
# between sentence i and sentence j.
cos_scores = util.cos_sim(embs, embs)
print(cos_scores)

tensor([[1.0000, 0.6500, 0.1976],
        [0.6500, 1.0000, 0.2920],
        [0.1976, 0.2920, 1.0000]])


In [15]:
from PIL import Image
from sentence_transformers import SentenceTransformer, util

# NOTE: rebinds the notebook-level name `model`, this time to a CLIP checkpoint;
# the same object is used below to encode both images and text.
model = SentenceTransformer('clip-ViT-B-32')
# Paths are relative to the working directory, so the cell needs
# examples/dog.jpg and examples/cat.jpg to exist there.
image_embs = model.encode([
    Image.open("examples/dog.jpg"),
    Image.open("examples/cat.jpg")
])
# Captions listed in the same order as the images (dog first, cat second).
text_embs = model.encode([
    "A dog on grass",
    "Brown cat on yellow background"
])
# Rows index images and columns index captions: entry [i][j] is the cosine
# similarity between image i and caption j.
cos_scores = util.cos_sim(image_embs, text_embs)
print(cos_scores)

tensor([[0.2782, 0.1512],
        [0.2071, 0.3180]])
