In [1]:
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer,  SentencesDataset, LoggingHandler, losses, util, InputExample, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
import math
import numpy as np

In [2]:
# Route INFO-and-above records through sentence-transformers' LoggingHandler and
# prefix every message with a timestamp.
log_handler = LoggingHandler()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[log_handler],
)

In [3]:
# Stage 1 (NLI) settings: mini-batch size and the checkpoint to start from.
train_batch_size = 16
model_name = 'kpfbert-base'

In [4]:
# Wrap the pretrained checkpoint as the token-embedding module of the sentence encoder.
word_embedding_model = models.Transformer(model_name_or_path=model_name)

Some weights of the model checkpoint at kpfbert-base were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at kpfbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probab

In [5]:
# Collapse the per-token vectors into one fixed-size sentence vector by averaging
# them; CLS-token and max pooling stay switched off.
token_embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(
    token_embedding_dim,
    pooling_mode_cls_token=False,
    pooling_mode_max_tokens=False,
    pooling_mode_mean_tokens=True,
)

In [6]:
# Sentence encoder = transformer token embeddings followed by the pooling layer.
encoder_modules = [word_embedding_model, pooling_model]
model = SentenceTransformer(modules=encoder_modules)

2021-12-13 15:03:49 - Use pytorch device: cuda


In [7]:
# Progress marker for the NLI stage.
# NOTE(review): the message says "AllNLI", but the file opened in the next cell is
# KorNLI's snli_1.0_train.ko.tsv — confirm whether the wording should be updated.
logging.info("Read AllNLI train dataset")

2021-12-13 15:03:49 - Read AllNLI train dataset


In [8]:
# Map each NLI class name to the integer id used as the training label.
label2int = dict(contradiction=0, entailment=1, neutral=2)
# Filled with the NLI training examples by the cell that reads the TSV file.
train_samples = list()

In [9]:
# Build the NLI training set from a tab-separated file with three columns:
# sentence 1, sentence 2 and the class name (a key of `label2int`).
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of the data to an already-filled `train_samples`.
train_samples = []
with open('KorNLUDatasets/KorNLI/snli_1.0_train.ko.tsv', "rt", encoding="utf-8") as fIn:
    # Stream the file line by line instead of materialising it with readlines().
    for line in fIn:
        if not line.strip():
            continue  # ignore blank lines (e.g. a stray newline at end of file)
        s1, s2, label = line.split('\t')
        label = label2int[label.strip()]
        train_samples.append(InputExample(texts=[s1, s2], label=label))

In [10]:
# Wrap the NLI examples for shuffled mini-batching and set up a softmax
# classification loss whose label count equals the number of NLI classes.
train_dataset = SentencesDataset(train_samples, model=model)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=train_batch_size,
    shuffle=True,
)
sentence_dim = model.get_sentence_embedding_dimension()
train_loss = losses.SoftmaxLoss(
    model=model,
    sentence_embedding_dimension=sentence_dim,
    num_labels=len(label2int),
)

2021-12-13 15:03:51 - Softmax loss: #Vectors concatenated: 3


In [11]:
# The STS dev split serves as the development set; its examples are collected here.
dev_samples = list()
logging.info("Read STSbenchmark dev dataset")

2021-12-13 15:03:51 - Read STSbenchmark dev dataset


In [12]:
# Build the development set from a tab-separated file with three columns:
# sentence 1, sentence 2 and a numeric similarity score.
# The list is rebuilt from scratch here so that re-running this cell does not
# append a second copy of the data to an already-filled `dev_samples`.
dev_samples = []
with open('KorNLUDatasets/KorSTS/tune_dev.tsv', 'rt', encoding='utf-8') as fIn:
    # Stream the file line by line instead of materialising it with readlines().
    for line in fIn:
        if not line.strip():
            continue  # ignore blank lines (e.g. a stray newline at end of file)
        s1, s2, score = line.split('\t')
        # presumably gold scores are on a 0-5 scale, so this maps them to 0-1 — TODO confirm
        score = float(score.strip()) / 5.0
        dev_samples.append(InputExample(texts=[s1, s2], label=score))

In [13]:
# Evaluator built from the dev pairs; it is registered under the name 'sts-dev'
# and encodes with the same batch size as training.
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
    batch_size=train_batch_size,
)

In [14]:
num_epochs = 1

# Warm up over the first 10% of all optimisation steps (rounded up).
total_train_steps = len(train_dataset) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2021-12-13 15:03:51 - Warmup-steps: 3439


In [15]:
# Stage 1: fine-tune on the NLI objective, running the dev evaluator every 1000 steps.
# No output_path is passed here; the resulting model is written out by the
# explicit model.save(...) call in a later cell.
nli_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[nli_objective],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34385 [00:00<?, ?it/s]

2021-12-13 15:05:48 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 1000 steps:
2021-12-13 15:05:51 - Cosine-Similarity :	Pearson: 0.7155	Spearman: 0.7139
2021-12-13 15:05:51 - Manhattan-Distance:	Pearson: 0.7028	Spearman: 0.7072
2021-12-13 15:05:51 - Euclidean-Distance:	Pearson: 0.7018	Spearman: 0.7064
2021-12-13 15:05:51 - Dot-Product-Similarity:	Pearson: 0.6725	Spearman: 0.6729
2021-12-13 15:07:47 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 2000 steps:
2021-12-13 15:07:50 - Cosine-Similarity :	Pearson: 0.6100	Spearman: 0.6310
2021-12-13 15:07:50 - Manhattan-Distance:	Pearson: 0.6174	Spearman: 0.6306
2021-12-13 15:07:50 - Euclidean-Distance:	Pearson: 0.6147	Spearman: 0.6286
2021-12-13 15:07:50 - Dot-Product-Similarity:	Pearson: 0.5835	Spearman: 0.5933
2021-12-13 15:09:47 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 3000 steps:
2021-12-13 15:09:49 - Cosine-

2021-12-13 15:43:36 - Cosine-Similarity :	Pearson: 0.7393	Spearman: 0.7544
2021-12-13 15:43:36 - Manhattan-Distance:	Pearson: 0.7576	Spearman: 0.7549
2021-12-13 15:43:36 - Euclidean-Distance:	Pearson: 0.7580	Spearman: 0.7548
2021-12-13 15:43:36 - Dot-Product-Similarity:	Pearson: 0.7305	Spearman: 0.7362
2021-12-13 15:45:34 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 21000 steps:
2021-12-13 15:45:37 - Cosine-Similarity :	Pearson: 0.7563	Spearman: 0.7702
2021-12-13 15:45:37 - Manhattan-Distance:	Pearson: 0.7711	Spearman: 0.7693
2021-12-13 15:45:37 - Euclidean-Distance:	Pearson: 0.7711	Spearman: 0.7689
2021-12-13 15:45:37 - Dot-Product-Similarity:	Pearson: 0.7475	Spearman: 0.7492
2021-12-13 15:47:34 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 22000 steps:
2021-12-13 15:47:37 - Cosine-Similarity :	Pearson: 0.7489	Spearman: 0.7643
2021-12-13 15:47:37 - Manhattan-Distance:	Pearson: 0.7671	Spearman: 0.765

In [16]:
# Persist the NLI-tuned encoder before the STS stage continues training it.
nli_model_dir = 'output/kpfSBERT_nli'
model.save(nli_model_dir)

2021-12-13 16:12:30 - Save model to output/kpfSBERT_nli


In [17]:
# Stage 2 (STS) settings: output directory, number of epochs and mini-batch size.
model_save_path = 'output/kpfSBERT'
num_epochs = 4
train_batch_size = 16

In [18]:
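# Left disabled: while this line stays commented out, the STS stage below keeps
# training the `model` object produced by the NLI stage. Uncommenting it would
# instead rebuild `model` from the `model_name` checkpoint.
# model = SentenceTransformer(model_name)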

In [19]:
# Progress marker for the STS fine-tuning stage; the data files themselves are
# read in the next cell.
logging.info("Read STSbenchmark train dataset")

2021-12-13 16:12:31 - Read STSbenchmark train dataset


In [20]:
def load_sts_samples(path):
    """Read a tab-separated sentence-pair file into a list of InputExample objects.

    Every non-blank line must hold exactly three tab-separated fields:
    sentence 1, sentence 2 and a numeric score. The score is divided by 5.0
    before being stored as the example's label.

    Args:
        path: Location of the UTF-8 encoded TSV file.

    Returns:
        A new list with one InputExample per non-blank line, in file order.

    Raises:
        ValueError: If a line does not split into three fields or its score
            cannot be parsed as a float.
    """
    samples = []
    with open(path, 'rt', encoding='utf-8') as fIn:
        # Stream the file line by line instead of materialising it with readlines().
        for line in fIn:
            if not line.strip():
                continue  # ignore blank lines (e.g. a stray newline at end of file)
            s1, s2, score = line.split('\t')
            samples.append(InputExample(texts=[s1, s2], label=float(score.strip()) / 5.0))
    return samples


# One loader call per split replaces three copies of the same parsing loop.
train_samples = load_sts_samples('KorNLUDatasets/KorSTS/tune_train.tsv')
dev_samples = load_sts_samples('KorNLUDatasets/KorSTS/tune_dev.tsv')
test_samples = load_sts_samples('KorNLUDatasets/KorSTS/tune_test.tsv')

# Training objects for the STS stage: shuffled mini-batches and a cosine-similarity loss.
train_dataset = SentencesDataset(train_samples, model)
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

In [21]:
# Development set: the evaluator compares embedding similarity of the dev pairs
# with their gold labels.
logging.info("Read STSbenchmark dev dataset")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    name='sts-dev',
)

# Warm up over the first 10% of all optimisation steps (rounded up).
total_train_steps = len(train_dataset) * num_epochs / train_batch_size
warmup_steps = math.ceil(total_train_steps * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

2021-12-13 16:12:31 - Read STSbenchmark dev dataset
2021-12-13 16:12:31 - Warmup-steps: 144


In [22]:
# Stage 2: continue training on the STS objective, evaluating on the dev
# evaluator and passing model_save_path as the output location.
sts_objective = (train_dataloader, train_loss)
model.fit(
    train_objectives=[sts_objective],
    evaluator=evaluator,
    epochs=num_epochs,
    evaluation_steps=1000,
    warmup_steps=warmup_steps,
    output_path=model_save_path,
)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-12-13 16:13:20 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 0:
2021-12-13 16:13:23 - Cosine-Similarity :	Pearson: 0.8501	Spearman: 0.8508
2021-12-13 16:13:23 - Manhattan-Distance:	Pearson: 0.8438	Spearman: 0.8494
2021-12-13 16:13:23 - Euclidean-Distance:	Pearson: 0.8442	Spearman: 0.8500
2021-12-13 16:13:23 - Dot-Product-Similarity:	Pearson: 0.8385	Spearman: 0.8373
2021-12-13 16:13:23 - Save model to output/kpfSBERT


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-12-13 16:14:14 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 1:
2021-12-13 16:14:16 - Cosine-Similarity :	Pearson: 0.8519	Spearman: 0.8539
2021-12-13 16:14:16 - Manhattan-Distance:	Pearson: 0.8482	Spearman: 0.8551
2021-12-13 16:14:16 - Euclidean-Distance:	Pearson: 0.8480	Spearman: 0.8549
2021-12-13 16:14:16 - Dot-Product-Similarity:	Pearson: 0.8384	Spearman: 0.8382
2021-12-13 16:14:16 - Save model to output/kpfSBERT


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-12-13 16:15:07 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 2:
2021-12-13 16:15:09 - Cosine-Similarity :	Pearson: 0.8557	Spearman: 0.8567
2021-12-13 16:15:09 - Manhattan-Distance:	Pearson: 0.8506	Spearman: 0.8569
2021-12-13 16:15:09 - Euclidean-Distance:	Pearson: 0.8507	Spearman: 0.8570
2021-12-13 16:15:09 - Dot-Product-Similarity:	Pearson: 0.8453	Spearman: 0.8444
2021-12-13 16:15:09 - Save model to output/kpfSBERT


Iteration:   0%|          | 0/360 [00:00<?, ?it/s]

2021-12-13 16:16:00 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset after epoch 3:
2021-12-13 16:16:02 - Cosine-Similarity :	Pearson: 0.8556	Spearman: 0.8563
2021-12-13 16:16:02 - Manhattan-Distance:	Pearson: 0.8497	Spearman: 0.8565
2021-12-13 16:16:02 - Euclidean-Distance:	Pearson: 0.8499	Spearman: 0.8567
2021-12-13 16:16:02 - Dot-Product-Similarity:	Pearson: 0.8448	Spearman: 0.8440


In [23]:
# Reload the model stored under model_save_path and score it on the test pairs.
# The evaluator call is kept as the last expression so that its return value is
# displayed as the cell output.
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    name='sts-test',
)
test_evaluator(model, output_path=model_save_path)

2021-12-13 16:16:02 - Load pretrained SentenceTransformer: output/kpfSBERT
2021-12-13 16:16:03 - Use pytorch device: cuda
2021-12-13 16:16:03 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-test dataset:
2021-12-13 16:16:05 - Cosine-Similarity :	Pearson: 0.8324	Spearman: 0.8364
2021-12-13 16:16:05 - Manhattan-Distance:	Pearson: 0.8298	Spearman: 0.8345
2021-12-13 16:16:05 - Euclidean-Distance:	Pearson: 0.8302	Spearman: 0.8343
2021-12-13 16:16:05 - Dot-Product-Similarity:	Pearson: 0.8247	Spearman: 0.8257


0.8363526559449966

In [24]:
# TEST1: rank the corpus sentences by cosine similarity to each query

model_path = model_save_path

model = SentenceTransformer(model_path)

# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
          '한 남자가 빵 한 조각을 먹는다.',
          '그 여자가 아이를 돌본다.',
          '한 남자가 말을 탄다.',
          '한 여자가 바이올린을 연주한다.',
          '두 남자가 수레를 숲 속으로 밀었다.',
          '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
          '원숭이 한 마리가 드럼을 연주한다.',
          '치타 한 마리가 먹이 뒤에서 달리고 있다.']

corpus_embeddings = model.encode(corpus, convert_to_tensor=True)

# Query sentences:
queries = ['한 남자가 파스타를 먹는다.',
           '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
           '치타가 들판을 가로 질러 먹이를 쫓는다.']

# Find the closest sentences of the corpus for each query based on cosine similarity.
# The request is capped at the corpus size so a short corpus cannot raise an
# out-of-range error.
top_k = min(5, len(corpus))
for query in queries:
    query_embedding = model.encode(query, convert_to_tensor=True)
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]

    # Tensor.topk yields the k largest scores (best first) together with their
    # corpus indices, so no detour through NumPy is needed.
    top_scores, top_indices = cos_scores.topk(k=top_k)

    print("\n\n======================\n\n")
    print("Query:", query)
    # The heading reports the number of hits actually shown instead of a fixed "5".
    print("\nTop %d most similar sentences in corpus:" % top_k)

    for score, idx in zip(top_scores.tolist(), top_indices.tolist()):
        print(corpus[idx].strip(), "(Score: %.4f)" % score)

2021-12-13 16:16:06 - Load pretrained SentenceTransformer: output/kpfSBERT
2021-12-13 16:16:06 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]





Query: 한 남자가 파스타를 먹는다.

Top 5 most similar sentences in corpus:
한 남자가 음식을 먹는다. (Score: 0.6732)
한 남자가 빵 한 조각을 먹는다. (Score: 0.6184)
한 남자가 말을 탄다. (Score: 0.1289)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0810)
두 남자가 수레를 숲 속으로 밀었다. (Score: 0.0695)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]





Query: 고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.

Top 5 most similar sentences in corpus:
원숭이 한 마리가 드럼을 연주한다. (Score: 0.6221)
한 여자가 바이올린을 연주한다. (Score: 0.1462)
한 남자가 담으로 싸인 땅에서 백마를 타고 있다. (Score: 0.0656)
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.0326)
한 남자가 말을 탄다. (Score: 0.0280)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]





Query: 치타가 들판을 가로 질러 먹이를 쫓는다.

Top 5 most similar sentences in corpus:
치타 한 마리가 먹이 뒤에서 달리고 있다. (Score: 0.8170)
두 남자가 수레를 숲 속으로 밀었다. (Score: 0.1619)
원숭이 한 마리가 드럼을 연주한다. (Score: 0.1177)
한 여자가 바이올린을 연주한다. (Score: 0.0597)
한 남자가 음식을 먹는다. (Score: 0.0072)


In [25]:
# TEST2: cluster the sentences with k-means
from sklearn.cluster import KMeans

model_path = model_save_path

model = SentenceTransformer(model_path)

# Corpus with example sentences
corpus = ['한 남자가 음식을 먹는다.',
          '한 남자가 빵 한 조각을 먹는다.',
          '그 여자가 아이를 돌본다.',
          '한 남자가 말을 탄다.',
          '한 여자가 바이올린을 연주한다.',
          '두 남자가 수레를 숲 속으로 밀었다.',
          '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.',
          '원숭이 한 마리가 드럼을 연주한다.',
          '치타 한 마리가 먹이 뒤에서 달리고 있다.',
          '한 남자가 파스타를 먹는다.',
          '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.',
          '치타가 들판을 가로 질러 먹이를 쫓는다.']

corpus_embeddings = model.encode(corpus)

# Then, we perform k-means clustering using sklearn.
# random_state pins the centroid initialisation so that cluster membership and
# numbering are the same on every run; n_init is spelled out because its default
# differs between scikit-learn releases.
num_clusters = 5
clustering_model = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
clustering_model.fit(corpus_embeddings)
cluster_assignment = clustering_model.labels_

# Group the sentences by the cluster id assigned to them.
clustered_sentences = [[] for _ in range(num_clusters)]
for sentence_id, cluster_id in enumerate(cluster_assignment):
    clustered_sentences[cluster_id].append(corpus[sentence_id])

for i, cluster in enumerate(clustered_sentences):
    print("Cluster ", i+1)
    print(cluster)
    print("")

2021-12-13 16:16:07 - Load pretrained SentenceTransformer: output/kpfSBERT
2021-12-13 16:16:07 - Use pytorch device: cuda


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Cluster  1
['치타 한 마리가 먹이 뒤에서 달리고 있다.', '치타가 들판을 가로 질러 먹이를 쫓는다.']

Cluster  2
['한 남자가 음식을 먹는다.', '한 남자가 빵 한 조각을 먹는다.', '한 남자가 파스타를 먹는다.']

Cluster  3
['그 여자가 아이를 돌본다.', '한 여자가 바이올린을 연주한다.']

Cluster  4
['한 남자가 말을 탄다.', '두 남자가 수레를 숲 속으로 밀었다.', '한 남자가 담으로 싸인 땅에서 백마를 타고 있다.']

Cluster  5
['원숭이 한 마리가 드럼을 연주한다.', '고릴라 의상을 입은 누군가가 드럼을 연주하고 있다.']

