In [None]:
from datetime import datetime
from pytz import timezone
import pandas as pd
import numpy as np
import os
from sentence_transformers import SentenceTransformer
from annoy import AnnoyIndex
from tqdm import tqdm

In [None]:
# Timestamp (Asia/Seoul) taken before any work starts; the end of the script
# subtracts it from a second timestamp to report the elapsed time.
start = datetime.now(tz=timezone("Asia/Seoul"))

# Load the sentence table from the Excel workbook in the working directory.
# This script reads its "text" column here and its "id" column further down.
print("- Loading data 'aihub_or_kr-sports_ko.xlsx'...", end=" ")
data = pd.read_excel("aihub_or_kr-sports_ko.xlsx")
print("completed!")

# Instantiate the pre-trained Korean Sentence-BERT model by name.
model_name = "jhgan/ko-sroberta-sts"
print("- Downloading pre-trained Sentence BERT model 'jhgan/ko-sroberta-sts'...", end=" ")
model = SentenceTransformer(model_name)
print("completed!")

# Encode every sentence in one call. `normalize_embeddings=True` requests
# normalized vectors; the progress bar is why this message is printed without
# `end=" "`, unlike the ones above.
print("- Encoding each text to 768-dimensional vectors....")
sentences = data["text"].tolist()
vecs = model.encode(
    sentences,
    show_progress_bar=True,
    normalize_embeddings=True,
)
print("completed!")

# --- Build an approximate nearest-neighbour index over the embeddings -------
n_trees = 8
print(f"- Building a forest of {n_trees} trees for nearest neighbor search...", end=" ")
# Must match the length of each embedding vector added below.
dim = 768
tree = AnnoyIndex(f=dim, metric="dot")
# Items are keyed by their 0-based row position. Annoy allocates storage for
# max(item id) + 1 items, so starting at 1 would leave an unused item 0 that
# was never given a vector inside the index.
for pos, vec in enumerate(vecs):
    tree.add_item(pos, vec)
tree.build(n_trees=n_trees, n_jobs=-1)
print("completed!")

# --- Collect every pair of sentences whose similarity reaches the threshold --
print("- Searching up to 5 nearest neighbors for each sentence...")
sim_thr = 0.9
n_neighbors = 5
# Resolve ids and texts by row position so the output is right even when the
# "id" column is not exactly 1..n in row order.
row_ids = data["id"].tolist()
row_texts = data["text"].tolist()
pairs = []
for pos1 in tqdm(range(len(data))):
    nbr_positions, sims = tree.get_nns_by_item(i=pos1, n=n_neighbors, include_distances=True)
    for pos2, sim in zip(nbr_positions, sims):
        # `pos1 < pos2` drops the query's match with itself and keeps each
        # unordered pair only once.
        if sim >= sim_thr and pos1 < pos2:
            pairs.append((row_ids[pos1], row_ids[pos2], row_texts[pos1], row_texts[pos2], sim))
res = pd.DataFrame(pairs, columns=["id1", "id2", "text1", "text2", "similarity"])

# Most similar pairs first; ties are ordered by the two ids.
res = res.sort_values(by=["similarity", "id1", "id2"], ascending=[False, True, True])
print("completed!")

# Write the result table to an Excel workbook, omitting the DataFrame index.
# `encoding` is intentionally not passed: pandas documented that keyword of
# `DataFrame.to_excel` as unused, deprecated it in 1.5 and removed it in 2.0,
# where passing it raises TypeError.
print("- Saving the result as 'Semantic_textual_similarity_Result.xlsx'...", end=" ")
res.to_excel("Semantic_textual_similarity_Result.xlsx", index=False)
print("completed!")

# Second timestamp (Asia/Seoul); paired with `start` for the timing report.
end = datetime.now(timezone("Asia/Seoul"))

# Summary: start/end wall-clock times, elapsed time, and number of pairs found.
time_fmt = "%Y-%m-%d %H:%M:%S"
print("- All the processes are done;")
print(f"    - {'The program started at:':<24s}{start.strftime(time_fmt):>20s}.")
print(f"    - {'The program ended at:':<24s}{end.strftime(time_fmt):>20s}.")
elapsed = (end - start).total_seconds()
# Round to whole seconds once, then split. Formatting the minutes and seconds
# separately with `.0f` could print e.g. "1mins and 60secs" for 119.7 seconds.
total_secs = round(elapsed)
mins, secs = divmod(total_secs, 60)
print(f"    - {mins:,}mins and {secs}secs ({total_secs:,}secs) elapsed.")
print(f"    - {len(res):,} pair(s) of sentences showed a similarity of {sim_thr} or more.")