<h1 style='color:orange; text-align:center; font-weight:bold'>Semantic Similarity</h1>

In [10]:
from transformers import AutoTokenizer, AutoModel
import torch
from torch.nn.functional import cosine_similarity
import matplotlib.pyplot as plt
import warnings
import umap
import torch.nn.functional as F
import pandas as pd

In [24]:
# surpress warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)

In [25]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [26]:
model_name = "indobenchmark/indobert-base-p1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

## **2.1 Semantic Similarity: Word Level**

In [30]:
word_pairs = [
    ("ayah", "bapak"),
    ("ayah", "kursi"),
    ("ayah", "ponsel"),
    ("ayah", "guru"),
    ("ayah", "jeruk"),
    ("ayah", "pekerjaan"),
    ("ayah", "metamorfosis")
]

In [31]:
def get_word_embedding(word):
    inputs = tokenizer(word, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        # take the first actual token (0: [CLS])
        return outputs.last_hidden_state[0][1]

similarities = []
for w1, w2 in word_pairs:
    emb1 = get_word_embedding(w1)
    emb2 = get_word_embedding(w2)
    sim = cosine_similarity(emb1, emb2, dim=0).item()
    similarities.append((w1, w2, sim))

In [32]:
for w1, w2, score in similarities:
    print(f"{w1} <> {w2} = {score:.4f}")

ayah <> bapak = 0.7813
ayah <> kursi = 0.6375
ayah <> ponsel = 0.6325
ayah <> guru = 0.7468
ayah <> jeruk = 0.6337
ayah <> pekerjaan = 0.6396
ayah <> metamorfosis = 0.4039


## **2.2 Semantic Similarity: Sentence Level**

In [17]:
def get_embedding(text):
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

In [18]:
text1 = "Anak itu sedang membaca buku di perpustakaan."
text2 = "Seorang siswa membaca sebuah buku di dalam perpustakaan."

emb1 = get_embedding(text1)
emb2 = get_embedding(text2)

similarity = cosine_similarity(emb1, emb2)
print(f"Similarity: {similarity.item():.4f}")

Similarity: 0.8338


In [19]:
texts = [
    "Ayah makan nasi.",
    "Bapak menyantap nasi.",
    "Ibu memasak di dapur.",
    "Siswa belajar di kelas.",
    "Dia membaca buku.",
    "Anak itu sedang membaca buku di perpustakaan.",
    "Anak bermain di taman.",
    "Petani menanam padi di sawah.",
    "Burung terbang di langit.",
    "Dokter memeriksa pasien.",
    "Mobil melaju di jalan raya.",
    "Guru mengajar matematika.",
    "Saya menonton film di bioskop.",
    "Perempuan itu membeli sayur di pasar.",
    "Kami berjalan-jalan di pusat kota.",
    "Polisi mengatur lalu lintas.",
    "Orang-orang antre di halte bus.",
    "Seorang siswa membaca sebuah buku di dalam perpustakaan.",
    "Dia menulis surat untuk temannya.",
    "Mereka berolahraga di lapangan."
]

embeddings = [get_embedding(t) for t in texts]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

In [47]:
n = len(embeddings)
sim_matrix = torch.zeros((n, n))

for i in range(n):
    for j in range(n):
        sim_matrix[i][j] = F.cosine_similarity(embeddings[i], embeddings[j])

for i, text in enumerate(texts):
    print(f"sent{i+1:02}: {text}")

# change to df
labels = [f"sent{i+1:02}" for i in range(n)]
df_sim = pd.DataFrame(sim_matrix.numpy(), index=labels, columns=labels)
display(df_sim.round(3).style.background_gradient(cmap='YlOrRd'))

sent01: Ayah makan nasi.
sent02: Bapak menyantap nasi.
sent03: Ibu memasak di dapur.
sent04: Siswa belajar di kelas.
sent05: Dia membaca buku.
sent06: Anak itu sedang membaca buku di perpustakaan.
sent07: Anak bermain di taman.
sent08: Petani menanam padi di sawah.
sent09: Burung terbang di langit.
sent10: Dokter memeriksa pasien.
sent11: Mobil melaju di jalan raya.
sent12: Guru mengajar matematika.
sent13: Saya menonton film di bioskop.
sent14: Perempuan itu membeli sayur di pasar.
sent15: Kami berjalan-jalan di pusat kota.
sent16: Polisi mengatur lalu lintas.
sent17: Orang-orang antre di halte bus.
sent18: Seorang siswa membaca sebuah buku di dalam perpustakaan.
sent19: Dia menulis surat untuk temannya.
sent20: Mereka berolahraga di lapangan.


Unnamed: 0,sent01,sent02,sent03,sent04,sent05,sent06,sent07,sent08,sent09,sent10,sent11,sent12,sent13,sent14,sent15,sent16,sent17,sent18,sent19,sent20
sent01,1.0,0.915,0.771,0.582,0.65,0.573,0.659,0.538,0.632,0.649,0.539,0.688,0.623,0.65,0.521,0.605,0.508,0.475,0.605,0.574
sent02,0.915,1.0,0.758,0.593,0.68,0.608,0.64,0.554,0.635,0.684,0.575,0.716,0.624,0.667,0.538,0.615,0.537,0.508,0.597,0.595
sent03,0.771,0.758,1.0,0.679,0.526,0.566,0.799,0.702,0.682,0.586,0.637,0.7,0.649,0.665,0.499,0.615,0.538,0.506,0.538,0.558
sent04,0.582,0.593,0.679,1.0,0.541,0.56,0.759,0.635,0.607,0.55,0.584,0.779,0.643,0.537,0.485,0.613,0.538,0.64,0.489,0.591
sent05,0.65,0.68,0.526,0.541,1.0,0.771,0.55,0.467,0.552,0.693,0.513,0.632,0.666,0.676,0.59,0.558,0.433,0.678,0.767,0.69
sent06,0.573,0.608,0.566,0.56,0.771,1.0,0.604,0.472,0.504,0.617,0.547,0.515,0.636,0.758,0.579,0.509,0.516,0.834,0.672,0.594
sent07,0.659,0.64,0.799,0.759,0.55,0.604,1.0,0.675,0.743,0.511,0.655,0.735,0.682,0.573,0.553,0.644,0.551,0.555,0.54,0.613
sent08,0.538,0.554,0.702,0.635,0.467,0.472,0.675,1.0,0.644,0.475,0.618,0.627,0.597,0.595,0.477,0.56,0.529,0.487,0.486,0.543
sent09,0.632,0.635,0.682,0.607,0.552,0.504,0.743,0.644,1.0,0.521,0.634,0.665,0.638,0.534,0.525,0.601,0.519,0.471,0.51,0.622
sent10,0.649,0.684,0.586,0.55,0.693,0.617,0.511,0.475,0.521,1.0,0.515,0.625,0.555,0.607,0.538,0.605,0.448,0.53,0.621,0.59
