In [1]:
!pip install tabulate
!pip install sentence_transformers

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting sentence_transformers
  Downloading sentence_transformers-5.1.2-py3-none-any.whl (488 kB)
     -------------------------------------- 488.0/488.0 kB 7.6 MB/s eta 0:00:00
Collecting huggingface-hub>=0.20.0
  Downloading huggingface_hub-1.1.7-py3-none-any.whl (516 kB)
     ------------------------------------- 516.2/516.2 kB 16.3 MB/s eta 0:00:00
Collecting typing_extensions>=4.5.0
  Downloading typing_extensions-4.15.0-py3-none-any.whl (44 kB)
     ---------------------------------------- 44.6/44.6 kB ? eta 0:00:00
Collecting transformers<5.0.0,>=4.41.0
  Downloading transformers-4.57.3-py3-none-any.whl (12.0 MB)
     ---------------------------------------- 12.0/12.0 MB 6.6 MB/s eta 0:00:00
Collecting shellingham
  Downloading shellingham-1.5.4-py2.py3-none-any.whl (9.8 kB)
Collecting typer-slim
  Downloading typer_slim-0



In [4]:
import numpy as np
from sentence_transformers import SentenceTransformer
from tabulate import tabulate
from sklearn.preprocessing import normalize

In [3]:
# Load the sentence-embedding model once; the module-level `model` is what the
# encoding helpers in the cells below call `.encode(...)` on.
# NOTE(review): "all-mpnet-base-v2" is presumably fetched and cached by
# sentence_transformers on first use — confirm before running offline.
model = SentenceTransformer("all-mpnet-base-v2")

In [5]:
# Word lists to be ordered along a scale; add further lists here as needed.
animals = [
    "kitten", "puppy", "rabbit", "panda",
    "hedgehog", "hamster", "dolphin", "horse",
    "penguin", "fox", "deer", "owl",
    "cow", "chicken", "goat", "pig",
    "parrot", "squirrel", "rat", "snake",
    "spider", "bat", "vulture", "shark",
    "cockroach", "maggot", "worm", "hyena",
]

In [29]:
# Collapse several similar words into one anchor vector so that the noise of
# any single word's embedding is averaged out.
def encode_anchor(words):
    """Return one unit-length anchor vector for a group of words.

    Each word in `words` is embedded with the module-level `model`, the
    embeddings are passed through `normalize`, averaged over the word axis,
    and the average is rescaled to unit Euclidean length.

    Parameters
    ----------
    words : list of str
        The words that together define the anchor.

    Returns
    -------
    numpy.ndarray
        The normalised mean embedding.
    """
    unit_rows = normalize(model.encode(words))
    centroid = unit_rows.mean(axis=0)
    return centroid / np.linalg.norm(centroid)

In [10]:
# Project v3 onto the straight line that runs through v1 and v2.
def proj_meas(v1, v2, v3):
    """Measure where `v3` falls relative to the axis from `v1` to `v2`.

    Parameters
    ----------
    v1, v2 : array-like
        End points of the axis; `t` is 0.0 at `v1` and 1.0 at `v2`.
    v3 : array-like
        The point to project onto the axis.

    Returns
    -------
    d : float
        Distance from `v3` to its projection on the line through `v1`/`v2`.
    proj_point : numpy.ndarray
        The projection of `v3` onto that line.
    t : float
        Position of the projection along the axis (may be < 0 or > 1 when
        the projection falls outside the segment).

    Raises
    ------
    ValueError
        If `v1` and `v2` coincide, because no axis is defined then (the
        unguarded division would silently yield NaN for every output).
    """
    # asarray without a dtype keeps the caller's precision and lets plain
    # lists be passed as well as arrays.
    v1, v2, v3 = (np.asarray(vec) for vec in (v1, v2, v3))

    axis = v2 - v1
    offset = v3 - v1

    axis_sq = np.dot(axis, axis)
    if axis_sq == 0:
        raise ValueError("v1 and v2 coincide, so they do not define a scale axis")

    # t is how far along the spectrum v3 is: 0.0 for v1, 1.0 for v2.
    t = np.dot(offset, axis) / axis_sq
    along_axis = t * axis

    d = np.linalg.norm(offset - along_axis)
    proj_point = v1 + along_axis
    return d, proj_point, t

In [30]:
# Build the list and sort it by position on the scale between the two anchors.
def make_scale_list(words1, words2, word_list):
    """Print `word_list` ordered along the scale from `words1` to `words2`.

    Both anchor word groups are turned into a single vector with
    `encode_anchor`. Every word in `word_list` is embedded with the
    module-level `model`, scaled to unit length and measured against the
    anchor axis with `proj_meas`. The result is printed as a table sorted by
    ascending `t`.

    Parameters
    ----------
    words1 : list of str
        Words defining the start of the scale (t = 0.0).
    words2 : list of str
        Words defining the end of the scale (t = 1.0).
    word_list : sequence of str
        Words to place on the scale. May be empty, in which case only the
        table headers are printed.

    Returns
    -------
    None
        The table is printed, not returned.
    """
    start_anchor = encode_anchor(words1)
    end_anchor = encode_anchor(words2)

    # (t, word, distance) tuples sort by t first, then word, then distance.
    measured = []
    for word in word_list:
        embedding = model.encode(word)
        embedding = embedding / np.linalg.norm(embedding)

        dist, _, t = proj_meas(start_anchor, end_anchor, embedding)
        measured.append((t, word, dist))
    measured.sort()

    # Rescale distances so the word closest to the axis gets 1 and the
    # farthest gets 0. An explicit array avoids relying on tuple/numpy-scalar
    # broadcasting, and the zero-spread case (e.g. a single word) is handled
    # separately so it does not divide by zero and print NaN.
    dists = np.asarray([dist for _, _, dist in measured])
    if dists.size == 0:
        normed_dists = dists
    else:
        nearest = dists.min()
        spread = dists.max() - nearest
        if spread == 0:
            normed_dists = np.ones_like(dists)
        else:
            normed_dists = 1 - (dists - nearest) / spread

    # Build table
    table = [
        [word, f"{t:.3f}", f"{dist:.3f}", f"{nor_dist:.3f}"]
        for (t, word, dist), nor_dist in zip(measured, normed_dists)
    ]

    headers = ["Word", "t (scale)", "Distance", "Normalized Distance"]
    print('From ', words1, ' to ', words2, ':')
    print(tabulate(table, headers=headers, tablefmt="fancy_grid"))
        
# Rank the animals on a scale running from hostile words to friendly words.
make_scale_list(
    words1=["angry", "hostile", "aggressive", "mean", "irritable"],
    words2=["friendly", "nice", "gentle", "kind", "affectionate"],
    word_list=animals,
)

From  ['angry', 'hostile', 'aggressive', 'mean', 'irritable']  to  ['friendly', 'nice', 'gentle', 'kind', 'affectionate'] :
╒═══════════╤═════════════╤════════════╤═══════════════════════╕
│ Word      │   t (scale) │   Distance │   Normalized Distance │
╞═══════════╪═════════════╪════════════╪═══════════════════════╡
│ squirrel  │       0.409 │      1.06  │                 0.834 │
├───────────┼─────────────┼────────────┼───────────────────────┤
│ cockroach │       0.411 │      1.099 │                 0.438 │
├───────────┼─────────────┼────────────┼───────────────────────┤
│ maggot    │       0.412 │      1.097 │                 0.452 │
├───────────┼─────────────┼────────────┼───────────────────────┤
│ snake     │       0.422 │      1.077 │                 0.655 │
├───────────┼─────────────┼────────────┼───────────────────────┤
│ vulture   │       0.429 │      1.075 │                 0.681 │
├───────────┼─────────────┼────────────┼───────────────────────┤
│ rat       │       0.43  │    