In [1]:
# !pip install tabulate
# !pip install sentence_transformers

In [None]:
import fasttext
import numpy as np
from sentence_transformers import SentenceTransformer
from nltk.corpus import wordnet as wn

import itertools
# from tabulate import tabulate
# import faiss

  from tqdm.autonotebook import tqdm, trange
2025-12-09 22:20:23.910683: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-12-09 22:20:24.062106: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [1]:
# Animal names to be ranked, kept in alphabetical order.
# List taken from https://englishteabreak.netlify.app/hub/animals
# "Cougar" is deliberately left out: the word can also be used as a slur.
animals = [
    # A
    "Aardvark", "Alligator", "Alpaca", "Ant", "Anteater", "Antelope",
    "Armadillo",
    # B
    "Baboon", "Badger", "Barracuda", "Bat", "Bear", "Beaver", "Bee",
    "Beetle", "Bird", "Bison", "Bobcat", "Buffalo", "Bull", "Butterfly",
    # C
    "Calf", "Camel", "Capybara", "Caribou", "Cat", "Caterpillar",
    "Centipede", "Chameleon", "Cheetah", "Chick", "Chicken", "Chimpanzee",
    "Chinchilla", "Chipmunk", "Cicada", "Clam", "Cockroach", "Cow", "Crab",
    "Crane", "Crayfish", "Cricket", "Crocodile", "Crow", "Cub",
    # D
    "Deer", "Dog", "Dolphin", "Donkey", "Dove", "Dragonfly", "Duck",
    "Duckling",
    # E
    "Eagle", "Elephant", "Elk",
    # F
    "Fawn", "Ferret", "Firefly", "Fish", "Flamingo", "Flea", "Fly", "Foal",
    "Fox", "Frog",
    # G
    "Gazelle", "Gecko", "Giraffe", "Goat", "Goose", "Gopher", "Gorilla",
    "Grasshopper", "Guinea Pig",
    # H
    "Hamster", "Hawk", "Hedgehog", "Hippo", "Horse", "Hummingbird", "Hyena",
    # I - K
    "Iguana", "Jaguar", "Jellyfish", "Kangaroo", "Kitten", "Koala",
    # L
    "Ladybug", "Lamb", "Lemming", "Lemur", "Leopard", "Lion", "Lizard",
    "Llama", "Lobster", "Lynx",
    # M
    "Manatee", "Manta Ray", "Meerkat", "Monkey", "Moose", "Mosquito", "Moth",
    "Mountain Lion", "Mouse", "Mule", "Muskrat", "Mussel",
    # N - O
    "Narwhal", "Newt", "Ocelot", "Octopus", "Opossum", "Orangutan", "Otter",
    "Owl", "Oyster",
    # P
    "Panda", "Parrot", "Pelican", "Penguin", "Pig", "Pigeon", "Piglet",
    "Piranha", "Platypus", "Pony", "Porcupine", "Prairie Dog", "Prawn",
    "Praying Mantis", "Puma", "Puppy",
    # Q - R
    "Quail", "Rabbit", "Raccoon", "Rat", "Rhino", "Robin", "Rooster",
    # S
    "Salamander", "Scorpion", "Sea Lion", "Sea Urchin", "Seagull", "Seal",
    "Shark", "Sheep", "Shrew", "Shrimp", "Silverfish", "Skunk", "Sloth",
    "Slug", "Snail", "Snake", "Sparrow", "Spider", "Squid", "Squirrel",
    "Starfish", "Stork", "Swan",
    # T
    "Tadpole", "Termite", "Tick", "Tiger", "Toad", "Tortoise", "Turkey",
    "Turtle",
    # W - Z
    "Wallaby", "Walrus", "Weasel", "Whale", "Wildebeest", "Wolf", "Wombat",
    "Woodpecker", "Worm", "Yak", "Zebra",
]

# The fastText model file has to be present locally before this runs.
# Download the archive from
#   https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
# and unpack it: the loader below is given the decompressed `cc.en.300.bin`,
# looked up relative to the current working directory.
FASTTEXT_MODEL_PATH = 'cc.en.300.bin'
model = fasttext.load_model(FASTTEXT_MODEL_PATH)

# Places a point relative to the axis v1 -> v2 using plain triangle geometry
# (law of cosines) rather than vector algebra.
def proj_meas(v1, v2, v3):
    """Project ``v3`` onto the axis that runs from ``v1`` to ``v2``.

    The three points are treated as a triangle. The law of cosines gives the
    cosine of the angle ``alpha`` at ``v1`` between the axis (``v1 -> v2``)
    and the side ``v1 -> v3``; the projection and the perpendicular distance
    follow from it.

    Parameters
    ----------
    v1, v2 : array-like
        End points of the axis. ``t`` is 0 at ``v1`` and 1 at ``v2``.
    v3 : array-like
        The point to place relative to the axis.

    Returns
    -------
    d : float
        Perpendicular distance from ``v3`` to the line through ``v1`` and
        ``v2``.
    t : float
        Position of the projection of ``v3`` along the axis, in units of the
        axis length (negative before ``v1``, above 1 beyond ``v2``).

    Notes
    -----
    If ``v1`` and ``v2`` coincide there is no axis to project onto; the
    function then returns the plain distance from ``v1`` to ``v3`` as ``d``
    and ``t = 0.0`` instead of NaN.
    """
    v1, v2, v3 = np.asarray(v1), np.asarray(v2), np.asarray(v3)

    dl = np.linalg.norm(v1 - v3)  # side v1 -> v3
    dr = np.linalg.norm(v2 - v3)  # side v2 -> v3
    dh = np.linalg.norm(v1 - v2)  # the axis itself

    if dh == 0:
        # Degenerate axis: avoid the 0/0 that would otherwise produce NaN.
        return dl, 0.0

    if dl == 0:
        # v3 sits on v1; the angle is undefined, but both results are 0 anyway.
        cos_alpha = 1.0
    else:
        # Rounding can push the ratio marginally outside [-1, 1] for
        # (near-)collinear points, which would turn into NaN further down.
        cos_alpha = np.clip((dl**2 + dh**2 - dr**2) / (2 * dl * dh), -1.0, 1.0)

    # sin(alpha) >= 0 because alpha is an interior triangle angle in [0, pi].
    sin_alpha = np.sqrt(1.0 - cos_alpha**2)

    d = dl * sin_alpha
    t = dl * cos_alpha / dh
    # print('cos(alpha), ', cos_alpha, 'should angle be >pi/2', dr>dh)
    return d, t

def find_antonym(word, num_neighbours=300):
    """Return a WordNet antonym related to ``word``, or ``None``.

    ``word`` itself is looked up in WordNet first. Only when that yields no
    antonym are its ``num_neighbours`` nearest fastText neighbours tried, in
    the order the model returns them. The first antonym encountered wins.

    Parameters
    ----------
    word : str
        Word to find an opposite for.
    num_neighbours : int, optional
        How many fastText neighbours to fall back on.

    Returns
    -------
    str or None
        Name of the first antonym lemma found, or ``None`` if neither the
        word nor any of the neighbours has an antonym in WordNet.
    """

    def first_antonym(token):
        # Walk every synset and lemma of `token`; stop at the first antonym.
        for syn in wn.synsets(token):
            for lemma in syn.lemmas():
                for ant in lemma.antonyms():
                    return ant.name()  # plain string, not a Lemma object
        return None

    # fastText does not list the query word among its own neighbours, so
    # without this direct lookup the word's own antonym is never considered.
    own = first_antonym(word)
    if own is not None:
        return own

    for _, neighbour in model.get_nearest_neighbors(word, num_neighbours):
        found = first_antonym(neighbour)
        if found is not None:
            return found

    return None  # no antonym found

def create_regular_list(word, word_list, num_neighbours=300):
    """Order ``word_list`` along the axis from ``word`` to its antonym.

    The antonym comes from ``find_antonym``. Every entry of ``word_list`` is
    embedded with the fastText model and projected onto the axis with
    ``proj_meas``; the resulting ``t`` is the entry's score (0 at ``word``,
    1 at the antonym).

    Parameters
    ----------
    word : str
        Word defining the start ("bad") end of the axis.
    word_list : sequence of str
        Words to rank.
    num_neighbours : int, optional
        Passed through to ``find_antonym``.

    Returns
    -------
    ordered_words : list
        ``word_list`` sorted by ascending score.
    ordered_scores : list
        The scores, in the same order.
    bad_words, good_words : list of str
        The words defining the two ends of the axis.

    Raises
    ------
    ValueError
        If ``find_antonym`` returns ``None`` for ``word``.
    """
    antonym = find_antonym(word, num_neighbours)
    if antonym is None:
        # Fail here with a readable message instead of handing None to the model.
        raise ValueError(
            f"find_antonym found no antonym for {word!r} "
            f"(num_neighbours={num_neighbours})"
        )
    bad_words, good_words = [word], [antonym]

    bad_mean = np.mean([model.get_word_vector(w) for w in bad_words], axis=0)
    good_mean = np.mean([model.get_word_vector(w) for w in good_words], axis=0)

    # Only t is needed here; the perpendicular distance is discarded.
    scores = np.array([
        proj_meas(bad_mean, good_mean, model.get_word_vector(candidate))[1]
        for candidate in word_list
    ])
    words = np.array(word_list)

    # Sort once and reuse the permutation so words and scores stay aligned.
    order = np.argsort(scores)
    ordered_words = list(words[order])
    ordered_scores = list(scores[order])
    return ordered_words, ordered_scores, bad_words, good_words

def create_t_adjusted_list(word, word_list, num_neighbours=100, weight=1):
    """Order ``word_list`` along the ``word`` -> antonym axis, damping outliers.

    Works like a plain projection ranking, but each score ``t`` is pulled
    towards the mean score in proportion to how far the word lies from the
    axis::

        t_adjusted = t + weight * (mean(t) - t) * normed_dist

    where ``normed_dist`` is the perpendicular distance min-max scaled to
    [0, 1] over ``word_list``. With ``weight=1`` the word farthest from the
    axis ends up exactly at the mean; ``weight=0`` disables the adjustment.

    Parameters
    ----------
    word : str
        Word defining the start ("bad") end of the axis.
    word_list : sequence of str
        Words to rank.
    num_neighbours : int, optional
        Passed through to ``find_antonym``.
    weight : float, optional
        Strength of the pull towards the mean.

    Returns
    -------
    ordered_words : list
        ``word_list`` sorted by ascending adjusted score.
    ordered_scores : list
        The adjusted scores, in the same order.
    bad_words, good_words : list of str
        The words defining the two ends of the axis.

    Raises
    ------
    ValueError
        If ``find_antonym`` returns ``None`` for ``word``.
    """
    antonym = find_antonym(word, num_neighbours)
    if antonym is None:
        # Fail here with a readable message instead of handing None to the model.
        raise ValueError(
            f"find_antonym found no antonym for {word!r} "
            f"(num_neighbours={num_neighbours})"
        )
    bad_words, good_words = [word], [antonym]

    if len(word_list) == 0:
        # Nothing to rank; min/max/mean below are undefined on empty input.
        return [], [], bad_words, good_words

    bad_mean = np.mean([model.get_word_vector(w) for w in bad_words], axis=0)
    good_mean = np.mean([model.get_word_vector(w) for w in good_words], axis=0)

    # (d, t) per candidate: distance from the axis and position along it.
    measurements = [
        proj_meas(bad_mean, good_mean, model.get_word_vector(candidate))
        for candidate in word_list
    ]
    dists = np.array([d for d, _ in measurements])
    scores = np.array([t for _, t in measurements])
    words = np.array(word_list)

    nearest = np.min(dists)
    spread = np.max(dists) - nearest
    if spread > 0:
        normed_dists = (dists - nearest) / spread
    else:
        # All words equally far from the axis (e.g. a single word): there is
        # no relative outlier, so skip the adjustment instead of dividing by 0.
        normed_dists = np.zeros_like(dists)

    t_avg = np.mean(scores)
    t_adjusted = scores + weight * (t_avg - scores) * normed_dists

    # Sort once and reuse the permutation so words and scores stay aligned.
    order = np.argsort(t_adjusted)
    ordered_scores = list(t_adjusted[order])
    ordered_words = list(words[order])
    return ordered_words, ordered_scores, bad_words, good_words


# Rank the animals on the scale that runs from `word` to the antonym found for it.
word = 'ugly'
# Alternative: the same ranking without the distance-based score adjustment.
# ordered_words, ordered_scores, bad_words, good_words = create_regular_list(word, animals)
ordered_words, ordered_scores, bad_words, good_words = create_t_adjusted_list(word, animals)
# Show the start of the scale, then both ends, then every animal with its
# score in ascending order (lowest scores first).
print(bad_words)
print('from', word, ' to ', good_words[0])
for word2, score in zip(ordered_words, ordered_scores):
    print(word2, score)


NameError: name 'fasttext' is not defined