In [2]:
from time import sleep
from functools import cache
from keybert import KeyBERT
import numpy as np

# Shared KeyBERT instance built on the 'all-MiniLM-L6-v2' model; create_embeddings()
# below calls its extract_embeddings() method.
kw_model = KeyBERT(model='all-MiniLM-L6-v2')

@cache
def create_embeddings(texts):
    """Return one embedding vector for `texts` as a NumPy array.

    The result of ``kw_model.extract_embeddings(texts)`` is indexed as
    ``[0][0]``, i.e. the first entry of its first element.
    NOTE(review): presumably that is the embedding of the first (only)
    document -- confirm against the KeyBERT documentation.

    Results are memoised with ``functools.cache``, so `texts` must be
    hashable (e.g. a ``str``) and repeated calls with the same argument
    return the very same array object; callers should not mutate it.

    Returns:
        The embedding as ``np.ndarray``, or a 384-element zero vector when
        the extraction result is falsy.
        NOTE(review): 384 is assumed to match the model's embedding
        width -- confirm.
    """
    extracted = kw_model.extract_embeddings(texts)

    # Nothing usable came back: fall back to an all-zero vector.
    if not extracted:
        return np.zeros(384)

    first_embedding = extracted[0][0]
    return np.array(first_embedding)

# Sanity check: display the first 10 components of the embedding for 'hello'.
create_embeddings('hello')[:10]

array([-0.06277172,  0.05495872,  0.05216478,  0.08578996, -0.08274896,
       -0.07457294,  0.06855468,  0.01839648, -0.08201128, -0.03738483],
      dtype=float32)

In [3]:
def get_similarity(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    Args:
        a: 1-D array-like of numbers.
        b: 1-D array-like of numbers, same length as `a`.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either vector has zero
        norm the cosine is undefined; 0.0 is returned in that case instead
        of dividing by zero (which would yield NaN plus a RuntimeWarning).
        create_embeddings() can hand back an all-zero vector, so this case
        is reachable.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

# Example: cosine similarity between the embeddings of 'dog' and 'cat'.
get_similarity(create_embeddings('dog'), create_embeddings('cat'))

0.6606376

In [19]:
from itertools import combinations


# Words to be split into groups of four by embedding similarity.
arr = ['DUST', 'STRIKE', 'BAD', 'THIMBLE', 'SOCK', 'BOOT', 'BASE', 'BALL', 'IRON', 'HONEY', 'BLOW', 'TOP HAT', 'GLOVE', 'BAT', 'BUGS', 'LICK']
# Lower-case every word and drop empty entries.
arr = [x.lower() for x in arr if x]

# The similarity of a pair of words does not change between rounds, so compute
# each pair once up front instead of once per candidate group per round.
# combinations() keeps the input order, so every pair looked up later (taken
# from a subsequence of `arr`) is already a key of this table.
pair_similarity = {
    (first, second): get_similarity(create_embeddings(first), create_embeddings(second))
    for first, second in combinations(arr, 2)
}

# Greedy selection: pick the best-scoring group of four, remove its words from
# the pool, and repeat until fewer than four words are left.
for _ in range(len(arr) // 4):
    # A group's score is the sum of the similarities of its six word pairs.
    scored_groups = (
        (sum(pair_similarity[pair] for pair in combinations(group, 2)), group)
        for group in combinations(arr, 4)
    )

    # max() returns the first top-scoring candidate, the same element a stable
    # descending sort would place at index 0, without sorting every candidate.
    best = max(scored_groups, key=lambda scored: scored[0])

    print(best)
    for word in best[1]:
        arr.remove(word)

(2.4787303507328033, ('strike', 'ball', 'glove', 'bat'))
(2.309135138988495, ('sock', 'honey', 'blow', 'lick'))
(1.9411942660808563, ('dust', 'bad', 'iron', 'bugs'))
(1.5761931538581848, ('thimble', 'boot', 'base', 'top hat'))


In [None]:
from sklearn.model_selection import train_test_split

# NOTE(review): X and Y are not defined in any visible cell of this notebook;
# they must already exist in the kernel, so this cell fails on a fresh
# "Restart & Run All". Confirm where they are built.
X = np.array(X)
Y = np.array(Y)

# Hold out 20% of the samples for testing. A fixed random_state makes the
# split identical on every run, so reported test metrics are comparable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers.legacy import Adam # legacy faster on m2 macs
import tensorflow_addons as tfa

# Fully connected binary classifier: four ReLU hidden layers (512 -> 256 ->
# 128 -> 64), each followed by batch normalisation and 50% dropout, and a
# single sigmoid output unit.
# NOTE(review): the input width 16*384 presumably means 16 words x 384-dim
# embeddings flattened into one vector -- confirm against how X is built.
layers = [Dense(512, activation='relu', input_shape=(16*384,))]
for width in (256, 128, 64):
    layers += [
        BatchNormalization(),
        Dropout(0.5),
        Dense(width, activation='relu'),
    ]
layers += [
    BatchNormalization(),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
]
model = Sequential(layers)

# Binary cross-entropy loss with accuracy as the tracked metric.
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train for 20 epochs; fit() itself holds out 20% of the training data for
# validation, and the per-epoch metrics are kept in `history`.
history = model.fit(
    X_train,
    Y_train,
    epochs=20,
    batch_size=32,
    validation_split=0.2,
)

import matplotlib.pyplot as plt

# Side-by-side training curves: accuracy on the left, loss on the right,
# each showing the training and validation series recorded by model.fit().
fig, (accuracy_ax, loss_ax) = plt.subplots(1, 2, figsize=(12, 4))

panels = (
    (accuracy_ax, 'accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    (loss_ax, 'loss', 'val_loss', 'Model loss', 'Loss'),
)
for ax, train_key, val_key, title, y_label in panels:
    ax.plot(history.history[train_key])
    ax.plot(history.history[val_key])
    ax.set_title(title)
    ax.set_ylabel(y_label)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Val'], loc='upper left')

plt.show()
 
# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; index 1 is the accuracy.
print(f'Train accuracy: {model.evaluate(X_train, Y_train)[1]}')
# X_test is the held-out split from train_test_split, not the validation
# subset carved out by fit(validation_split=...), so report it as test accuracy.
print(f'Test accuracy: {model.evaluate(X_test, Y_test)[1]}')

# Count the test samples whose predicted probability exceeds the 0.5 threshold.
predictions = model.predict(X_test)
same = int(np.count_nonzero(predictions[:, 0] > 0.5))

print(f'Predicted same: {same} out of {len(Y_test)}')
