In [1]:
from functools import cache
from keybert import KeyBERT
import numpy as np
import pandas as pd

# Single shared KeyBERT instance, built once at notebook start; the embedding
# helper in this cell reads it as a module-level global.
kw_model = KeyBERT(model="all-MiniLM-L6-v2")

@cache
def create_embeddings(texts, fallback_dim=384):
    """Return the KeyBERT document embedding for ``texts`` as a NumPy array.

    The function is memoised with ``functools.cache``. Consequently ``texts``
    must be hashable (e.g. a single string), and repeated calls with the same
    input return the *same* array object -- treat the result as read-only.

    Parameters
    ----------
    texts : hashable
        Value forwarded unchanged to ``kw_model.extract_embeddings``.
    fallback_dim : int, default 384
        Length of the all-zero vector returned when embedding fails. The
        default keeps the previously hard-coded size; it should equal the
        embedding size of the model held in ``kw_model``.

    Returns
    -------
    numpy.ndarray
        ``kw_model.extract_embeddings(texts)[0][0]`` converted to an array,
        or ``np.zeros(fallback_dim)`` if the embedding step raised.
    """
    try:
        return np.array(kw_model.extract_embeddings(texts)[0][0])
    except Exception as exc:
        # Deliberate best-effort: one bad input must not abort a whole
        # ``Series.apply`` run, so fall back to a zero vector. Only
        # ``Exception`` is caught so Ctrl-C / SystemExit still propagate,
        # and the failure is reported instead of being silently dropped.
        import warnings

        warnings.warn(
            f"create_embeddings: could not embed {texts!r} "
            f"({type(exc).__name__}: {exc}); returning zeros({fallback_dim})",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.zeros(fallback_dim)

In [2]:
df = pd.read_csv('connections_clean.csv')

WORDS_PER_GROUP = 4    # each CSV row contributes the words in columns w1..w4
GROUPS_PER_PUZZLE = 4  # consecutive CSV rows that are bucketed into one puzzle
WORDS_PER_PUZZLE = WORDS_PER_GROUP * GROUPS_PER_PUZZLE

word_columns = [f'w{k}' for k in range(1, WORDS_PER_GROUP + 1)]

# The puzzle id below is derived purely from row position, so the rows must
# split evenly into puzzles; fail here with a clear message rather than later
# when a short trailing puzzle cannot be sampled.
assert len(df) % GROUPS_PER_PUZZLE == 0, (
    f"expected a multiple of {GROUPS_PER_PUZZLE} rows, got {len(df)}"
)

# Flatten row-major: w1..w4 of row 0, then w1..w4 of row 1, and so on.
# Every word is labelled with the index of the CSV row it came from.
word = df[word_columns].to_numpy().ravel().tolist()
label = np.repeat(df.index.to_numpy(), WORDS_PER_GROUP).tolist()

finished_df = pd.DataFrame({'label': label, 'word': word})
# create_embeddings is cached, so a word that appears more than once is only
# embedded the first time it is seen.
finished_df['embedding'] = finished_df['word'].apply(create_embeddings)
# Positional bucketing: every WORDS_PER_PUZZLE consecutive words share an id.
finished_df['puzzle'] = np.arange(len(finished_df)) // WORDS_PER_PUZZLE
finished_df.head()

Unnamed: 0,label,word,embedding,puzzle
0,0,ANNIVERSARY,"[-0.06276984, 0.10821476, 0.047664665, -0.0253...",0
1,0,BIRTHDAY,"[-0.084567346, 0.11157629, -0.006238818, 0.041...",0
2,0,SHOWER,"[-0.004404458, 0.015580897, 0.112289116, 0.019...",0
3,0,WEDDING,"[-0.016065089, 0.090614215, 0.003278323, 0.038...",0
4,1,BARS,"[-0.010722448, 0.019358307, -0.021829613, 0.01...",0


In [3]:
from tqdm import tqdm

NUM_DATA_PER_PUZZLE = 500  # random orderings generated for every puzzle
WORDS_PER_SAMPLE = 16      # words drawn (without replacement) per ordering
RANDOM_SEED = 42           # fixed so the generated dataset is reproducible

# One generator shared by every draw: the run as a whole is repeatable while
# successive samples still differ from each other.
rng = np.random.default_rng(RANDOM_SEED)

X = []  # one flat feature row per sample: the sampled embeddings end to end
Y = []  # 1 if the first two sampled words share a label, else 0

for _puzzle_id, puzzle_words in finished_df.groupby('puzzle'):
    for _ in range(NUM_DATA_PER_PUZZLE):
        shuffled = puzzle_words.sample(
            n=WORDS_PER_SAMPLE, replace=False, random_state=rng
        )
        # Concatenate the embeddings in sampled order, keeping each value's
        # original NumPy scalar type.
        X.append(
            [value for embedding in shuffled['embedding'] for value in embedding]
        )
        # The target only looks at the first two positions of the ordering.
        first_label = shuffled['label'].iloc[0]
        second_label = shuffled['label'].iloc[1]
        Y.append(1 if first_label == second_label else 0)

In [4]:
# Pivot the row-major samples in X into one column per feature position
# (feature_0, feature_1, ...), sized from the first sample.
feature_columns = {
    f'feature_{pos}': [vector[pos] for vector in X]
    for pos in range(len(X[0]))
}
# The target goes last so it ends up as the final CSV column.
data_dict = {**feature_columns, 'label': Y}

data_df = pd.DataFrame(data=data_dict)
# index=False: write only the feature/label columns, not the row index.
data_df.to_csv('siamese_big.csv', index=False)