In [38]:
import pandas as pd

# Load the cleaned Connections rows, exclude the 2023-08-12 puzzle (the reason
# for dropping that date is not recorded in this notebook), and renumber the
# remaining rows from 0.
df = (
    pd.read_csv('connections_clean.csv')
    .loc[lambda frame: frame['date'] != '2023-08-12']
    .reset_index(drop=True)
)
df

Unnamed: 0,date,category,w1,w2,w3,w4
0,2023-10-08,CELEBRATORY OCCASIONS,ANNIVERSARY,BIRTHDAY,SHOWER,WEDDING
1,2023-10-08,PHONE CONNECTIVITY,BARS,RECEPTION,SERVICE,SIGNAL
2,2023-10-08,CITIES AND TOWNS IN ENGLAND,BATH,DERBY,READING,SANDWICH
3,2023-10-08,___ OPERA,COMIC,MET,ROCK,SOAP
4,2023-10-07,RHYMES,DARREN,KAREN,SHARON,AARON
...,...,...,...,...,...,...
467,2023-06-13,LETTER HOMOPHONES,ARE,QUEUE,SEA,WHY
468,2023-06-12,WET WEATHER,HAIL,RAIN,SLEET,SNOW
469,2023-06-12,NBA TEAMS,BUCKS,HEAT,JAZZ,NETS
470,2023-06-12,KEYBOARD KEYS,OPTION,RETURN,SHIFT,TAB


In [24]:
from functools import cache
from keybert import KeyBERT
import numpy as np

# Name of the embedding model handed to KeyBERT; every embedding produced in
# this notebook comes from this one shared model instance.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
kw_model = KeyBERT(model=EMBEDDING_MODEL_NAME)

@cache
def create_embeddings(texts):
    """Return the KeyBERT document embedding for one piece of text.

    Results are memoized with ``functools.cache``, so ``texts`` must be
    hashable (in practice a single string, not a list), and repeated calls
    with the same text return the *same* array object. Treat the returned
    array as read-only; mutating it would change what later calls see.

    Args:
        texts: The text to embed.

    Returns:
        np.ndarray: The first document embedding reported by
        ``kw_model.extract_embeddings``. When that call returns an empty
        result, a float32 zero vector of length 384 is returned instead.
    """
    # Fallback width; presumably the output size of the configured model
    # (all-MiniLM-L6-v2) -- TODO confirm if the model is ever changed.
    embedding_dim = 384

    extracted = kw_model.extract_embeddings(texts)
    if not extracted:
        # NOTE(review): an empty result appears to occur when KeyBERT finds no
        # candidate terms (e.g. empty or stop-word-only text) -- confirm, since
        # such inputs silently receive an all-zero embedding.
        # float32 matches the dtype of the real embeddings so that stacking
        # vectors downstream does not mix float32 with float64.
        return np.zeros(embedding_dim, dtype=np.float32)

    # extracted[0] holds the document embeddings; take the first (only) one.
    return np.array(extracted[0][0])

# Smoke test: embed a sample word and show the first ten components.
hello_embedding = create_embeddings('hello')
hello_embedding[:10]

array([-0.06277172,  0.05495872,  0.05216478,  0.08578996, -0.08274896,
       -0.07457294,  0.06855468,  0.01839648, -0.08201128, -0.03738483],
      dtype=float32)

In [39]:
from tqdm import tqdm

# Build one record per puzzle (one puzzle per date): its categories and words,
# their embeddings, and a per-word group label in both integer and one-hot form.
GROUPS_PER_PUZZLE = 4  # rows of `df` consumed per date; also the one-hot width

puzzles = []
# groupby sorts its keys by default, so records come out in ascending date order.
for date, puzzle in tqdm(df.groupby('date')):
    puzzle_data = {
        'Date': date,
        'Categories': [],
        'Words': [],
        'CategoryEmbeddings': [],
        'WordEmbeddings': [],
        'Labels': [],
        'OneHotCategories': []
    }

    # Assumes every date has at least GROUPS_PER_PUZZLE rows; rows beyond that
    # are ignored.
    for i in range(GROUPS_PER_PUZZLE):
        # Positional row layout: column 0 is skipped, column 1 is the category,
        # and every remaining column is one of that category's words.
        row = puzzle.iloc[i]
        category = row.iloc[1]
        words = row.iloc[2:].tolist()

        puzzle_data['Categories'].append(category)
        puzzle_data['Words'].extend(words)
        puzzle_data['CategoryEmbeddings'].append(create_embeddings(category))
        puzzle_data['WordEmbeddings'].extend(create_embeddings(word) for word in words)

        puzzle_data['Labels'].extend([i] * len(words))
        # Build a fresh one-hot list for every word: repeating a single list
        # object would make all of a group's entries alias each other, so
        # editing one would silently edit the rest.
        puzzle_data['OneHotCategories'].extend(
            [int(j == i) for j in range(GROUPS_PER_PUZZLE)] for _ in words
        )
    puzzles.append(puzzle_data)

puzzle_df = pd.DataFrame(puzzles)
print(puzzle_df.iloc[0])

100%|██████████| 118/118 [00:00<00:00, 4884.08it/s]

Date                                                         2023-06-12
Categories            [WET WEATHER, NBA TEAMS, KEYBOARD KEYS, PALIND...
Words                 [HAIL, RAIN, SLEET, SNOW, BUCKS, HEAT, JAZZ, N...
CategoryEmbeddings    [[-0.029925823, 0.016922211, 0.14112554, 0.058...
WordEmbeddings        [[-0.11493516, 0.123046435, 0.041388754, 0.000...
Labels                 [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]
OneHotCategories      [[1, 0, 0, 0], [1, 0, 0, 0], [1, 0, 0, 0], [1,...
Name: 0, dtype: object





In [40]:
# CSV export, kept unchanged for anything that already reads 'puzzles.csv'.
# The embedding columns hold Python lists of NumPy arrays, which CSV can only
# store as their text representation, so they are awkward to load back as numbers.
puzzle_df.to_csv('puzzles.csv', index=False)
# Also write a pickle, which stores the list/array columns as real objects;
# reload it with pd.read_pickle('puzzles.pkl').
puzzle_df.to_pickle('puzzles.pkl')