In [1]:
import pandas as pd
from enum import StrEnum
from dataclasses import dataclass
import pickle
import os
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

BOARD_FILENAME = "board.csv"
COLOR_COLUMN = "color"
WORD_COLUMN = "word"

VOCABULARY_FILENAME = "words_alpha.txt"
VOCABULARY_EMBEDDINGS_FILENAME = "vocabulary_embeddings.pkl"

EMBEDDING_MODEL = "all-MiniLM-L6-v2"

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
@dataclass
class Board:
    team_words: list[str]
    enemy_words: list[str]
    neutral_words: list[str]
    assassin_word: str


class Colors(StrEnum):
    BLACK = "black"
    BLUE = "blue"
    RED = "red"
    WHITE = "white"


class TeamColor(StrEnum):
    BLUE = Colors.BLUE
    RED = Colors.RED


def read_board(team_color: TeamColor, board_filename: str = BOARD_FILENAME) -> Board:
    board_df = pd.read_csv(board_filename)
    
    if team_color == TeamColor.BLUE:
        enemy_color = TeamColor.RED
    else:
        enemy_color = TeamColor.BLUE
    
    board = Board(
        team_words=board_df[board_df[COLOR_COLUMN] == team_color][WORD_COLUMN].to_list(),
        enemy_words=board_df[board_df[COLOR_COLUMN] == enemy_color][WORD_COLUMN].to_list(),
        neutral_words=board_df[board_df[COLOR_COLUMN] == Colors.WHITE][WORD_COLUMN].to_list(),
        assassin_word=board_df[board_df[COLOR_COLUMN] == Colors.BLACK][WORD_COLUMN].iloc[0],
    )

    return board

board = read_board(team_color=TeamColor.BLUE)
board

Board(team_words=['sail', 'war', 'luck', 'shell', 'gum', 'turkey', 'director', 'earthquake'], enemy_words=['crab', 'box', 'bonsai', 'pillow', 'joan of arc', 'pentagon', 'second'], neutral_words=['england', 'astronaut', 'wake', 'taste', 'golf', 'mother', 'theater'], assassin_word='radio')

In [13]:
def read_txt_as_list(filename):
    words = []
    with open(filename, 'r') as txtfile:
        for line in txtfile:
            word = line.strip()
            words.append(word)
    return words

# TODO: Improve this to do proper caching based on the board parameter
def get_vocabulary_embeddings(board: Board) -> dict:
    if os.path.exists(VOCABULARY_EMBEDDINGS_FILENAME):
        with open(VOCABULARY_EMBEDDINGS_FILENAME, 'rb') as f:
            vocabulary_embeddings = pickle.load(f)
            return vocabulary_embeddings
    else:
        embedder = SentenceTransformer(EMBEDDING_MODEL)
        vocabulary = read_txt_as_list(VOCABULARY_FILENAME)
        vocabulary.extend(board.team_words + board.enemy_words + board.neutral_words + [board.assassin_word])
        embeddings = embedder.encode(vocabulary, normalize_embeddings=True)
        vocabulary_embeddings = dict(zip(vocabulary, embeddings))
        with open(VOCABULARY_EMBEDDINGS_FILENAME, 'wb') as f:
            pickle.dump(vocabulary_embeddings, f)
        return vocabulary_embeddings
    
vocabulary_embeddings = get_vocabulary_embeddings(board)

In [24]:
def generate_clue(board: Board, vocabulary_embeddings: dict) -> str:
    # Calculate the centroid for the team's words
    team_centroid = np.mean([vocabulary_embeddings[word] for word in board.team_words], axis=0).reshape(1, -1)
    
    # Initialize variables to store the best clue found
    best_clue = None
    best_clue_score = -np.inf  # Start with a very low score

    # Iterate over possible clue words in the embeddings vocabulary
    for clue_word, clue_vector in vocabulary_embeddings.items():
        # Skip if the clue word is any of the board words
        if clue_word in board.team_words + board.enemy_words + board.neutral_words + [board.assassin_word]:
            continue
        
        clue_vector = clue_vector.reshape(1, -1)
        
        # Calculate distances to other categories
        enemy_dist = min([cosine_similarity(clue_vector, vocabulary_embeddings[word].reshape(1, -1)) for word in board.enemy_words])
        neutral_dist = min([cosine_similarity(clue_vector, vocabulary_embeddings[word].reshape(1, -1)) for word in board.neutral_words])
        assassin_dist = cosine_similarity(clue_vector, vocabulary_embeddings[board.assassin_word].reshape(1, -1))
        
        # Calculate score (example scoring, can be adjusted)
        score = cosine_similarity(clue_vector, team_centroid) - enemy_dist - neutral_dist - assassin_dist
        
        # Update best clue if this clue has a better score
        if score > best_clue_score:
            best_clue = clue_word
            best_clue_score = score
            print(f"best clue so far: '{best_clue}' score: {best_clue_score}")

    
    return best_clue
    
clue = generate_clue(board, vocabulary_embeddings)
clue

best clue so far: 'a' score: [[-0.25659236]]
best clue so far: 'aa' score: [[-0.14256954]]
best clue so far: 'aah' score: [[-0.1228091]]
best clue so far: 'aahed' score: [[-0.10565453]]
best clue so far: 'aahing' score: [[-0.04309511]]
best clue so far: 'aam' score: [[-0.04012302]]
best clue so far: 'aarrgh' score: [[-0.03938065]]
best clue so far: 'aasvogel' score: [[-0.02438551]]
best clue so far: 'aasvogels' score: [[0.01436937]]
best clue so far: 'abacterial' score: [[0.01822604]]
best clue so far: 'abaisance' score: [[0.01927333]]
best clue so far: 'abamps' score: [[0.05959095]]
best clue so far: 'abaptiston' score: [[0.08354449]]
best clue so far: 'abattoirs' score: [[0.14845562]]
best clue so far: 'acanthuridae' score: [[0.1612857]]
best clue so far: 'acetylsalicylate' score: [[0.17160147]]
best clue so far: 'aegithognathism' score: [[0.1741601]]
best clue so far: 'amaranthaceae' score: [[0.18790522]]
best clue so far: 'amphibiousness' score: [[0.21894301]]
best clue so far: 'an

'viscoelasticity'