In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

# Target Creation
This notebook is the first attempt at creating our target data for the supervised portion of the codenames RL bot. Basic idea is to run through lots of games and pick out the clue, number combos that worked to form a baseline. In the absense of a human game dataset we will generate by code

In [2]:
from os import getcwd, chdir

chdir("../")  # Hate this, but quick and dirty

import logging
import pandas as pd

from codenames.greedy_matrix_helpers import *


logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

<IPython.core.display.Javascript object>

In [4]:
def get_topn_from_vocab_file(topn, vocab_file="data/en_vocab_100k.txt"):
    vocab_lines = open(vocab_file, "r").readlines()
    vocab_list = [
        l.replace("\n", "").lower()
        for idx, l in enumerate(vocab_lines)
        if "#!" not in l and idx < topn
    ]
    # Need to remember codenames words!!
    with open("codenames/words.txt", "r", newline="\n") as inputfile:
        codenames_words = inputfile.read().split("\n")
    vocab_list += list(map(str.lower, codenames_words))
    return set(vocab_list)

<IPython.core.display.Javascript object>

In [102]:
def collect_game_data(model, **kwargs):
    codenames_game = CodenamesGame(words_loc="codenames/words.txt")
    all_cards, _, _, _, _, _, _ = get_game_data(codenames_game, model)
    
    assassin_weight = kwargs.get("assassin_weight", -10)
    enemy_weight = kwargs.get("enemy_weight", -5)
    neutral_weight = kwargs.get("neutral_weight", 0)
    ally_weight = kwargs.get("ally_weight", 10)
    risk_weight = kwargs.get("risk_weight", 0)
    clue_score_threshold = kwargs.get("clue_score_threshold", 0)
    one_word_clues = kwargs.get("one_word_clues", False)
    with_normalisation = kwargs.get("with_normalisation", False)
    codenames_data = []
    while not codenames_game.winning_team:
        (
            cards,
            ally_cards,
            enemy_cards,
            neutral_cards,
            assassin_card,
            spymaster_map,
            team,
        ) = get_game_data(codenames_game, model)

        enemy_team = "blue" if team == "red" else "red"

        word_combos, vector_combos = create_word_combinations_matrices(
            ally_cards, model, default_max_combo=9, one_word_clues=one_word_clues
        )
        word_combo_clue_dict = get_most_similar_vectors_for_combos(
            word_combos, cards, model, negative_cards=[assassin_card], topn=10
        )

        clue_tuples = create_clue_tuples(word_combo_clue_dict, cards, model)

        clue_df = create_clue_df(clue_tuples, cards)

        best_clue, weighted_df = calculate_best_clue(
            clue_df,
            spymaster_map,
            ally_cards,
            enemy_cards,
            neutral_cards,
            assassin_card,
            assassin_weight=assassin_weight,
            enemy_weight=enemy_weight,
            neutral_weight=ally_weight,
            ally_weight=ally_weight,
            risk_weight=risk_weight,
            clue_score_threshold=clue_score_threshold,
            with_normalisation=False,
        )

        if not one_word_clues and weighted_df.actual_combo_length.max() > 1:
            weighted_df = weighted_df[weighted_df.actual_combo_length > 1]
#         else:
#             weighted_df = weighted_df[weighted_df.actual_combo_length >= 1]

        # Removing any clue duplicates (where there were multiple intended combos but the amended combos are identical)
        weighted_df = (
            weighted_df.reset_index()
            .drop_duplicates(subset=["clue", "weighted_score", "actual_combo_length"],)
            .sort_values(["weighted_score"], ascending=False)
        )
        weighted_df.reset_index(inplace=True)
        best_clue_idx = weighted_df.weighted_score.argmax()
        best_clue_row = weighted_df.loc[best_clue_idx]
        clue = best_clue_row["clue"]
        clue_number = best_clue_row["actual_combo_length"]
        remaining_cards = cards.copy()
        revealed_cards_before_clue = codenames_game.revealed
        intended_cards = best_clue_row["amended_combo"]
        words_guessed = []
        if clue_number == 0:
            print("Clue number below threshold, game ended")
            break
        while (
            team == codenames_game.current_team
            and codenames_game.round_score < clue_number
        ):
            # If word not in vocab, none or special "I don't fucking know"
            guess = model.most_similar_to_given(clue, remaining_cards)
            codenames_game.make_guess(guess.upper())
            remaining_cards.remove(guess)
            correct = codenames_game.current_team == team
            if correct:
                words_guessed.append(guess)

            if codenames_game.round_score == clue_number:
                codenames_game.next_turn()
                if clue_number != 1 or len(ally_cards) == 1:
                    yield {
                        "clue": clue,
                        "clue_number": clue_number,
                        "intended_combo": intended_cards,
                        "words_guessed": words_guessed,
                        "cards": all_cards,
                        "revealed_before_clue": revealed_cards_before_clue,
                        "assassin_weight": assassin_weight,
                        "enemy_weight": enemy_weight,
                        "neutral_weight": neutral_weight,
                        "ally_weight": ally_weight,
                        "risk_weight": risk_weight,
                        "clue_score_threshold": clue_score_threshold,
                    }




<IPython.core.display.Javascript object>

In [111]:
model_name = "conceptnet-numberbatch-17-06-300"

model_path = join("models", model_name)
if isfile(model_path):
    model = KeyedVectors.load(model_path)
else:
    model = api.load(model_name)
    model.save(model_path)

topn_vocab = 5000
restrict_vocab_to_english(model)
vocab_set = get_topn_from_vocab_file(topn_vocab)
restrict_vocab_with_set(model, vocab_set)

with open("data/5k_conceptnet_game_data/model_vocab.txt", "w") as outfile:
    outfile.write("\n".join(str(item) for item in list(model.index2entity)))

dataset_size = 10000
df_filename = f"data/5k_conceptnet_game_data/{dataset_size}_dataset.csv"

codenames_data = []
while len(codenames_data) < dataset_size:
    kwargs = {
        "assassin_weight": np.random.uniform(-10, 0),
        "enemy_weight": np.random.uniform(-10, 0),
        "neutral_weight": np.random.uniform(-10, 1),
        "ally_weight": np.random.uniform(1, 11),
        "risk_weight": np.random.uniform(0, 10),
        "clue_score_threshold": np.random.uniform(0.05, 0.4),
        "one_word_clues": False,
        "with_normalisation": True,
    }

    codenames_data += [d for d in collect_game_data(model, **kwargs)]
    if (current_size := len(codenames_data)) % 100 == 0:
        print(f"{current_size=}")

pd.DataFrame.from_dict(codenames_data).to_csv(df_filename, index=False)

Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
current_size=200
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below thres

Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
current_size=1900
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below thre

Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
current_size=3600
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below thre

Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
current_size=5100
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below thre

Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended


Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended
Clue number below threshold, game ended


<IPython.core.display.Javascript object>

In [113]:
df = pd.DataFrame.from_dict(codenames_data)
df.head().style

Unnamed: 0,clue,clue_number,intended_combo,words_guessed,cards,revealed_before_clue,assassin_weight,enemy_weight,neutral_weight,ally_weight,risk_weight,clue_score_threshold
0,character,2,"['figure', 'comic']","['comic', 'figure']","['figure', 'comic', 'head', 'dragon', 'ham', 'contract', 'new_york', 'ice', 'box', 'nurse', 'amazon', 'sound', 'torch', 'dwarf', 'hawk', 'vacuum', 'club', 'shoe', 'flute', 'knight', 'plate', 'hole', 'crane', 'lab', 'duck']","[True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True]",-3.780462,-3.87691,-0.527268,4.891941,6.680596,0.265841
1,penguin,2,"['ice', 'duck']","['duck', 'ice']","['figure', 'comic', 'head', 'dragon', 'ham', 'contract', 'new_york', 'ice', 'box', 'nurse', 'amazon', 'sound', 'torch', 'dwarf', 'hawk', 'vacuum', 'club', 'shoe', 'flute', 'knight', 'plate', 'hole', 'crane', 'lab', 'duck']","[True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True]",-3.780462,-3.87691,-0.527268,4.891941,6.680596,0.265841
2,carrot,1,['ham'],['ham'],"['figure', 'comic', 'head', 'dragon', 'ham', 'contract', 'new_york', 'ice', 'box', 'nurse', 'amazon', 'sound', 'torch', 'dwarf', 'hawk', 'vacuum', 'club', 'shoe', 'flute', 'knight', 'plate', 'hole', 'crane', 'lab', 'duck']","[True, True, True, True, True, True, False, True, True, True, True, True, True, False, True, False, False, True, True, True, False, False, True, True, True]",-3.780462,-3.87691,-0.527268,4.891941,6.680596,0.265841
3,air,2,"['wind', 'gas']","['gas', 'wind']","['log', 'leprechaun', 'giant', 'angel', 'drill', 'bermuda', 'heart', 'night', 'ruler', 'oil', 'block', 'wind', 'band', 'whale', 'game', 'pitch', 'gas', 'root', 'eagle', 'iron', 'whip', 'film', 'face', 'snowman', 'screen']","[True, True, False, False, True, True, True, False, True, True, False, True, True, True, False, True, True, True, False, False, True, True, True, False, False]",-2.37934,-1.309479,-9.490031,1.653692,4.137461,0.31336
4,shore,1,['whale'],['whale'],"['log', 'leprechaun', 'giant', 'angel', 'drill', 'bermuda', 'heart', 'night', 'ruler', 'oil', 'block', 'wind', 'band', 'whale', 'game', 'pitch', 'gas', 'root', 'eagle', 'iron', 'whip', 'film', 'face', 'snowman', 'screen']","[True, True, False, False, True, True, True, False, True, True, False, True, True, True, False, True, True, True, False, False, True, True, True, False, False]",-2.37934,-1.309479,-9.490031,1.653692,4.137461,0.31336


<IPython.core.display.Javascript object>

In [110]:
df_filename = f"data/5k_conceptnet_game_data/{dataset_size}_dataset.csv"
df_filename

'data/5k_conceptnet_game_data/10000_dataset.csv'

<IPython.core.display.Javascript object>

In [109]:
pd.read_csv("data/5k_conceptnet_game_data/tt.csv")

Unnamed: 0,clue,clue_number,intended_combo,words_guessed,cards,revealed_before_clue,assassin_weight,enemy_weight,neutral_weight,ally_weight,risk_weight,clue_score_threshold
0,deadly,2,"['death', 'disease']","['death', 'disease']","['moscow', 'screen', 'dance', 'strike', 'beach...","[False, True, False, True, True, True, True, T...",-7.365611,-0.06167,-5.87155,3.028007,9.951633,0.077975
1,pitch,4,"['strike', 'mass', 'stick', 'scale']","['strike', 'stick', 'scale', 'mass']","['moscow', 'screen', 'dance', 'strike', 'beach...","[False, True, False, True, True, True, True, T...",-7.365611,-0.06167,-5.87155,3.028007,9.951633,0.077975
2,boats,1,['ship'],['ship'],"['moscow', 'screen', 'dance', 'strike', 'beach...","[False, True, False, True, True, True, True, T...",-7.365611,-0.06167,-5.87155,3.028007,9.951633,0.077975


<IPython.core.display.Javascript object>

In [112]:
model.index2entity

array(['a', 'aan', 'abandoned', ..., 'zur', 'était', 'être'], dtype='<U15')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>