TODOs

- predict vector instead of position in vocab? then do distance between vecs as loss?

- MSE on number predicted rather than categorical?

In [1]:
import sys
sys.path.append("../") # hack to add module to path

In [2]:
import os
import json

import pandas as pd
from pandas.io.json import json_normalize

import gensim.downloader as api
from gensim.models import KeyedVectors

from tqdm.auto import tqdm

import numpy as np

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

from codenames.greedy_matrix_helpers import restrict_vocab_to_english, restrict_vocab_with_set

In [3]:
tqdm.pandas()

  from pandas import Panel


In [4]:
DATASET_CSV = "../data/5k_conceptnet_game_data/10000_dataset.csv"
VOCAB_TXT = "../data/5k_conceptnet_game_data/model_vocab.txt"

MODEL_NAME = "conceptnet-numberbatch-17-06-300"
RESTRICTED_MODEL_NAME = f"restricted_{MODEL_NAME}"
MODELS_DIR = "../models"

load wordvector model

In [5]:
with open(VOCAB_TXT) as f: 
    model_vocab = set(f.read().splitlines())

model_path = os.path.join(MODELS_DIR, MODEL_NAME)

if os.path.isfile(model_path):
    kv = KeyedVectors.load(model_path)
else:
    kv = api.load(MODEL_NAME)
    kv.save(model_path)

restrict_vocab_to_english(kv)
restrict_vocab_with_set(kv, model_vocab)

INFO:gensim.utils:loading Word2VecKeyedVectors object from ../models/conceptnet-numberbatch-17-06-300
DEBUG:smart_open.smart_open_lib:{'uri': '../models/conceptnet-numberbatch-17-06-300', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
INFO:gensim.utils:loading vectors from ../models/conceptnet-numberbatch-17-06-300.vectors.npy with mmap=None
INFO:gensim.utils:setting ignored attribute vectors_norm to None
INFO:gensim.utils:loaded ../models/conceptnet-numberbatch-17-06-300


load data

In [6]:
df = pd.read_csv(DATASET_CSV)
df.head(2)

Unnamed: 0,clue,clue_number,intended_combo,words_guessed,cards,card_types,revealed_before_clue,team,red_team_guesses,blue_team_guesses,assassin_weight,enemy_weight,neutral_weight,ally_weight,risk_weight,clue_score_threshold
0,fly,6,"['lap', 'washington', 'strike', 'shadow', 'mos...","['strike', 'moscow', 'lap', 'bermuda', 'washin...","['telescope', 'mount', 'lion', 'kiwi', 'lap', ...","['neutral', 'red', 'neutral', 'blue', 'blue', ...","[True, True, True, True, True, False, False, T...",blue,"['straw', 'carrot', 'america', 'buck', 'pan', ...","['telescope', 'strike', 'moscow', 'lap', 'berm...",-6.1032,-3.599553,-4.525893,10.12711,2.892535,0.071163
1,setting,2,"['mount', 'plot']","['mount', 'plot']","['telescope', 'mount', 'lion', 'kiwi', 'lap', ...","['neutral', 'red', 'neutral', 'blue', 'blue', ...","[True, True, True, True, True, False, False, T...",red,"['straw', 'carrot', 'america', 'buck', 'pan', ...","['telescope', 'strike', 'moscow', 'lap', 'berm...",-6.1032,-3.599553,-4.525893,10.12711,2.892535,0.071163


In [7]:
n_samples = len(df)

In [8]:
def parse_strlist(strlist):
    strlist = strlist.replace("'", "\"")
    return json.loads(strlist)

In [9]:
tile_types = ["blue", "red", "neutral", "assassin"]
tile_type_dict = {v:k for k,v in enumerate(tile_types)}
n_tile_types = len(tile_types)

In [10]:
df["cards_parsed"] = df["cards"].progress_map(parse_strlist)
df["card_types_parsed"] = df["card_types"].progress_map(parse_strlist).map(lambda card_types: [tile_type_dict[ct] for ct in card_types])

HBox(children=(FloatProgress(value=0.0, max=10001.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=10001.0), HTML(value='')))




In [11]:
M = 25
V = 300

In [12]:
card_types = np.vstack(df.card_types_parsed.values)

z = [[kv.get_vector(word) for word in cards] for cards in df.cards_parsed]
z = np.array(z)

z = z.reshape(-1, V)

X = np.zeros((z.shape[0], n_tile_types, V), dtype=np.float32)

flat_card_types = card_types.flatten()

for i in tqdm(range(X.shape[0])):
    X[i, flat_card_types[i]] = z[i]
    
del z

# back to desired shape
X = X.reshape(n_samples, M, n_tile_types, V)

HBox(children=(FloatProgress(value=0.0, max=250025.0), HTML(value='')))




deal with y

In [14]:
vocab2ind = {word:i for i, word in (enumerate(kv.vocab.keys()))}

In [15]:
y_word =  df.clue.map(vocab2ind).values

In [16]:
# TODO review lack of representation for each target
from collections import Counter
Counter(y_word).most_common(20)

[(2809, 52),
 (3623, 42),
 (3677, 37),
 (305, 37),
 (1828, 36),
 (4205, 35),
 (3522, 35),
 (496, 34),
 (275, 33),
 (1106, 33),
 (4034, 33),
 (629, 31),
 (166, 31),
 (2837, 30),
 (2168, 30),
 (1160, 29),
 (1566, 29),
 (3624, 29),
 (3055, 28),
 (4251, 28)]

In [17]:
y_word = to_categorical(y_word, num_classes=y_word.max()+1)

In [18]:
y_num = df.clue_number.values
y_num = to_categorical(y_num, num_classes=y_num.max()+1)

train test split

In [19]:
X.shape, y_word.shape, y_num.shape

((10001, 25, 4, 300), (10001, 4342), (10001, 8))

do some data augmentation 

shuffle each board n times to create a dataset n times as big.

WARNING - TAKES A LOT OF RAM!

In [20]:
aug_multiplier = 10
X_aug = np.zeros((aug_multiplier*X.shape[0],) + X.shape[1:], dtype=np.float32)
counter = 0
for i in tqdm(range(X.shape[0])):
    for j in range(10):
        X_aug[counter] = np.random.permutation(X[i])
        counter += 1

y_word_aug = np.tile(y_word, 10).reshape((X_aug.shape[0], y_word.shape[1]))
y_num_aug = np.tile(y_num, 10).reshape((X_aug.shape[0], y_num.shape[1]))

HBox(children=(FloatProgress(value=0.0, max=10001.0), HTML(value='')))




In [21]:
X_aug.shape, y_word_aug.shape, y_num_aug.shape

((100010, 25, 4, 300), (100010, 4342), (100010, 8))

In [22]:
X_flat = X_aug.reshape(X_aug.shape[0], -1)

In [23]:
input_shape = X_flat.shape[1:]
word_output_size = y_word.shape[1]
num_output_size = y_num.shape[1]

In [24]:
input_shape, word_output_size, num_output_size

((30000,), 4342, 8)

again pretty expensive on RAM as we've got a rather large X_aug

In [25]:
X_train, X_test, y_word_train, y_word_test, y_num_train, y_num_test = train_test_split(
    X_flat, y_word_aug, y_num_aug, test_size=0.1
)

In [28]:
del X_flat, X_aug, y_word_aug, y_num_aug

In [30]:
inp = layers.Input(shape=input_shape, name="input")
dense_1 = layers.Dense(1000, activation="relu", name="layer1")(inp)
dense_word = layers.Dense(word_output_size, name="dense_word", activation=None)(dense_1)

concat = layers.concatenate((dense_1, dense_word), name="concat")

out_word = layers.Softmax(name="out_word")(dense_word)

out_num = layers.Dense(num_output_size, name="out_num", activation="softmax")(concat)

model = keras.Model(inputs=(inp,), outputs=(out_word,out_num))

model.summary()

model.compile(loss="categorical_crossentropy", optimizer=keras.optimizers.Adam(lr=0.002), metrics=["accuracy"])

model.fit(X_train, (y_word_train, y_num_train), batch_size=128, epochs=10, validation_split=0.1)

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input (InputLayer)              [(None, 30000)]      0                                            
__________________________________________________________________________________________________
layer1 (Dense)                  (None, 1000)         30001000    input[0][0]                      
__________________________________________________________________________________________________
dense_word (Dense)              (None, 4342)         4346342     layer1[0][0]                     
__________________________________________________________________________________________________
concat (Concatenate)            (None, 5342)         0           layer1[0][0]                     
                                                                 dense_word[0][0]      

KeyboardInterrupt: 

In [32]:
model.evaluate(X_test, (y_word_test, y_num_test))



[10.336933135986328,
 4.592963695526123,
 5.743966102600098,
 0.11265414953231812,
 0.6081546545028687]

num acc. suspiciously high, check distribution

In [36]:
Counter(np.argmax(y_num_test, axis=1))

Counter({1: 1502, 2: 6619, 3: 1302, 4: 394, 5: 142, 6: 41, 7: 1})