In [14]:
from copy import deepcopy
from itertools import product
import logging
import random
import sys

from loguru import logger
import pandas as pd
from pandas import DataFrame
import numpy as np
from tqdm import tqdm


from word_grid import WordGrid, Direction, ValidationMode
logger.remove()
logger.add(sys.stdout, level="ERROR")

MIN_WORD_LEN = 3

In [15]:
word_index = pd.read_csv("data/word_index.csv", encoding='utf-8')
dictionary = word_index[word_index["lang_code"] == "en"]
dictionary["word"] = dictionary["word"].astype(str)
dictionary = dictionary[dictionary["len"] >= MIN_WORD_LEN]
dictionary = dictionary[dictionary["freq"].fillna(0) > 0]
dictionary = dictionary[~dictionary["word"].str.contains(r"[0-9-]")]

print(len(dictionary))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dictionary["word"] = dictionary["word"].astype(str)


205742


In [16]:
def get_logger():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.ERROR)
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.ERROR)
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)
    return logger

logger = get_logger()

In [17]:
len_dictionary = {key: df for key, df in dictionary.groupby("len")}


In [18]:
puzzle = WordGrid((5,10))
puzzle

-  -  -  -  -  -  -  -  -  -
-  -  -  -  -  -  -  -  -  -
-  -  -  -  -  -  -  -  -  -
-  -  -  -  -  -  -  -  -  -
-  -  -  -  -  -  -  -  -  -

In [19]:
puzzle.reset()

seed = 1 #random.randint(0, 1000)
target_n = 15
random.seed(seed)
np.random.seed
direction = Direction.DOWN
word_list = []
snapshots = []
positions = {
    Direction.DOWN: {pos: [] for pos in product(range(puzzle.shape[1]), range(puzzle.shape[0] - MIN_WORD_LEN + 1))},
    Direction.ACROSS: {pos: [] for pos in product(range(puzzle.shape[1] - MIN_WORD_LEN + 1), range(puzzle.shape[0]))}
}
pbar = tqdm(total=target_n)
while len(word_list) < target_n:
    if len(positions[direction]) == 0:
        if len(positions[Direction.flip(direction)]) == 0:
            break
        direction = Direction.flip(direction)
    
    position = random.choice(list(positions[direction]))
    
    blacklist = positions[direction][position] + word_list
    candidates = dictionary[dictionary["word"].apply(lambda w: puzzle.validate_word(position, direction, w, ValidationMode.HARD))]
    candidates = candidates[~candidates["word"].isin(blacklist)]

    if len(candidates) == 0:
        positions[direction].pop(position, None)
        continue
    
    try:
        weights = np.log(np.log(dictionary.freq) + 1) + dictionary.len
        word = candidates.word.sample(1, weights=weights, random_state=seed).item()
    except:
        word = candidates.word.sample(1, random_state=12).item()
    
    pbar.update(1)
    pbar.set_description(f"word: {word}, pos: {position}, dir: {direction.name.lower()}, cnd: {len(candidates)}, slots {len(positions[Direction.DOWN])}d {len(positions[Direction.ACROSS])}a", refresh=True)
    
    if puzzle.add_word(position, direction, word):
        snapshots.append(({"position": position, "direction": direction, "word": word}, deepcopy(puzzle)))
        if len(positions[Direction.flip(direction)]) > 0:
            direction = Direction.flip(direction)

        positions[direction].pop(position, None)    
        word_list.append(word)
    else:
        positions[direction][position].append(word)
        logger.info(f"Can't place word {word} at {position}")

print(word_list)
print(puzzle)

word: know, pos: (5, 2), dir: across, cnd: 28, slots 0d 16a:  67%|██████▋   | 10/15 [1:43:22<51:41, 620.21s/it]
word: wai, pos: (7, 0), dir: down, cnd: 2, slots 7d 36a:  60%|██████    | 9/15 [00:27<00:26,  4.47s/it]            

['stat', 'IMS', 'esp', 'stumbling', 'ell', 'AWD', 'loom', 'those', 'wai']
-  -  -  -  -  -  a  w  d  -
-  s  -  l  -  -  -  a  -  e
-  t  h  o  s  e  -  i  m  s
-  a  -  o  -  l  -  -  -  p
s  t  u  m  b  l  i  n  g  -


In [20]:
for params, grid in snapshots:
    position, direction, word = list(params.values())
    print(f"word: {word}, pos: {position}, dir: {direction.name.lower()}")
    grid.color_print()

word: stat, pos: (1, 1), dir: down
-  -  -  -  -  -  -  -  -  -
[30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0m  [33ms[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0m  [33mt[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0m  [33ma[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0m  [33mt[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
-  -  -  -  -  -  -  -  -  -
word: IMS, pos: (7, 2), dir: across
-  -  -  -  -  -  -  -  -  -
[30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0m  [33ms[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m  [30m-[0m
[30m-[0

In [22]:
all_categories = {}
for i, row in tqdm(word_index.fillna('').iterrows()):
    if not row.cats:
        continue
    cat_list = [cat for cat in row.cats[1:-1].split(", ")]
    for cat in cat_list:
        if cat not in all_categories:
            all_categories[cat] = 1
        else:
            all_categories[cat] += 1
print(all_categories)



2814423it [02:07, 22111.59it/s]

{"'English entries with incorrect language header'": 238167, "'English lemmas'": 218023, "'English nouns'": 149601, "'English terms derived from Latin'": 19022, "'English uncountable nouns'": 73015, "'English 4-syllable words'": 7458, "'English countable nouns'": 136145, "'English terms suffixed with -or (agent noun)'": 6, "'English terms with IPA pronunciation'": 76985, "'English terms with audio links'": 61863, "'English 6-syllable words'": 565, "'Rhymes:English/eɪʃən'": 712, "'Rhymes:English/eɪʃən/6 syllables'": 36, "'English 3-syllable words'": 17481, "'English non-lemma forms'": 51953, "'English terms suffixed with -ing'": 1850, "'English verb forms'": 45347, "'Rhymes:English/ɛəɹɪŋ'": 43, "'Rhymes:English/ɛəɹɪŋ/3 syllables'": 2, "'English 2-syllable words'": 32243, "'English terms derived from Proto-Indo-European'": 13295, "'English terms derived from the Proto-Indo-European root *h₁ers-'": 10, "'English verbs'": 46109, "'Rhymes:English/ɜː(ɹ)'": 132, "'English 5-syllable words'": 




In [24]:
cats = pd.DataFrame([(key, value) for key, value in all_categories.items()], columns=["category", "count"])
cats.to_csv("data/categories.csv", index=False)
