## Read in word list

In [1]:
import src.utils.git as gitutil
# Base path from which the data file location below is built
# (presumably the git repository root, going by the helper's name -- confirm in src.utils.git)
bpath = gitutil.get_root()

In [2]:
# Name of the word list file inside data/processed/
word_list_name = 'twl06.txt'

word_list_file = bpath + '/data/processed/' + word_list_name

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding; the file is read whole and split into one entry per line
with open(word_list_file, 'r', encoding='utf-8') as f:
    word_list = f.read().splitlines()

## Initialise

In [3]:
# Start with every word in the list as a candidate.
# NOTE: this binds the same list object as word_list; it is not a copy.
possible_words = word_list

## Define entropy calculation

In [4]:
import math

def calc_entropy(possible_words):
    """Return the entropy, in bits, of a uniform choice among the words.

    Every word in ``possible_words`` is treated as equally likely, so the
    entropy is ``-log2(1 / n)``, i.e. ``log2(n)``, where ``n`` is the number
    of words.

    Args:
        possible_words: Sized collection of candidate words.

    Returns:
        The entropy in bits as a non-negative float (0.0 for a single word).

    Raises:
        ValueError: If ``possible_words`` is empty.
    """
    num_possible_words = len(possible_words)
    if num_possible_words == 0:
        raise ValueError("cannot compute the entropy of an empty word list")

    # -log2(1/n) == log2(n); taking the log of n directly avoids the sign
    # error of log2(1/n) and the rounding of the intermediate division
    return math.log2(num_possible_words)

## Define guess checks
### Function to check for correct positions

In [5]:
def get_correct_positions(w1, w2):
    """Map each index at which two words hold the same letter to that letter.

    The words are compared index by index over the length of the shorter
    word, so any word length is supported rather than exactly five letters.

    Args:
        w1: First word (the guess, when called from ``get_guess_info``).
        w2: Second word to compare against.

    Returns:
        Dict of ``{index: letter}`` for every index where ``w1`` and ``w2``
        have the same letter; empty if they never agree.
    """
    return {
        index: first_letter
        for index, (first_letter, second_letter) in enumerate(zip(w1, w2))
        if first_letter == second_letter
    }


### Function to collect guess info (correct positions, incorrect positions and incorrect letters)

In [6]:
def get_guess_info(w1, w2):
    """Summarise what comparing the word ``w1`` against the word ``w2`` reveals.

    Returns a dict with three entries:
      * "correct_positions": ``{index: letter}`` as reported by
        ``get_correct_positions(w1, w2)``
      * "incorrect_positions": ``{letter: index}`` for letters that occur in
        both words but are not among the correctly placed letters; the index
        is that letter's first occurrence in ``w1``
      * "incorrect_letters": set of letters of ``w1`` that do not occur in ``w2``
    """
    first_letters = set(w1)
    second_letters = set(w2)

    matched = get_correct_positions(w1, w2)

    # Correctly placed letters always occur in both words, so removing them
    # from the shared letters leaves the ones that are present but misplaced.
    misplaced_letters = (first_letters & second_letters) - set(matched.values())
    misplaced = {letter: w1.find(letter) for letter in misplaced_letters}

    absent_letters = first_letters - second_letters

    return {"correct_positions": matched,
            "incorrect_positions": misplaced,
            "incorrect_letters": absent_letters}

## Functions to subset possible words based on guess info

In [7]:
def subset_for_correct_position(possible_words, correct_position):

    for pos, letter in correct_position.items():
        possible_words = [word for word in possible_words if word[pos] == letter]
    
    return(possible_words)

def subset_possible_for_incorrect_position(possible_words, incorrect_position):
    """Keep only the words that contain each given letter, but not at its given index.

    ``incorrect_position`` maps letter -> index. With an empty mapping the
    input is handed back untouched.
    """
    if not incorrect_position:
        return possible_words

    misplaced = list(incorrect_position.items())
    return [
        candidate
        for candidate in possible_words
        # the membership test runs first, exactly as in a two-step filter
        if all(letter in candidate and candidate[index] != letter
               for letter, index in misplaced)
    ]

def subset_possible_for_incorrect_letters(possible_words, incorrect_letters):
    """Drop every word that contains any of the given letters.

    With no letters to exclude the input is handed back untouched.
    """
    # materialise once so the letters can be re-scanned for each word
    banned = tuple(incorrect_letters)
    if not banned:
        return possible_words

    return [
        candidate
        for candidate in possible_words
        if not any(letter in candidate for letter in banned)
    ]

def subset_from_guess(possible_words, guess_info):
    """Narrow the candidate words using the information from one guess.

    ``guess_info`` must provide the keys "correct_positions",
    "incorrect_positions" and "incorrect_letters".

    Returns a dict with:
      * "possible_words": words that pass all three filters
      * "sensible_guesses": words that merely avoid the incorrect letters
    """
    matched, misplaced, absent = (
        guess_info[key]
        for key in ("correct_positions", "incorrect_positions", "incorrect_letters")
    )

    without_absent = subset_possible_for_incorrect_letters(possible_words, absent)
    # snapshot taken before the positional filters are applied
    sensible = without_absent.copy()

    narrowed = subset_possible_for_incorrect_position(without_absent, misplaced)
    narrowed = subset_for_correct_position(narrowed, matched)

    return {"possible_words": narrowed,
            "sensible_guesses": sensible}

### Determine number of possible words remaining with given guess and correct word

In [8]:
def num_poss_if_right(possible_words, guess, correct):
    """Count the words still possible after guessing ``guess`` when the answer is ``correct``."""
    feedback = get_guess_info(guess, correct)
    remaining = subset_from_guess(possible_words, feedback)["possible_words"]

    return len(remaining)

### Calculate expected number of possible words remaining for given guess and any possible correct word

In [9]:
from statistics import mean

def calculate_expected_remaining(possible_words, guess):
    """Average the number of words left after ``guess`` over every possible answer.

    Each word in ``possible_words`` is tried in turn as the correct word and
    counts equally towards the mean.
    """
    def remaining_if_answer(answer):
        return num_poss_if_right(possible_words, guess, answer)

    remaining_counts = list(map(remaining_if_answer, possible_words))

    return mean(remaining_counts)

### Example: expected number of possible words remaining after a first guess

In [10]:
# Mean number of candidates left after guessing 'irate', taken over every word as the answer
calculate_expected_remaining(possible_words, 'irate')

175.35930810070218

In [11]:
# Number of candidates left after guessing 'train' when the answer is 'blimp'
len(subset_from_guess(possible_words, get_guess_info('train', 'blimp'))["possible_words"])

297

In [12]:
# Number of words that only avoid the letters of 'train' that are absent from 'blimp'
len(subset_from_guess(possible_words, get_guess_info('train', 'blimp'))["sensible_guesses"])

1010