# wordle

Make wordle, the fun viral word game sensation, no longer fun.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pickle
from tabulate import tabulate
from tqdm.notebook import tqdm
from copy import deepcopy

In [2]:
# The first major challenge is assembling a good dictionary, so we use a composite
# of two sources:
#   1. Norvig's ngram frequency counts (count_1w.txt), based on Google's
#      1 trillion word corpus, which tell us how common each word is.
#   2. An English word list (words_alpha.txt), used to reject any non-English words.

words_google = []
freq_google = []
with open("count_1w.txt") as wordfile:
    for line in wordfile:
        # each record is "<word>\t<count>"
        fields = line.strip().split("\t")
        if len(fields[0]) == 5:  # wordle only uses five-letter words
            words_google.append(fields[0])
            freq_google.append(int(fields[1]))  # parse the count once, while reading
words_google = np.array(words_google)
freq_google = np.array(freq_google, dtype=int)

# words_alpha taken from: https://github.com/dwyl/english-words
with open('words_alpha.txt') as word_file:
    # de-duplicate, keep the five-letter entries, and sort so the list order is
    # the same on every run
    words_alpha = sorted(w for w in set(word_file.read().split()) if len(w) == 5)

print("Length of google words dataset:",len(words_google))
print("Length of dictionary dataset",len(words_alpha))

Length of google words dataset: 39933
Length of dictionary dataset 15918


In [21]:
# Keep only the Google-corpus entries that also appear in the English dictionary.
# `filt` is a boolean mask aligned with words_google / freq_google.
filt = np.isin(words_google, words_alpha)
words_filt, freq_filt = words_google[filt], freq_google[filt]
print("Length of filtered dataset", len(words_filt))

Length of filtered dataset 9383


In [23]:
#sgb-words taken from norvig
with open('sgb-words.txt') as word_file:
    # De-duplicate, then sort: iterating a set of strings yields a different order
    # on each interpreter start (string hashes are randomised), which would make
    # the order of `words` - and any tie-breaking that depends on it - vary
    # between kernel restarts.
    words = sorted(set(word_file.read().split()))
words = np.array(words)

In [24]:
# Look up the Google-corpus count of every word in `words`; a word that does not
# appear in the corpus gets a count of 0. These counts are what the word set is
# trimmed on afterwards.

# Build the word -> count table once instead of scanning the whole corpus array
# for every word. Iterating in reverse makes the FIRST occurrence of a word win,
# which matches taking the first match of a linear search.
freq_lookup = dict(zip(words_google[::-1], freq_google[::-1]))
w_freq = np.array([freq_lookup.get(w, 0) for w in words])

In [25]:
print("Length before filtering:", len(words))
# A word survives only if the Google corpus has a non-zero count for it AND the
# English dictionary lists it; both conditions are folded into a single mask.
keep = (w_freq != 0) & np.isin(words, words_alpha)
words = words[keep]
print("Length after filtering:", len(words))

Length before filtering: 5757
Length after filtering: 5086


In [12]:
# a nice fallback/starter dictionary
# taken from https://github.com/first20hours/google-10000-english
# with open("google-10000-english-usa-no-swears-medium.txt") as word_file:
#     words = set(word_file.read().split())
# print(len(words))
# words_5 = [w for w in words if len(w)==5]
# print(len(words_5))
# words = words_5

In [26]:
# There are likely far faster numeric representations of this information, but
# plain lists of letters are intuitive to work with.
alphabet = [chr(code) for code in range(ord('a'), ord('z') + 1)]

# One independent candidate list per letter position. Each position gets its own
# copy: [alphabet]*5 would instead repeat a reference to a single shared list.
possible_letters = [list(alphabet) for _ in range(5)]

# Letters known to be somewhere in the word; nothing is known yet.
required_letters = []

In [27]:
def get_possible_letters(possible_letters, guess, true_word):
    """Narrow the per-position candidate letters by playing `guess` against `true_word`.

    Parameters:
        possible_letters: one list of still-allowed letters per position. It is
            deep-copied, so the caller's lists are left untouched.
        guess: the guessed word.
        true_word: the answer the guess is scored against.

    Returns:
        (candidates, misplaced): the narrowed copy of `possible_letters`, and the
        guessed letters that occur in `true_word` but not at the guessed position
        (one entry per such guess position, so repeats are possible).

    TODO (from the original author): replace this function with a
    "get hint from true word" + "get possible letters from hint" combo.
    """
    candidates = deepcopy(possible_letters)
    misplaced = []

    for pos, letter in enumerate(guess):
        if letter in true_word:
            if true_word[pos] == letter:
                # right letter, right place: this position is now fixed
                candidates[pos] = [letter]
                continue
            # right letter, wrong place: rule it out here and remember that the
            # word must contain it
            if letter in candidates[pos]:
                candidates[pos].remove(letter)
            misplaced.append(letter)
            continue

        # the letter does not occur in the word at all: rule it out everywhere
        for slot in candidates:
            if letter in slot:
                slot.remove(letter)

    return candidates, misplaced

%timeit get_possible_letters(possible_letters, "chief", "proxy")

50.4 µs ± 1.44 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [28]:
def get_words_subset(words, possible_letters, required_letters):
    """Return the words that are still consistent with the current knowledge.

    Parameters:
        words: iterable of candidate words.
        possible_letters: one collection of allowed letters per position.
        required_letters: letters that must occur somewhere in the word.

    Returns:
        A list, in input order, of the words whose every letter is allowed at its
        position and which contain all of `required_letters`.
    """
    def is_consistent(candidate):
        # every letter must still be allowed at the position it occupies ...
        positions_ok = all(
            letter in allowed for letter, allowed in zip(candidate, possible_letters)
        )
        # ... and every required letter must appear somewhere in the word
        return positions_ok and all(req in candidate for req in required_letters)

    return [candidate for candidate in words if is_consistent(candidate)]

%timeit get_words_subset(words, possible_letters, required_letters)

6.48 ms ± 242 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [29]:
# Demo: play the guess "rates" against the answer "proxy", then narrow the word
# list down to the entries that are still consistent with the result.
possible_letters, required_letters = get_possible_letters(
    possible_letters, guess="rates", true_word="proxy"
)
words_subset = get_words_subset(
    words, possible_letters=possible_letters, required_letters=required_letters
)

In [30]:
# number of candidate words left in words_subset after the demo guess
print(len(words_subset))

142


In [31]:
def get_guess_scores(words_subset, possible_letters, required_letters):
    """Score every candidate word by how much it is expected to shrink the pool.

    For each candidate guess, every word in `words_subset` is assumed in turn to
    be the true answer; the guess is played against it and the number of words
    that would remain is recorded. A guess's score is the mean of those counts,
    so lower is better.

    Parameters:
        words_subset: the words still in play; used both as the guesses to score
            and as the hypothetical answers.
        possible_letters: current per-position candidate letters.
        required_letters: accepted for call-site symmetry; not read by this
            implementation.

    Returns:
        np.ndarray built from [word, score] rows. Because the rows mix a string
        and a number, callers convert column 1 with .astype(float) before sorting.

    Cost grows quickly: every (guess, answer) pair triggers a full filtering pass
    over `words_subset`.
    """
    scored = []
    for candidate in words_subset:
        remaining_counts = [
            len(
                get_words_subset(
                    words_subset,
                    *get_possible_letters(possible_letters, candidate, answer),
                )
            )
            for answer in words_subset
        ]
        scored.append([candidate, np.mean(remaining_counts)])
    return np.array(scored)

%timeit get_guess_scores(words_subset, possible_letters, required_letters)

2.5 s ± 60.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
#calculate the best initial guess. This takes a while but only needs to be done once.
#takes about 40 minutes

# possible_letters = [list(alphabet), #cant just do [alphabet]*5 as it creates a reference to a single list 
#                     list(alphabet), 
#                     list(alphabet), 
#                     list(alphabet), 
#                     list(alphabet)] 
# required_letters = []
# guess_score=get_guess_scores(words, possible_letters, required_letters)
# guess_score = guess_score[guess_score[:,1].astype(float).argsort()] # sort them so we can take a look
# print(guess_score[:10])
# pickle.dump( guess_score, open( "initial_guess_scores.pkl", "wb" ) )  # same filename the solver cell loads

In [27]:
# in order to integrate with the web app we need to write code that 
# returns the possible letters and required letters without knowing the true word
# The information that is returned is the location of true letters, the location of possible letters
# and letters that are incorrect.

def get_possible_letters_from_hint(possible_letters, guess, hint):
    """Narrow the per-position candidate letters using the hint the game returned.

    The hint has one character per guessed letter:
      n: this letter is not in the word
      y: this letter is in the word, and is in the right place
      m: this letter is in the word, but is not in the right place
    Any character other than 'n' or 'y' is treated as 'm'.

    Parameters:
        possible_letters: one list of still-allowed letters per position. It is
            deep-copied, so the caller's lists are left untouched.
        guess: the word that was played.
        hint: the n/y/m string received for that guess.

    Returns:
        (pl, required_letters): the narrowed copy of `possible_letters`, and the
        letters marked 'm' (one entry per such position).

    Repeated letters: when a guess contains more copies of a letter than the
    answer does, the game marks the surplus copies 'n' even though the letter IS
    in the word. Such an 'n' therefore only rules the letter out at that one
    position; striking it everywhere would erase a slot fixed by a 'y' earlier in
    the same guess, or contradict an 'm', and leave no valid words.
    """
    pl = deepcopy(possible_letters)
    required_letters = []

    # letters this very hint confirms are in the word (marked 'y' or 'm' somewhere)
    confirmed = {l for l, mark in zip(guess, hint) if mark != 'n'}

    for i_g, l in enumerate(guess):
        if hint[i_g] == 'n':
            if l in confirmed:
                # surplus copy of a letter that is in the word: only this
                # position is ruled out
                if l in pl[i_g]:
                    pl[i_g].remove(l)
            else:
                # genuinely absent: remove from each index
                for slot in pl:
                    if l in slot:
                        slot.remove(l)
        elif hint[i_g] == 'y':
            # the letter is in the word and in the correct location
            pl[i_g] = [l]
        else:
            # in the word but not in this location: remove it here and add it to
            # the required letters list
            if l in pl[i_g]:
                pl[i_g].remove(l)
            required_letters.append(l)
    return pl, required_letters

%timeit get_possible_letters_from_hint(possible_letters, "rates", "nnnny")

10000 loops, best of 5: 52.9 µs per loop


In [29]:
# Interactive solver session: start from a blank slate, then repeatedly read the
# guess that was played and the hint the game returned, narrow the word pool, and
# print the best next guesses until the word is found.

# one independent list per position ([alphabet]*5 would share a single list)
possible_letters = [list(alphabet) for _ in range(5)]
required_letters = []
hint='nnnnn'
words_subset = words
guess_data = []
attempts = 1

# NOTE: unpickling can execute arbitrary code - only load a score file that you
# generated yourself.
with open("initial_guess_scores.pkl", "rb") as score_file:
    guess_scores = pickle.load(score_file) # precomputed scores for the opening guess
guess_scores = guess_scores[guess_scores[:,1].astype(float).argsort()]

# the rows are sorted best-first, so the top row is the recommended opener
print("Best initial guess is:", guess_scores[0, 0])

while not hint == 'yyyyy':
    #loop, providing new optimal guesses until we have sucessfully found the word
    guess = input("Enter your guess: ").strip().lower()
    while len(guess) != len(possible_letters):
        print("The guess must be", len(possible_letters), "letters long.")
        guess = input("Enter your guess: ").strip().lower()

    # The player may try a word that is not among the scored suggestions; in that
    # case there is no expected pool size to report.
    scored = guess_scores[guess_scores[:, 0]==guess]
    my_guess_score = f"{float(scored[0, 1]):.2f}" if len(scored) else "n/a"
    guess_data.append([attempts, guess, my_guess_score, len(words_subset)])
    print(guess_data[-1])

    # A mistyped hint would silently corrupt the candidate lists, so re-prompt
    # until it has one n/y/m character per guessed letter.
    hint = input("input the hint received: ").strip().lower()
    while len(hint) != len(guess) or not set(hint) <= set("nym"):
        print("The hint needs one of n/y/m for each letter of the guess.")
        hint = input("input the hint received: ").strip().lower()

    possible_letters, r = get_possible_letters_from_hint(possible_letters, guess, hint)
    required_letters.extend(r)
    words_subset = get_words_subset(words_subset, possible_letters, required_letters)
    print("New word pool length:",len(words_subset))
    guess_scores=get_guess_scores(words_subset, possible_letters, required_letters)

    if(len(guess_scores)==0):
        print("Out of guesses! try a larger dataset")
        break

    guess_scores = guess_scores[guess_scores[:,1].astype(float).argsort()] # sort them so we can take a look

    if(len(guess_scores)==1):
        print("The answer is:", guess_scores[0, 0])
        print(guess_scores)

        break

    print(guess_scores[0:10, :])

    attempts += 1


print(tabulate(guess_data, headers=['attempts', 'guess', 'expected pool size', 'total guess pool']))

Best initial guess is: rates
Enter your guess: rates
[1, 'rates', '29.22', 5086]
input the hint received: nnnmn
New word pool length: 211
[['ligne' '8.222748815165877']
 ['clone' '8.289099526066352']
 ['oldie' '8.582938388625593']
 ['noble' '9.132701421800947']
 ['dolce' '9.417061611374407']
 ['voile' '9.710900473933648']
 ['lodge' '9.966824644549764']
 ['binge' '10.080568720379146']
 ['opine' '10.184834123222748']
 ['glide' '10.364928909952607']]
Enter your guess: ligne
[2, 'ligne', '8.22', 211]
input the hint received: nynmy
New word pool length: 5
[['wince' '1.0']
 ['mince' '1.0']
 ['niche' '1.4']
 ['niece' '1.4']
 ['nixie' '1.8']]
Enter your guess: wince
[3, 'wince', '1.00', 5]
input the hint received: yyyyy
New word pool length: 1
The answer is: wince
[['wince' '1.0']]
  attempts  guess      expected pool size    total guess pool
----------  -------  --------------------  ------------------
         1  rates                   29.22                5086
         2  ligne            

In [134]:
# guess_scores=pickle.load(open("initial_guess_scores.pkl", "rb" )) # precomputed best guess 
# guess_scores = guess_scores[guess_scores[:,1].astype(float).argsort()]

In [248]:
# It works pretty well, and is definitely superhuman.
# It falls a little flat when attempting to determine the last few guesses, as there isn't much
# differentiation between the remaining candidates and it is mostly down to luck.

# bug when choosing non suggested guess
# increase word library to 2500
# profiling
# 