In [None]:
import nltk
from collections import Counter
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np

In [2]:
# Show the kernel's current working directory (bare `pwd` relies on IPython automagic).
# NOTE(review): the saved output exposes an absolute local path; consider clearing it before sharing.
pwd

'/Users/matthewmitchell/Documents/Projects/WordleBot'

In [None]:
# nltk.download ('words')

In [4]:
# Load the word list from NLTK's `words` corpus (the corpus must already be
# downloaded -- see the commented-out nltk.download call in the previous cell).
word_list = nltk.corpus.words.words ()

In [5]:
# Keep only the entries that are exactly five characters long.
five_letter_words = [entry for entry in word_list if len(entry) == 5]

In [6]:
def letterfy(word_list):
    """Flatten words into a list of lower-cased letters, one per distinct character per word.

    Each word is reduced to its set of distinct characters *before* lower-casing,
    so a word contributes a given character at most once (per case variant).

    Parameters
    ----------
    word_list : iterable of str
        Words to break apart.

    Returns
    -------
    list of str
        Lower-cased characters, grouped by word in input order.
    """
    letters = []
    for word in word_list:
        distinct_chars = set(word)
        letters.extend(char.lower() for char in distinct_chars)
    return letters

## All Words 

In [7]:
# One row per five-letter word, plus one column per letter position (Letter1..Letter5).
words_df = pd.DataFrame(five_letter_words, columns=['word'])
for position in range(1, 6):
    # Bind the offset as a default argument so each lambda keeps its own position.
    words_df['Letter{}'.format(position)] = words_df['word'].map(
        lambda w, offset=position - 1: w[offset]
    )
print(len(words_df))

# Drop words whose first character is not already lower case.
starts_lowercase = words_df['Letter1'] == words_df['Letter1'].str.lower()
words_df = words_df[starts_lowercase]
print(len(words_df))
words_df

10422
8689


Unnamed: 0,word,Letter1,Letter2,Letter3,Letter4,Letter5
0,aalii,a,a,l,i,i
2,abaca,a,b,a,c,a
3,aback,a,b,a,c,k
4,abaff,a,b,a,f,f
5,abaft,a,b,a,f,t
...,...,...,...,...,...,...
10417,white,w,h,i,t,e
10418,woman,w,o,m,a,n
10419,wound,w,o,u,n,d
10420,wrong,w,r,o,n,g


In [8]:
def generate_word_query(L, isIn=True):
    """Build a query fragment testing letter ``L`` against columns Letter1..Letter5.

    Parameters
    ----------
    L : str
        The letter to test for.
    isIn : bool, default True
        When truthy, the fragment requires ``L`` in at least one position
        (``==`` comparisons joined with ``or``). Otherwise it requires ``L``
        in no position (``!=`` comparisons joined with ``and``).

    Returns
    -------
    str
        A fragment of the form ``"and ( ... )"``; the leading ``and`` lets
        callers concatenate fragments and trim the first one.
    """
    symbol, conjunction = ("=", "or") if isIn else ("!", "and")
    head = "and (Letter1 {}= '{}' ".format(symbol, L)
    tail = "".join(
        "{} Letter{} {}='{}' ".format(conjunction, position, symbol, L)
        for position in range(2, 6)
    )
    return head + tail + ")"

In [9]:
def clueUpdate(guessword, response):
    """Narrow the notebook-global ``temp`` frame using one guess and its feedback.

    Parameters
    ----------
    guessword : str
        The word that was guessed.
    response : str
        One character per letter of ``guessword``:
        ``"G"`` - letter is in this position,
        ``"Y"`` - letter is in the word but not in this position,
        ``"R"`` - letter is not in the word (or is a surplus copy, see below).

    Returns
    -------
    pandas.DataFrame
        The filtered frame, which is also stored back into the global ``temp``.

    Raises
    ------
    AssertionError
        If ``response`` contains a character other than R, G or Y.
    """
    # `temp` is rebound below; without this declaration Python treats it as a
    # local and the function fails with UnboundLocalError.
    global temp

    # Letters confirmed present somewhere in the answer by a G or Y mark.
    confirmed = {g for g, mark in zip(guessword, response) if mark in ("G", "Y")}

    query = ""
    for idx, letter in enumerate(guessword):
        letterJudgement = response[idx]
        position = idx + 1
        if letterJudgement == "G":
            # `==` (not `=`) so the clause is a comparison, matching
            # generate_clueDeduction_query.
            query += " and (Letter{} == '{}')".format(position, letter)
        elif letterJudgement == "Y":
            query += " and (Letter{} != '{}')".format(position, letter)
            query += " " + generate_word_query(letter)
        elif letterJudgement == "R":
            if letter in confirmed:
                # A repeated guess letter can be marked R while another copy is
                # G/Y. Excluding the letter everywhere would contradict that
                # mark, so only rule it out at this position.
                # (Letter counts are not enforced here.)
                query += " and (Letter{} != '{}')".format(position, letter)
            else:
                query += " " + generate_word_query(letter, isIn=False)
        else:
            raise AssertionError("Please ensure response is a string of RGY: GYGRY")

    # Every clause was appended with a leading " and "; drop the first one.
    query = query.strip()
    if query.startswith("and "):
        query = query[len("and "):]
    if query:
        temp = temp.query(query)
    return temp

In [10]:
def generate_clueDeduction_query(guessword, possibleWord):
    """Build the query that keeps words consistent with guessing ``guessword``
    when the answer is ``possibleWord``.

    For each letter of the guess:
    * same letter at the same index of ``possibleWord`` -> pin that position;
    * letter occurs elsewhere in ``possibleWord`` -> forbid this position and
      require the letter somewhere;
    * letter absent from ``possibleWord`` -> forbid the letter everywhere.

    Returns
    -------
    str
        The concatenated clauses with the leading ``and`` trimmed.
    """
    fragments = []
    for position, letter in enumerate(guessword, start=1):
        if possibleWord[position - 1] == letter:
            fragments.append("and (Letter{} == '{}')".format(position, letter))
        elif letter in possibleWord:
            fragments.append("and (Letter{} != '{}')".format(position, letter))
            fragments.append(generate_word_query(letter))
        else:
            fragments.append(generate_word_query(letter, isIn=False))
    # str.strip removes any run of the characters 'a', 'n', 'd', ' ' at the ends;
    # the clauses start with "and (" and end with ")", so only the prefix goes.
    return "".join(fragments).strip("and ")

In [None]:
# Sanity check: show the query produced when the guess equals the answer,
# i.e. every position takes the exact-match branch.
generate_clueDeduction_query('pulse', 'pulse')

In [12]:
# Apply that all-positions-match query to the word frame; the saved output shows the single row 'pulse'.
words_df.query(generate_clueDeduction_query('pulse', 'pulse'))

Unnamed: 0,word,Letter1,Letter2,Letter3,Letter4,Letter5
6779,pulse,p,u,l,s,e


In [13]:
def simulate(temp, guessWord):
    """Count, for every candidate answer, how many words would remain after guessing ``guessWord``.

    Parameters
    ----------
    temp : pandas.DataFrame
        Candidate frame with a ``word`` column and the Letter1..Letter5 columns
        used by the generated queries.
    guessWord : str
        The guess being evaluated.

    Returns
    -------
    list of int
        One entry per row of ``temp`` (in ``temp.word`` order): the number of
        rows of ``temp`` matching the query built by treating that row's word
        as the answer.
    """
    return [
        len(temp.query(generate_clueDeduction_query(guessWord, candidate)))
        for candidate in temp.word
    ]

In [14]:
def return_best_words(guessList, toPrint=True, candidates=None):
    """Rank guesses by the mean number of candidates left after each guess.

    Parameters
    ----------
    guessList : iterable of str
        Guesses to evaluate.
    toPrint : bool, default True
        Print each word's score as it is computed, then the best word.
    candidates : pandas.DataFrame, optional
        Frame of remaining possible answers to simulate against. Defaults to
        the notebook-global ``temp`` so existing calls behave as before.

    Returns
    -------
    list of (str, float)
        ``(word, score)`` pairs sorted by ascending score (lower is better).
        Empty when ``guessList`` is empty.
    """
    guessList = list(guessList)
    if not guessList:
        # np.argmin raises on an empty sequence; nothing to rank.
        return []
    if candidates is None:
        candidates = temp  # notebook-global working frame

    scores = []
    words_and_scores = []
    for word in guessList:
        score = np.mean(simulate(candidates, word))
        scores.append(score)
        words_and_scores.append((word, score))
        if toPrint:
            print(word, score)

    if toPrint:
        min_idx = np.argmin(scores)
        print ("In\n")
        print ("Best score: {} From word: {}".format(scores[min_idx], guessList[min_idx]))
    return sorted(words_and_scores, key=lambda pair: pair[1])

## NY Times 2309 Word Update

In [15]:
# Read the solution list: comma-separated, double-quoted words, possibly across several lines.
solution_words = []
with open('solutions_nyt.txt') as f:
    for line in f:
        solution_words.extend(token.strip().strip('"') for token in line.split(","))
print(len(solution_words), solution_words[:5], solution_words[-5:])

2309 ['cigar', 'rebut', 'sissy', 'humph', 'awake'] ['judge', 'rower', 'artsy', 'rural', 'shave']


In [16]:
# Rebuild words_df from the solution list: one row per word, one column per letter position.
words_df = pd.DataFrame(solution_words, columns=['word'])
for position in range(1, 6):
    # Bind the offset as a default argument so each lambda keeps its own position.
    words_df['Letter{}'.format(position)] = words_df['word'].map(
        lambda w, offset=position - 1: w[offset]
    )

print(len(words_df))
words_df

2309


Unnamed: 0,word,Letter1,Letter2,Letter3,Letter4,Letter5
0,cigar,c,i,g,a,r
1,rebut,r,e,b,u,t
2,sissy,s,i,s,s,y
3,humph,h,u,m,p,h
4,awake,a,w,a,k,e
...,...,...,...,...,...,...
2304,judge,j,u,d,g,e
2305,rower,r,o,w,e,r
2306,artsy,a,r,t,s,y
2307,rural,r,u,r,a,l


In [None]:
# Working copy of the candidate frame. `temp` is the global that
# return_best_words passes to simulate, so it must exist before scoring.
temp = words_df.copy()

## Simulate Best Starter Words

In [None]:
# Score 10 randomly chosen candidate openers against the full list.
# Seeded so a fresh Restart & Run All reproduces the same sample.
# return_best_words already returns the pairs sorted by score, so no re-sort is needed.
words_and_scores = return_best_words(temp.word.sample(10, random_state=0))
print(len(words_and_scores), words_and_scores[:5], words_and_scores[-5:])

In [None]:
# Count, per letter, how many words contain it, then rank letters by that count.
letter_counts = Counter(letterfy(words_df.word))
letter_frequency = (
    pd.DataFrame.from_dict(letter_counts, orient='index')
    .rename(columns={0: 'count'})
    .sort_values(by='count', ascending=False)
)
letter_frequency.head()

In [None]:
# Horizontal bar chart of the per-letter counts.
letter_frequency.plot.barh(figsize=(10, 10))

In [None]:
# The four most frequent letters (letter_frequency is sorted by descending count).
letter_frequency.index[:4]

In [None]:
# Successively keep only words containing each of the three most frequent letters,
# printing how many words survive each step.
sample_words = words_df.copy()
for top_letter in letter_frequency.index[:3]:
    has_letter = sample_words['word'].str.contains(top_letter)
    sample_words = sample_words.loc[has_letter]
    print(len(sample_words))

In [None]:
# Score against a 30-word random subset of the solutions to keep the run short;
# seeded so the ranking is reproducible.
temp = words_df.copy().sample(30, random_state=0)

# Pass the `word` column, not the frame: list(DataFrame) yields the column
# names ('word', 'Letter1', ...), which would be scored as if they were guesses.
# return_best_words already returns the pairs sorted by score.
words_and_scores = return_best_words(sample_words.word)
print(len(words_and_scores), words_and_scores[:5], words_and_scores[-5:])

## Daily Runs

In [None]:
# Start the daily run from the full solution list.
temp = words_df.copy()
print(len(temp))

In [None]:
# Apply the feedback for the guess 'least' and compare the candidate count before/after.
# Response letters, as interpreted by clueUpdate: G = right position,
# Y = in the word but another position, R = not in the word.
print(len(temp))
clueUpdate('least','YYRYR')
print(len(temp))
temp

In [None]:
# Hand-written filter for the guess 'least': l, e and s must appear, a and t must not,
# and no letter of 'least' may sit in the position where it was guessed.
words = temp.word
has_required = words.str.contains('l') & words.str.contains('e') & words.str.contains('s')
has_excluded = words.str.contains('a') | words.str.contains('t')
off_position = (
    (temp['Letter1'] != 'l')
    & (temp['Letter2'] != 'e')
    & (temp['Letter3'] != 'a')
    & (temp['Letter4'] != 's')
    & (temp['Letter5'] != 't')
)
temp = temp[has_required & ~has_excluded & off_position]
print(len(temp))
temp.head()

In [None]:
# Rank every remaining candidate as the next guess; the list comes back sorted
# by ascending mean remaining-candidate count, so the first five are the best.
bestGuesses = return_best_words(temp.word)
bestGuesses[:5]

In [None]:
# Score one specific guess against the current candidates in `temp`.
return_best_words(['spent'])

In [None]:
# Per-candidate remaining-word counts if "pulse" were guessed next.
simulate(temp, "pulse")

## DEV Timing

In [None]:
%%timeit
temp = words_df.copy()
# letters_retrieved = 
word_vector = temp.word.apply(lambda x: len(set("least").intersection(x)))
print(np.mean(word_vector))
word_vector.hist()

In [None]:
# For each solution word, count how many letters of "least" are already in the
# right position, then show the mean and the distribution.
temp = words_df.copy()
guess_word = "least"
positions_vector = temp.word.apply(
    lambda candidate: sum(candidate[i] == guess_word[i] for i in range(5))
)
print(np.mean(positions_vector))
positions_vector.hist()

In [None]:
# For each solution word, count how many letters of "slate" are already in the
# right position, then show the mean and the distribution.
temp = words_df.copy()
guess_word = "slate"
positions_vector = temp.word.apply(
    lambda candidate: sum(candidate[i] == guess_word[i] for i in range(5))
)
print(np.mean(positions_vector))
positions_vector.hist()

In [None]:
def score_first_guess(guess_word, toPrint=False):
    """Score an opening guess against every word in the global ``words_df``.

    Parameters
    ----------
    guess_word : str
        The opener to evaluate; indexed at positions 0-4.
    toPrint : bool, default False
        Print both averages before returning.

    Returns
    -------
    tuple
        ``(mean distinct letters shared with the guess,
        mean letters matching the guess position-for-position)``.
    """
    candidates = words_df.copy()
    guess_letters = set(guess_word)
    shared_letters = candidates.word.apply(
        lambda candidate: len(guess_letters.intersection(candidate))
    )
    exact_hits = candidates.word.apply(
        lambda candidate: sum(candidate[i] == guess_word[i] for i in range(5))
    )
    mean_letters = np.mean(shared_letters)
    mean_positions = np.mean(exact_hits)
    if toPrint:
        print("Average letters retrieved: ", mean_letters,
              "Average correct number of positions: ", mean_positions
             )
    return (mean_letters, mean_positions)
    
#     print(np.mean(positions_vector))
#     positions_vector.hist()

In [None]:
# Combined opener score = mean shared letters + mean exact-position matches.
temp['FirstGuessScore'] = temp.word.apply(lambda x: sum(score_first_guess(x)))
# `ascending` must be a real boolean; the string 'True' is rejected by recent pandas.
temp = temp.sort_values(by='FirstGuessScore', ascending=True)
temp.head()

In [None]:
# Sorted ascending by FirstGuessScore, so the tail holds the highest-scoring openers.
temp.tail()

In [None]:
# Keep the full (mean shared letters, mean exact-position matches) tuple per word.
temp['FirstGuessScores'] = temp.word.apply(score_first_guess)

In [None]:
# Inspect the last rows now that the FirstGuessScores column has been added.
temp.tail()

In [None]:
# Pull the first tuple element (mean shared letters) into its own column and
# rank the words by it, best first.
first_component = temp['FirstGuessScores'].map(lambda scores: scores[0])
temp['LettersRetrieved'] = first_component
temp = temp.sort_values(by='LettersRetrieved', ascending=False)
temp.head()

In [None]:
# Sorted descending by LettersRetrieved, so the tail holds the weakest openers on that measure.
temp.tail()

In [None]:
# Relative frequency of each distinct LettersRetrieved value.
temp.LettersRetrieved.value_counts(normalize=True)

In [None]:
# Distinct letters each solution word shares with "arise": mean and distribution.
temp = words_df.copy()
arise_letters = set("arise")
word_vector = temp.word.apply(lambda candidate: len(arise_letters & set(candidate)))
print(np.mean(word_vector))
word_vector.hist()

In [None]:
# Distinct letters each solution word shares with "slate": mean and distribution.
temp = words_df.copy()
slate_letters = set("slate")
word_vector = temp.word.apply(lambda candidate: len(slate_letters & set(candidate)))
print(np.mean(word_vector))
word_vector.hist()

In [None]:
# NOTE(review): this cell is identical to the previous "slate" cell -- presumably
# a different guess word was intended; confirm or remove the duplicate.
temp = words_df.copy()
# Distinct letters each solution word shares with "slate": mean and distribution.
word_vector = temp.word.apply(lambda x: len(set("slate").intersection(x)))
print(np.mean(word_vector))
word_vector.hist()