In [230]:
import pandas as pd

In [231]:
# Folder holding the input CSV files. A forward slash is used because it is
# accepted as a path separator on Windows as well as on Linux/macOS, whereas a
# literal backslash only works on Windows. The trailing separator is kept
# because the file name is appended directly to this string when loading.
data_folder = 'data/'

## Data

### Word frequency
Let's start with the English Word Frequency dataset on Kaggle -- https://www.kaggle.com/rtatman/english-word-frequency

This exercise is about trying to match the most frequent English words with the most frequent words in other sources / languages.  We can take this data (a reduced set of the Google corpus) as a manageably sized set of words to deal with.

There are a couple of steps to what needs to happen:
1)  Read the data and make sure we understand it
2)  Cut down the corpus to only 5-letter words
3)  Compute a new column that represents the relative probability of the 5-letter words specifically
4)  Extract from all of these words the letter frequency


#### Read the data and examine it

In [232]:
# File name of the Kaggle unigram list; it is appended to data_folder as-is.
kaggle_file = 'unigram_freq.csv'

# Load the full corpus, then report its dimensions and preview the first rows.
freq_all_df = pd.read_csv('{}{}'.format(data_folder, kaggle_file))
print(freq_all_df.shape)
freq_all_df.head(n=5)

(333333, 2)


Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


Pretty simple: about a third of a million words (333,333 rows), each with its total count in the corpus.

#### Cut down the corpus to 5-letter words

In [233]:
# Keep only the rows whose word is exactly five characters long. The explicit
# copy yields an independent frame, so columns added to it later do not touch
# freq_all_df.
freq_flw_df = freq_all_df.loc[freq_all_df['word'].str.len().eq(5)].copy()
print(freq_flw_df.shape)
freq_flw_df.head(n=5)


(39933, 2)


Unnamed: 0,word,count
35,about,1226734006
45,other,978481319
56,which,810514085
57,their,782849411
62,there,701170205


Down to 39933 words out of the corpus.

#### Compute 5-letter word probability

In [234]:
# Relative probability of each word among the 5-letter words only: its count
# divided by the sum of all 5-letter-word counts. Both steps are vectorized
# pandas operations rather than a Python-level loop over the column.
total_count = freq_flw_df['count'].sum()
probs = freq_flw_df['count'] / total_count
freq_flw_df['prob'] = probs
freq_flw_df.head(10)

Unnamed: 0,word,count,prob
35,about,1226734006,0.017723
45,other,978481319,0.014136
56,which,810514085,0.011709
57,their,782849411,0.01131
62,there,701170205,0.01013
82,first,578161543,0.008353
85,would,572644147,0.008273
92,these,541003982,0.007816
93,click,536746424,0.007754
100,price,501651226,0.007247


#### Extract letter frequency
Old-school answer is a for loop over each word, counting letters and incrementing a dataframe of letters.  Not a bad approach, just a bit slow.

In more recent times, Python and Pandas have tried to create optimized sorting and summary functions to handle this type of work.  First we will create one giant string from all of the words and then feed this string into the Counter function.

In [235]:
from collections import Counter

# Glue every 5-letter word into one long string, then tally each character.
# NOTE: a later cell defines a function that is also called letter_freq, which
# rebinds this name once that cell has run.
all_words = "".join(freq_flw_df['word'].tolist())
letter_freq = Counter()
letter_freq.update(all_words)
letter_freq.most_common(n=10)

[('a', 21942),
 ('e', 18907),
 ('o', 14627),
 ('i', 13749),
 ('s', 13683),
 ('r', 12185),
 ('n', 11447),
 ('l', 10375),
 ('t', 9856),
 ('c', 7422)]

Not a huge surprise that a, e, i & o are 4 of the top 5 ... so now we have the first letters to try as our first word ... aeois.

Except that each word in Wordle has to be a real word.  If we look down through the list of most common words, the word THEIR has 4 of the top 10 letters in use.  Let's start with that. 

### Wordle time

#### Round 1 -- enter THEIR
green, yellow, or gray

green = proper position
yellow = proper letter wrong position
gray = invalid letter

In [236]:
# Knowledge gathered so far: one slot per position for confirmed (green)
# letters, plus the letters known to be in / not in the answer.
correct_letters = ['' for _ in range(5)]
good_letters = list()
bad_letters = list()

# First guess and the colour reported for each of its letters.
word_1 = list('their')
result_1 = ['gray'] * 5


OK ... now we can eliminate all of the grays, set the value of any greens in the final word and use the yellows to continue digging

In [237]:
def build_rules(word, results, correct, good, bad):
    """Fold the feedback for one guess into the accumulated Wordle rules.

    Parameters
    ----------
    word : sequence of str
        Letters of the guess, in order.
    results : sequence of str
        Colour reported for each letter of ``word``: 'green' or 'yellow';
        any other value is treated as gray.
    correct : list of str
        Confirmed letter per position ('' where unknown). Updated in place.
    good : list of str
        Letters known to be in the answer. Updated in place, no duplicates.
    bad : list of str
        Letters known to be absent. Updated in place, no duplicates.

    Returns
    -------
    tuple
        The same ``(correct, good, bad)`` list objects that were passed in.
    """
    hits = ('green', 'yellow')
    positions = range(len(word))

    # Pass 1: record every green/yellow first, so that the gray pass below can
    # see all letters confirmed by this guess regardless of their position.
    for i in positions:
        if results[i] not in hits:
            continue
        letter = word[i]
        if results[i] == 'green':
            correct[i] = letter
        if letter not in good:
            good.append(letter)

    # Pass 2: a gray tile only rules a letter out when that letter is not
    # already known to be in the answer. For a repeated letter (one copy
    # green/yellow, another gray) the gray merely limits how many copies exist;
    # marking it bad would later filter out the real answer.
    for i in positions:
        letter = word[i]
        if results[i] in hits or letter in good:
            continue
        if letter not in bad:
            bad.append(letter)

    return correct, good, bad


# Fold the first guess's feedback into the running rule lists.
correct_letters, good_letters, bad_letters = build_rules(
    word=word_1,
    results=result_1,
    correct=correct_letters,
    good=good_letters,
    bad=bad_letters,
)

# Show the rule state, one list per line.
print(correct_letters, good_letters, bad_letters, sep='\n')

['', '', '', '', '']
[]
['t', 'h', 'e', 'i', 'r']


#### Round 2
and we're starting to see some repetitive code.  We'll turn those into functions for the final algorithms


#### strip out all words with bad letters

In [238]:
def remove_bad_words(df, bad_letters):
    """Drop every row whose word contains at least one of the bad letters.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate words; must have a 'word' column.
    bad_letters : list of str
        Letters known to be absent from the answer. Each entry is matched
        case-insensitively as a literal substring (not as a regular
        expression). An empty list removes nothing.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose word contains none of the bad letters, with
        the original index and columns. Rows with a missing word are kept.
    """
    words = df['word']

    # Start from "no word is bad" so an empty bad_letters list is a no-op
    # instead of an IndexError.
    has_bad_letter = pd.Series(False, index=df.index)
    for letter in bad_letters:
        # regex=False: characters such as '.' or '+' must match themselves.
        has_bad_letter = has_bad_letter | words.str.contains(
            str(letter), case=False, regex=False, na=False
        )

    return df[~has_bad_letter]


# Round-2 candidates: every 5-letter word free of the letters ruled out so far.
r2_words_df = remove_bad_words(df=freq_flw_df, bad_letters=bad_letters)
print(r2_words_df.shape)
r2_words_df.head(n=10)

(6473, 3)


Unnamed: 0,word,count,prob
85,would,572644147,0.008273
162,books,347710184,0.005023
207,could,302311431,0.004367
236,local,270742935,0.003911
270,black,244690155,0.003535
294,found,232005894,0.003352
349,small,208371878,0.00301
387,class,191087771,0.002761
570,young,136341684,0.00197
802,sound,100010833,0.001445


#### and keep only words with good letters

In [239]:
def keep_good_words(df, good_letters, correct_letters):
    """Keep only the words that satisfy the 'good' and 'correct' letter rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate words; must have a 'word' column.
    good_letters : list of str
        Letters that must each appear somewhere in the word. Matched
        case-insensitively as literal substrings (not regular expressions).
    correct_letters : list of str
        Confirmed letter for each position, '' where the position is still
        unknown. Compared case-sensitively against the character at exactly
        that position.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with the original index and columns. When
        nothing matches the result is empty but still carries ``df``'s
        columns, so later ``['word']`` lookups keep working.
    """
    words = df['word']
    keep = pd.Series(True, index=df.index)

    # Every good letter has to occur somewhere in the word.
    for letter in good_letters:
        keep = keep & words.str.contains(
            str(letter), case=False, regex=False, na=False
        )

    # Every confirmed letter has to sit at its confirmed position. Words that
    # are too short to have that position compare as not equal and drop out.
    for position, letter in enumerate(correct_letters):
        if letter != '':
            keep = keep & (words.str[position] == letter)

    return df[keep]

# Narrow the round-2 candidates further using the good / correct letter rules.
r2_words_df = keep_good_words(
    df=r2_words_df,
    good_letters=good_letters,
    correct_letters=correct_letters,
)
print(r2_words_df.shape)
r2_words_df.head(n=10)

(6473, 3)


Unnamed: 0,word,count,prob
85,would,572644147,0.008273
162,books,347710184,0.005023
207,could,302311431,0.004367
236,local,270742935,0.003911
270,black,244690155,0.003535
294,found,232005894,0.003352
349,small,208371878,0.00301
387,class,191087771,0.002761
570,young,136341684,0.00197
802,sound,100010833,0.001445


In [240]:
def letter_freq(df):
    """Count how often each character occurs across the 'word' column of df.

    All words are concatenated into a single string, and the returned
    ``collections.Counter`` maps each character to its number of occurrences.
    """
    joined = "".join(df['word'])
    return Counter(joined)

# Letter tallies over the words still in play after round 2.
letters = letter_freq(df=r2_words_df)
letters.most_common(n=10)

[('a', 5429),
 ('o', 4024),
 ('s', 2742),
 ('l', 2578),
 ('n', 2345),
 ('u', 1726),
 ('c', 1636),
 ('m', 1556),
 ('g', 1455),
 ('d', 1396)]

OK, now we have a list of remaining words that do not contain any removed letters and also contain all good letters.  Any CORRECT letters should be in place to narrow down the choices.

In this first example, the word BOARD has the most letters in the top 5

### Round 3

In [241]:
# Second guess and the colour reported for each of its letters.
word_2 = list('could')
result_2 = ['green', 'gray', 'green', 'green', 'gray']


In [242]:
# Fold the second guess's feedback into the running rule lists.
correct_letters, good_letters, bad_letters = build_rules(
    word=word_2,
    results=result_2,
    correct=correct_letters,
    good=good_letters,
    bad=bad_letters,
)

# Narrow the round-2 candidates with the updated rules, then re-tally letters.
r3_words_df = keep_good_words(
    remove_bad_words(r2_words_df, bad_letters),
    good_letters,
    correct_letters,
)
letters = letter_freq(r3_words_df)


In [243]:
# Current rule state, one list per line.
print(correct_letters, good_letters, bad_letters, sep='\n')

# Size of the remaining candidate pool and its most frequent letters.
print(r3_words_df.shape, letters.most_common(10), sep='\n')
r3_words_df.head(n=10)

['c', '', 'u', 'l', '']
['c', 'u', 'l']
['t', 'h', 'e', 'i', 'r', 'o', 'd']
(3, 3)
[('c', 3), ('u', 3), ('l', 3), ('a', 2), ('s', 2), ('k', 1), ('b', 1)]


Unnamed: 0,word,count,prob
59571,caulk,245944,3.553138e-06
92706,csulb,113291,1.636708e-06
247370,csula,21559,3.114616e-07


Only 3 remain.  And now we have a few more letters to look for.  We know that 'r' does not belong in the 4th or 5th position, that letter 2 is not 'o', and that letter 3 is not 'a', so we need to code up a filter that can eliminate words that break these rules.

### Round 4

From this, the next best guess would be group ... right now we do not have a good way to keep track of where a letter should NOT be.  If we can build that, this can likely be fully automated, with the word of highest statistical probability being used each round.

In [216]:
# Third guess and the colour reported for each of its letters.
word_3 = list('caulk')
result_3 = ['green'] * 5


In [217]:
# Fold the third guess's feedback into the running rule lists.
correct_letters, good_letters, bad_letters = build_rules(
    word=word_3,
    results=result_3,
    correct=correct_letters,
    good=good_letters,
    bad=bad_letters,
)

# Narrow the round-3 candidates with the updated rules, then re-tally letters.
r4_words_df = keep_good_words(
    remove_bad_words(r3_words_df, bad_letters),
    good_letters,
    correct_letters,
)
letters = letter_freq(r4_words_df)


In [218]:
# Current rule state, one list per line.
print(correct_letters, good_letters, bad_letters, sep='\n')

# Size of the remaining candidate pool and its most frequent letters.
print(r4_words_df.shape, letters.most_common(10), sep='\n')
r4_words_df.head(n=10)

['a', 'r', 'o', '', '']
['r', 'o', 'a']
['t', 'h', 'e', 'i', 'b', 'd', 'u', 'w']
(6, 3)
[('r', 3275), ('a', 2557), ('o', 1612), ('s', 896), ('n', 698), ('d', 672), ('u', 672), ('c', 601), ('m', 557), ('b', 501)]


Unnamed: 0,word,count,prob
17422,aroma,2018777,2.916515e-05
50754,arora,324655,4.690271e-06
87677,arosa,124877,1.80409e-06
102065,arona,95660,1.381994e-06
180494,aroon,36090,5.2139e-07
182818,arons,35316,5.102081e-07


And now we have 6 words to look into for the next attempt

### Round 5

In [184]:
# Guess for this round and the colour reported for each of its letters.
# NOTE(review): this rebinds word_3 / result_3, the same names the previous
# round's guess cell assigns.
word_3 = list('aroma')
result_3 = ['gray', 'green', 'green', 'gray', 'gray']


In [186]:
# Fold this round's feedback into the running rule lists.
correct_letters, good_letters, bad_letters = build_rules(
    word=word_3,
    results=result_3,
    correct=correct_letters,
    good=good_letters,
    bad=bad_letters,
)

# Filtering starts again from r3_words_df and the result is stored in
# r4_words_df -- the same source and target names the previous round uses.
r4_words_df = keep_good_words(
    remove_bad_words(r3_words_df, bad_letters),
    good_letters,
    correct_letters,
)
letters = letter_freq(r4_words_df)


In [187]:
# Current rule state, one list per line.
print(correct_letters, good_letters, bad_letters, sep='\n')

# Size of the remaining candidate pool and its most frequent letters.
print(r4_words_df.shape, letters.most_common(10), sep='\n')
r4_words_df.head(n=10)

['', 'r', 'o', '', '']
['r', 'o', 'a']
['t', 'h', 'e', 'i', 'b', 'd', 'g', 'u', 'p']
(56, 3)
[('r', 3275), ('a', 2557), ('o', 1612), ('s', 896), ('n', 698), ('d', 672), ('u', 672), ('c', 601), ('m', 557), ('b', 501)]


Unnamed: 0,word,count,prob
1082,cross,74230978,0.001072
4218,crown,16651038,0.000241
17422,aroma,2018777,2.9e-05
20127,krona,1585436,2.3e-05
22603,crows,1294733,1.9e-05
25334,crook,1067661,1.5e-05
27589,crock,918524,1.3e-05
31299,frown,742356,1.1e-05
32762,kroon,687367,1e-05
47472,kroll,363969,5e-06


And now we have 56 words to look into for the next attempt