In [1]:
import pandas as pd
from collections import Counter
import pprint as pp


In [36]:
class wordle_game:
    
    def __init__(self):
        self.round_num = 0
        self.word_letters = {
            0: { 'is': None, 'is_not': [] },
            1: { 'is': None, 'is_not': [] },
            2: { 'is': None, 'is_not': [] },
            3: { 'is': None, 'is_not': [] },
            4: { 'is': None, 'is_not': [] },           
        }
        self.good_letters = []
        self.bad_letters = []
    
    def load_data(self, folder, filename):
        self.words = pd.read_csv(f"{folder}{filename}")
        print(f"Words in corpus {self.words.shape[0]}")

    def prep_data(self):
        self.new_words = self.words[self.words['word'].str.len() == 5].copy().reset_index()

        total_count = sum(self.new_words['count'])
        probs = [x / total_count for x in self.new_words['count']]
        self.new_words['prob'] = probs

        print(f"Count of 5 Letter Words {self.new_words.shape[0]}")
        print("Highest frequency 5 letter words")
        print(self.new_words.head(10))

        self.get_letter_freq()
        print("Highest frequency letters")
        print(self.letter_freq.most_common(10))
        
    def recommend_next(self, options=50, display=10, allow_dupe_letters=True):
        # compute score for each of the top X words
        ranked_words = []
        print(f"\nRemaining word count {self.new_words.shape[0]}")
        
        for index, this_row in self.new_words[:options].iterrows():
            score = 0
            this_word = this_row['word']

            c = Counter(this_word)
            if allow_dupe_letters or max(c.values()) == 1:
                for i in range(5):
                    score += self.letter_freq[this_word[i]]

            score = round(score * this_row['prob'], 0)
            ranked_words.append([this_word, score])

        print(pd.DataFrame(ranked_words).sort_values(by=[1], ascending=False).head(display))
              
    def get_letter_freq(self):
        all_words = "".join(self.new_words['word'])
        self.letter_freq = Counter(all_words)

    def build_rules(self, word, results):
        for i in range(5):
            result = results[i]
            letter = word[i]

            if result == 'c':
                self.word_letters[i]['is'] = letter
                if letter not in self.good_letters:
                    self.good_letters.append(letter)
            elif result == 'y':
                self.word_letters[i]['is_not'].append(letter)
                if letter not in self.good_letters:
                    self.good_letters.append(letter)
            else:
                if letter not in self.bad_letters:
                    self.bad_letters.append(letter)

    def remove_bad_words(self):
        pattern = ""
        for i in range(len(self.bad_letters)-1):
            pattern += f"{self.bad_letters[i]}|"
        pattern += f"{self.bad_letters[-1]}"
        self.new_words = self.new_words[~self.new_words['word'].str.contains(pattern, case=False)]

    def words_with_good_letters(self):
        # cut down to only the words containing all of the good letters
        for i in range(len(self.good_letters)):
            self.new_words = self.new_words[self.new_words['word'].str.contains(self.good_letters[i], case=False)]
        # self.new_words.reset_index(inplace=True)
        
    def letters_in_correct_place(self):
        # now we can iterate across this smaller list to check placement of the correct letters
        keep_rows = []
        for _, row in self.new_words.iterrows():
            keep = True
            for i in range(5):
                if self.word_letters[i]['is'] != None:
                    if row['word'][i] != self.word_letters[i]['is']:
                         keep = False
                elif row['word'][i] in self.word_letters[i]['is_not']:
                    keep=False

            if keep:
                keep_rows.append(row)

        self.new_words = pd.DataFrame(keep_rows)

    def show_results(self):
        print(f"\nRound Number {self.round_num}")
        print("\n")
        print(f"Correct= {self.word_letters}")
        print(f"Good\t= {self.good_letters}")
        print(f"Bad\t= {self.bad_letters}")
        # print(f"\tRemaining Words = {self.new_words.shape[0]}")
        # print(self.letter_freq.most_common(10))
        # print(self.new_words.head(10))
        
    def new_round(self, word, results, round_num):
        self.round_num = round_num

        if results == 'ccccc':
            print(f"\nCongrats!  Victory in round {self.round_num}")
        else:
            self.build_rules(word, results)
            self.remove_bad_words()
            self.words_with_good_letters()
            self.letters_in_correct_place()
            self.get_letter_freq()
            self.show_results()


In [37]:
data_folder = 'data\\'
kaggle_file = 'unigram_freq.csv'


In [38]:
game = wordle_game()
game.load_data(data_folder, kaggle_file)
game.prep_data()
game.recommend_next(allow_dupe_letters=False)


Words in corpus 333333
Count of 5 Letter Words 39933
Highest frequency 5 letter words
   index   word       count      prob
0     35  about  1226734006  0.017723
1     45  other   978481319  0.014136
2     56  which   810514085  0.011709
3     57  their   782849411  0.011310
4     62  there   701170205  0.010130
5     82  first   578161543  0.008353
6     85  would   572644147  0.008273
7     92  these   541003982  0.007816
8     93  click   536746424  0.007754
9    100  price   501651226  0.007247
Highest frequency letters
[('a', 21942), ('e', 18907), ('o', 14627), ('i', 13749), ('s', 13683), ('r', 12185), ('n', 11447), ('l', 10375), ('t', 9856), ('c', 7422)]

Remaining word count 39933
        0       1
0   about  1032.0
1   other   865.0
3   their   682.0
11  email   461.0
5   first   440.0
9   price   419.0
14  after   356.0
19  years   348.0
6   would   341.0
21  items   302.0


In [39]:
game.new_round('email', 'yxxxx', round_num=1)
game.recommend_next(options=20, display=20, allow_dupe_letters=False)



Round Number 1


Correct= {0: {'is': None, 'is_not': ['e']}, 1: {'is': None, 'is_not': []}, 2: {'is': None, 'is_not': []}, 3: {'is': None, 'is_not': []}, 4: {'is': None, 'is_not': []}}
Good	= ['e']
Bad	= ['m', 'a', 'i', 'l']

Remaining word count 4474
        0      1
0   other  163.0
6   store   53.0
5   under   49.0
7   those   44.0
8   phone   38.0
10  house   36.0
11  power   35.0
16  quote   20.0
17  poker   20.0
4   order    0.0
3   where    0.0
2   these    0.0
9   check    0.0
1   there    0.0
12  three    0.0
13  press    0.0
14  never    0.0
15  users    0.0
18  offer    0.0
19  needs    0.0


In [40]:
game.new_round('store', 'xcyyy', round_num=2)
game.recommend_next(options=20, display=20)



Round Number 2


Correct= {0: {'is': None, 'is_not': ['e']}, 1: {'is': 't', 'is_not': []}, 2: {'is': None, 'is_not': ['o']}, 3: {'is': None, 'is_not': ['r']}, 4: {'is': None, 'is_not': ['e']}}
Good	= ['e', 't', 'o', 'r']
Bad	= ['m', 'a', 'i', 'l', 's']

Remaining word count 3
       0    1
0  other  0.0
1  otter  0.0
2  otehr  0.0


In [7]:
game.new_round('other', 'ccccc', round_num=3)
game.recommend_next(options=20, display=20)



Congrats!  Victory in round 3

Remaining word count 3
       0   1
1  otter  17
0  other  15
2  otehr  15


In [None]:
game.new_round('spill', 'cxccc')


In [None]:
game.new_round('swill', 'ccccc')