In [1]:
%load_ext cython

In [2]:
%%cython --annotate -3

cdef:
    int RESULT_CORRECT = 1
    int RESULT_IN_WORD = 2
    int RESULT_INVALID = 3
    

cdef class Constraint:

    cdef public: 
        int type_
        int index
        str letter
        
    def __cinit__(self, int type_, int index, str letter):
        self.type_ = type_
        self.index = index
        self.letter = letter
        
    cpdef bint match(self, str word):
        if self.type_ == RESULT_CORRECT:
            return word[self.index] == self.letter
        elif self.type_ == RESULT_IN_WORD:
            return word[self.index] != self.letter and self.letter in word
        elif self.type_ == RESULT_INVALID:  # INVALID
            return self.letter not in word
        else:
            print(self.type_)
            raise Exception()



In [3]:
%%cython -3

cpdef float match_probability(list constraints, list words):
   cdef float num_total_words = len(words)
   for c in constraints:
       words = [w for w in words if c.match(w)]
   return len(words) / num_total_words

In [4]:
import pandas as pd

dataset = pd.read_csv('./unigram_freq_wordle.csv')

In [5]:
import sqlite3
con = sqlite3.connect('word_entropy.db')

In [6]:
cur = con.cursor()

# Create tables
cur.execute('''CREATE TABLE IF NOT EXISTS word_info
               (id integer primary key, word text, entropy real, probability real)''')
cur.execute('''CREATE TABLE IF NOT EXISTS probability_constraint
               (id integer primary key autoincrement, constraint1 integer, constraint2 integer, constraint3 integer, constraint4 integer, constraint5 integer, probability real, id_word integer references word_info(id))''')

<sqlite3.Cursor at 0x7f71076c7c00>

In [15]:
drop = False
if drop:
    cur.execute('DELETE FROM probability_constraint')
    cur.execute('DELETE FROM word_info')

In [None]:
from itertools import product
from collections import Counter
import numpy as np
from tqdm import tqdm_notebook

def compute_constraints_entropy(word: str, id_word: int, word_list: list):
    """
    Compute probability distribution of all possible constrains and then compute entropy
    Warning: exponential complexity!! Implement with cython and test before run on all words
    """
    distribution = []
    for w_constraints in product([3,2,1], repeat=5):
        if Counter(w_constraints) == {1:4,2:1}:
            continue
        constraints = [Constraint(w_constraints[i], i, word[i]) for i in range(5)]
        p = match_probability(constraints, word_list)
        distribution.append(p)
        cur.execute(f'''INSERT INTO probability_constraint (constraint1, constraint2, constraint3, constraint4, constraint5, probability, id_word) 
                                                            VALUES ({w_constraints[0]}, {w_constraints[1]}, {w_constraints[2]}, {w_constraints[3]}, {w_constraints[4]}, {p}, {id_word})''')
    distribution = np.array(distribution)
    distribution = distribution[distribution > 0.0]
    entropy = - (distribution * np.log2(distribution)).sum()
    return entropy
    

remaining_indexes = dataset.index.difference([x[0] for x in cur.execute('select id from word_info')])
    
for i in tqdm_notebook(remaining_indexes):
    cur.execute(f'''INSERT INTO word_info (id, word) 
                    VALUES ({i}, '{dataset.loc[i, 'word']}');''')
    entropy = compute_constraints_entropy(dataset.loc[i, 'word'], i, dataset.word.tolist())
    cur.execute(f'''UPDATE word_info
                    SET entropy = {entropy}
                    WHERE id = {i};''')
    if (i % 10) == 0:
        con.commit()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for i in tqdm_notebook(remaining_indexes):


  0%|          | 0/12972 [00:00<?, ?it/s]

In [10]:
con.commit()

[0, 1, 2, 3, 4]

In [None]:
cur.close()