# **Scrabble superstring**
## [Riddler Classic, Jun. 28, 2019](https://fivethirtyeight.com/features/whats-your-best-scrabble-string/)

### solution by [Laurent Lessard](https://laurentlessard.com)

---

## Some preliminaries

Here I tabulate the tile distribution and collect the list of admissible words. I used the [ENABLE word list](https://norvig.com/ngrams/enable1.txt), as instructed in the problem statement.

In [155]:
import numpy as np
import random
from gurobipy import *
import networkx as nx
import matplotlib.pyplot as plt
from functools import lru_cache

In [156]:
# Load the list of legal words. There are N words total
f = open("enable1.txt","r")
WORDLIST = [ w[:-1] for w  in f ]
N = len(WORDLIST)
print('The word list contains', N, 'words.')

The word list contains 172820 words.


In [157]:
# Make a list of all the letter tiles
TILES = 12*'e' + 9*'a' + 9*'i' + 8*'o' + 6*'n' + 6*'r' + 6*'t' + 4*'l' \
       + 4*'s' + 4*'u' + 4*'d' + 3*'g' + 2*'b' + 2*'c' + 2*'m' + 2*'p' \
       + 2*'f' + 2*'h' + 2*'v' + 2*'w' + 2*'y' + 'kjxqz' + 2*'_'

# All the letters in the world
ALPHABET = 'abcdefghijklmnopqrstuvwxyz'
L = len(ALPHABET)

In [158]:
# How much is each letter worth, and how much is a word worth?
# an unknown blank is _, and a known blanks are in uppercase
scoredic = dict()
for lett in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_':
    scoredic[lett] = 0
for lett in 'eaionrtlsu':
    scoredic[lett] = 1
for lett in 'dg':
    scoredic[lett] = 2
for lett in 'bcmp':
    scoredic[lett] = 3
for lett in 'fhvwy':
    scoredic[lett] = 4
for lett in 'k':
    scoredic[lett] = 5
for lett in 'jx':
    scoredic[lett] = 8
for lett in 'qz':
    scoredic[lett] = 10
    
@lru_cache(maxsize=None)
def wordscore(word):
    """
    Evaluate the Scrabble score of a given word
    """
    return sum( scoredic[lett] for lett in word )

In [159]:
def cull_wordlist( wordlist, tiles_available ):
    """
    return the subset of a wordlist (list of strings) that can be made
    using tiles from tiles_available (a string, can include blanks _)
    """
    available_letter_counts = np.array([ tiles_available.count(lett) for lett in ALPHABET ])
    available_blank_count = tiles_available.count('_')
    culled_wordlist = []
    
    word_letter_counts = np.array([ [word.count(lett) for lett in ALPHABET] for word in wordlist ])
    for (ix,lettcount) in enumerate(word_letter_counts):
        if sum( [n for n in available_letter_counts-lettcount if n < 0] ) + available_blank_count >= 0:
            culled_wordlist.append( wordlist[ix] )
            
    return culled_wordlist

In [160]:
def findall(s, p):
    """
    Yields all the positions of
    the pattern p in the string s.
    """
    i = s.find(p)
    while i != -1:
        yield i
        i = s.find(p, i+1)


def stringscore(s, wordlist):
    """
    Evaluate the total score of a superstring, looking for words in wordlist
    Use this when the string doesn't contain any blanks ('_' or uppercase)
    """
    scoretot = 0
    for word in wordlist:
        if word in s:
            scoretot += wordscore(word)
    return scoretot

def stringscore_filledblanks(s, wordlist):
    """
    Evaluate the total score of a superstring that contains filled blanks
    (filled in as uppercase letter)
    """
    slower = s.lower()
    scoretot = 0
    for word in wordlist:
        if word in slower:
            scoretot += max( [ wordscore(s[i:i+len(word)]) for i in findall(slower, word) ] )
    return scoretot
    
def stringscore_bestblanks(s, wordlist):
    """
    Evaluate the total score of a superstring that contains blanks ('_')
    by filling in the blanks with the letter yielding the highest score
    """
    if '_' not in s:
        return stringscore_filledblanks(s, wordlist)
    
    locblanks = list(findall(s, '_'))
    numblanks = len(locblanks)
    slist = list(s)
    scorelist = []
    for vals in itertools.product(ALPHABET.upper(), repeat=numblanks):
        for (ix,i) in enumerate(locblanks):
            slist[i] = vals[ix]
            scorelist.append( stringscore_filledblanks(''.join(slist), wordlist) )
    return max(scorelist)

In [161]:
def dotheymatch(w1,w2,lett):
    """
    Returns True if words w1 and w2 can both match while sharing letter lett.
    We assume w1 and w2 each contain letter lett exactly once.
    """
    i1 = w1.find(lett)
    i2 = w2.find(lett)
    match_right = (w1[i1:] in w2) or (w2[i2:] in w1)
    match_left  = (w1[:i1+1] in w2) or (w2[:i2+1] in w1)
    return match_right and match_left

def getlettwords(wordlist, lett):
    """
    filter a wordlist by those containing a particular letter exactly once
    """
    lwords = []
    lwordscores = []
    for w in wordlist:
        if w.count(lett) == 1:
            lwords.append(w)
            lwordscores.append(wordscore(w))
    return (lwords,lwordscores)

def makegraph(lwords,lett):
    """
    assume lwords is a list of words containing the letter lett exactly once.
    create graph where each node is one of the lwords. two lwords are connected
    by an edge if they can both simultaneously use letter lett (admissible overlap)
    """
    n = len(lwords)
    elist = [ (w1,w2) for (w1,w2) in itertools.combinations(lwords,2) if dotheymatch(w1,w2,lett) ]
    return nx.Graph(elist)

def unionize(clique,lett):
    """
    take in a clique of words that all match on letter lett
    and return the union word that contains all words in the clique
    """
    leftwords = []
    rightwords = []
    for w in clique:
        ix = w.find(lett)
        leftwords.append( w[:ix] )
        rightwords.append( w[ix+1:] )
    leftwords.sort(key=len)
    rightwords.sort(key=len)
    return leftwords[-1] + lett + rightwords[-1]

def getbestcliques(G,lett):
    """
    given a graph created using makegraph and the letter lett,
    for each maximal clique, return (each item is a list)
    ( clique score, list of clique words, unionized clique word )
    """
    cliques = nx.find_cliques(G)
    cliquescores = []
    cliquewords = []
    cliqueunion = []
    for c in cliques:
        cliquewords.append( c )
        cliquescores.append( sum( [wordscore(w) for w in c] ) )
        cliqueunion.append( unionize(c,lett) )
    isrt = np.argsort(cliquescores)[::-1]
    return [ cliquescores[i] for i in isrt ], [ cliquewords[i] for i in isrt ], [ cliqueunion[i] for i in isrt ]

In [162]:
def majorize_test( list1, list2 ):
    """
    returns True if every element of list 1 is >= the corresp elem of list2
    """
    return all( [ list1[i] >= list2[i] for i in range(len(list1)) ] )

def eliminate_letter( tiles, wordlist, lett, k=1 ):
    """
    Find the k best word clusters that use the letter lett exactly once
    given the tiles and wordlist available. Return:
    (winning cluster, cluster score, remaining tiles)
    """
    (words,scores) = getlettwords(wordlist,lett)
    print(len(words), lett, 'words')
    G = makegraph(words,lett)
    scores,_,unions = getbestcliques(G,lett)
    
    # only keep the ones we can actually make with the letters we have:
    tot_lettercount = np.array( [tiles.count(lett) for lett in ALPHABET] )
    lettercount     = np.array( [ [w.count(lett) for lett in ALPHABET] for w in unions ] )
    
    winning_clusters = []
    winning_scores = []
    
    numfound = 0
    for i in range(len(scores)):
        if majorize_test( tot_lettercount, lettercount[i] ):
            winning_clusters.append( unions[i] )
            winning_scores.append( scores[i] )
            numfound += 1
            if numfound >= k:
                break
            
    # assuming we will use the above list of words, what tiles + words are left over?
    tiles_remaining = []
    true_scores = []
    for c in winning_clusters:
        tmp = list(tiles)
        for tile in c:
            tmp.remove(tile)
        tiles_remaining.append( ''.join(tmp) )
        true_scores.append( stringscore_filledblanks(c,wordlist) )
        
    # sort winners by true score
    isrt = np.argsort(true_scores)[::-1]
    
    return [ winning_clusters[i] for i in isrt ], [ true_scores[i] for i in isrt ], [ tiles_remaining[i] for i in isrt ]

In [358]:
def knapsack( clusters, scores, tiles ):
    """
    given cluster and scores dicts, figure out which clusters to select
    so that we respect the tile quotas
    """
    agg_clusters = []
    agg_scores = []
    agg_lettcounts = np.zeros((0,L))
    cluster_lengths = []

    for key,val in clusters.items():
        cluster_lengths = len(val)
        agg_clusters.extend(val)
        agg_lettcounts = np.vstack( [agg_lettcounts, np.array( [ [w.count(lett) for lett in ALPHABET] for w in val ] )] )

    for key,val in scores.items():
        agg_scores.extend(val)
 
    tot_lettercount = np.array( [ tiles.count(lett) for lett in ALPHABET ] )
    

    m = Model("knapsack")
    m.setParam( 'OutputFlag', False )
    m.setParam( 'PoolSearchMode', 2 )  # find the k best solutions
    m.setParam( 'PoolSolutions', 10 )  # k; find this many solutions

    M = len(agg_clusters)

    # Create variables (how many of each meta-word to use).
    w = m.addVars(M, vtype=GRB.BINARY, name="w")

    # total number of words used and number of each letter used
    tot_score = quicksum(w[i]*agg_scores[i] for i in range(M))
    
    letters_used = [ quicksum(agg_lettcounts[i,j]*w[i] for i in range(M)) for j in range(L) ]
    points_used = quicksum(letters_used[j]*scoredic[ALPHABET[j]] for j in range(L))

    # Set objective (minimize number of words used)
    m.setObjective( tot_score, GRB.MAXIMIZE)

    # Constraint: must use as many letters as we have tiles of each sort
    m.addConstrs(  (letters_used[j] <= tot_lettercount[j] for j in range(L) ) )

    m.optimize()
    
    scount = m.SolCount  # number of solutions found
    sols = []
    for k in range(scount):
        m.setParam( 'SolutionNumber', k )
        sols.append( [(agg_clusters[i], agg_scores[i]) for i,v in enumerate(m.Xn) if v > 0.5] )
    
    return sols
#     return [(ix,int(v.x),agg_clusters[ix], agg_scores[ix]) for ix,v in enumerate(m.getVars()) if v.x > 0 ]

In [164]:
# Solve hybrid TSP/Knapsack problem to find the max-scoring permutation of letters
def full_optimize( tiles, wordlist, wordscores ):

    m = Model("TSP/Knapsack")
    m.setParam( 'OutputFlag', False )

    lett_present = [ lett for lett in ALPHABET if lett in tiles ]
    lett_counts = [ tiles.count(lett) for lett in lett_present ]

    p = len(tiles)          # total number of tile Positions (exclude blanks)
    d = len(lett_present)   # number of Distinct tiles remaining
    w = len(wordlist)       # number of Words in the list

    # Z variable: permutation matrix for the letters (distinct lett x tot lett)
    Z = m.addVars(d,p, vtype=GRB.BINARY, name="Z")
    # rows sum to number of distinct tiles of that type, columns sum to 1
    m.addConstrs( (quicksum(Z[i,j] for i in range(d)) == 1 for j in range(p)) )
    m.addConstrs( (quicksum(Z[i,j] for j in range(p)) == lett_counts[i] for i in range(d)) )

    # P variable: true if word i is used in position j
    # F variable: true if word i is found in the permutation at all (Fi is true if Pij >= 1 for some j)
    P = m.addVars(w,p, vtype=GRB.BINARY, name="P")
    F = m.addVars(w, vtype=GRB.BINARY, name="F")
    m.addConstrs( (quicksum(P[k,j] for j in range(p)) >= F[k] for k in range(w)) )

    # CONSTRAINT: template matching for each word
    for k in range(w):
        word = wordlist[k]
        wl = len(word)
        word_template = [ [word[jj] == lett_present[i] for jj in range(wl)] for i in range(d) ]
        for j in range(p):
            if j > p-wl:
                m.addConstr( P[k,j] == 0 )
            else:
                m.addConstr( quicksum( Z[i,j+jj]*word_template[i][jj] for i in range(d) for jj in range(wl) ) >= wl*P[k,j] )

    # total number of words used and number of each letter used
    tot_score = quicksum( F[k] * wordscores[k] for k in range(w))

    # Set objective (minimize number of words used)
    m.setObjective( tot_score, GRB.MAXIMIZE)

    m.optimize()
    
    Z = np.array( [ v.x for ix,v in enumerate(m.getVars()) if ix < d*p ] ).reshape((d,p))
    winstr = ''.join( [ [lett_present[ix] for ix,z in enumerate(Z[:,j]) if z][0] for j in range(p) ] )
    print(winstr)
    print(m.objVal)

In [165]:
def TSPsolve( C ):
    """
    Solve TSP problem where C is the cost matrix
    output is a vector u of the final node ordering. If nodes are N then final ordering is:
    [ N[i] for i in u ]
    Note: we do not loop back -- we solve TSP with arbitrary start/end.
    """
    m = Model("TSP")
    m.setParam( 'OutputFlag', False )

    # augment cost matrix by one
    n = C.shape[0]+1
    c = np.zeros((n,n))
    c[1:,1:] = C
    
    # X variable: permutation matrix for closed path
    X = m.addVars(n,n, vtype=GRB.BINARY, name="X")
    # enforce row and column sums are 1, no self-loops
    m.addConstrs( (quicksum(X[i,j] for i in range(n)) == 1 for j in range(n)) )
    m.addConstrs( (quicksum(X[i,j] for j in range(n)) == 1 for i in range(n)) )
    m.addConstrs( (X[i,i] == 0 for i in range(n)) )

    
    # U variable: ordering
    u = m.addVars(n, vtype=GRB.INTEGER, lb=1, ub=n, name="u")
    # Miller-Tucker-Zemlin constraint
    m.addConstrs( ( u[i] - u[j] + n*X[i,j] <= n-1 for i in range(n) for j in range(1,n) ))
    
    # total score we're trying to maximize
    tot_score = quicksum( X[i,j]*c[i,j] for i in range(n) for j in range(n) )
    
    # Set objective (minimize number of words used)
    m.setObjective( tot_score, GRB.MAXIMIZE)

    m.optimize()
    
    return np.array( [v.x for ix,v in enumerate(m.getVars()) if ix >= n*n ], int )[1:] - 2

In [176]:
def bestperm( wordlist ):

    n = len(wordlist)
    wordscores = [ stringscore_filledblanks(w, WORDLIST) for w in wordlist ]
    rewardmat = np.array( [ [stringscore_filledblanks(w1 + w2, WORDLIST) for w2 in wordlist] for w1 in wordlist ])
    u = TSPsolve(rewardmat)
    return ''.join([ wordlist[i] for i in u])

In [166]:
def remove_ss(string):
    """
    given a string containing 6x 's', find out which two we can remove to
    get the best score (i.e. turn them into blanks)
    """
    ix = findall(string,'s')
    newstrings = [ string[:j1] + 'S' + string[j1+1:j2] + 'S' + string[j2+1:] for (j1,j2) in itertools.combinations(ix,2) ]
    newstrings.sort(key=lambda x: stringscore_filledblanks(x, WORDLIST), reverse=True)
    
    print(newstrings[0])
    print(stringscore_filledblanks(newstrings[0], WORDLIST))
    return(newstrings[0])

In [208]:
def isvalid(string):
    """
    tests if a string is valid
    """
    out = True
    if len(string) > 100:
        print("string too long")
        out = False

    for lett in ALPHABET:
        if string.count(lett) != TILES.count(lett):
            print("incorrect number of", lett)
            out = False        
    return out

In [28]:
tiles_rem = ''.join(random.sample(TILES[:-2],12))
words_rem = cull_wordlist(WORDLIST, tiles_rem)
scores_rem = [ wordscore(w) for w in words_rem ]
print( "tiles:", tiles_rem )
print( "number of words:", len(words_rem) )

tiles: saothgskvaar
number of words: 307


In [29]:
%time full_optimize( tiles_rem, words_rem, scores_rem )

KeyboardInterrupt: 

In [150]:
wordlist = [ 'bluejacketsowe',  'deoxidizers',  'autobiographically',  'inadequatenesses',  'preformatting',  'uninformatively',  'overshadowing']
n = len(wordlist)
wordscores = [ stringscore_filledblanks(w, WORDLIST) for w in wordlist ]
rewardmat = np.array( [ [stringscore_filledblanks(w1 + w2, WORDLIST) for w2 in wordlist] for w1 in wordlist ])

In [151]:
u = TSPsolve(rewardmat)
soln = ''.join([wordlist[i] for i in u])
print(soln)
print(stringscore_filledblanks(soln,WORDLIST))

bluejacketsowedeoxidizersinadequatenessespreformattinguninformativelyautobiographicallyovershadowing
1418


In [154]:
remove_ss(soln)

bluejacketsowedeoxidizerSinadequatenesseSpreformattinguninformativelyautobiographicallyovershadowing
1410


'bluejacketsowedeoxidizerSinadequatenesseSpreformattinguninformativelyautobiographicallyovershadowing'

In [27]:
%time full_optimize( tiles_rem, words_rem, scores_rem )

qghathexif
78.0
Wall time: 13.1 s


In [25]:
%time full_optimize( tiles_rem, words_rem, scores_rem )

propturkr
30.0
Wall time: 2.42 s


In [23]:
%time full_optimize( tiles_rem, words_rem, scores_rem )

vlumirit
26.0
Wall time: 784 ms


In [10]:
full_optimize( 'aaabbcc', ['cab','abc', 'baa'], [7,4,6] )

cabaabc
17.0


In [167]:
# make list of the best stand-alone words
# cull the wordlist (do not use blanks)
tiles_ss = TILES[:-2] + 'ss'
wordlist_ss = cull_wordlist( WORDLIST, tiles_ss )

scores_ss = [ stringscore(w,wordlist_ss) for w in wordlist_ss ]

isrt = np.argsort(scores_ss)[::-1]
scores_ss = [ scores_ss[i] for i in isrt ]
wordlist_ss = [ wordlist_ss[i] for i in isrt ]

In [168]:
# cull the wordlist (assume the blanks will be s)
tiles = TILES[:-2] + 'ss'
wordlist = cull_wordlist( WORDLIST, tiles )

# gather the top 200 clusters from each of four highest-scoring tiles 
letts = list('jqxzk')

clusters = dict()
scores = dict()
tremain = dict()

for lett in letts:
    (ctmp, stmp, ttmp) = eliminate_letter( tiles, wordlist, lett, k=250 )
    clusters[lett] = ctmp
    scores[lett] = stmp
    tremain[lett] = ttmp

2383 j words
2504 q words
4549 x words
6600 z words
11947 k words


In [359]:
clusters['solo'] = wordlist_ss[:2000]
scores['solo'] = scores_ss[:2000]
sols = knapsack( clusters, scores, tiles_ss)

In [363]:
sols[0]

[('cabinetworkingsides', 264),
 ('codevelopers', 236),
 ('prequalifying', 210),
 ('emblazoners', 209),
 ('foreshadowing', 207),
 ('overtaxations', 165),
 ('methylated', 163)]

In [365]:
for sol in sols:
    words_used = [t[0] for t in sol]
    tiles_used = ''.join( words_used )

    tmp = list(tiles_ss)
    for tile in tiles_used:
        tmp.remove(tile)
    tiles_remaining = ''.join(tmp)

    print("words:", words_used)
    print("remaining:", tiles_remaining)
    print("score =", sum( t[1] for t in sol ))

words: ['cabinetworkingsides', 'codevelopers', 'prequalifying', 'emblazoners', 'foreshadowing', 'overtaxations', 'methylated']
remaining: aaiituuuj
score = 1454
words: ['prequalifying', 'cabinetworkingsides', 'codevelopers', 'emblazoners', 'foreshadowing', 'overtaxations', 'methylated']
remaining: aaiituuuj
score = 1454
words: ['prequalifying', 'emblazoners', 'cabinetworkingsides', 'codevelopers', 'foreshadowing', 'overtaxations', 'methylated']
remaining: aaiituuuj
score = 1454
words: ['emblazoners', 'cabinetworkingsides', 'codevelopers', 'prequalifying', 'foreshadowing', 'overtaxations', 'methylated']
remaining: aaiituuuj
score = 1454
words: ['thingamajigsawed', 'emblazoners', 'decarboxylated', 'codevelopers', 'afforestations', 'unthinkingly', 'previewers']
remaining: aiiiootuuuq
score = 1437
words: ['thingamajigsawed', 'decarboxylated', 'emblazoners', 'codevelopers', 'afforestations', 'unthinkingly', 'previewers']
remaining: aiiiootuuuq
score = 1437
words: ['thingamajigsawed', 'decar

In [351]:
tiles_used = ''.join( [t[2] for t in sol] )

tmp = list(tiles_ss)
for tile in tiles_used:
    tmp.remove(tile)
tiles_remaining = ''.join(tmp)
    
print("remaining:", tiles_remaining)
print("score =", sum( t[3] for t in sol ))
sol

remaining: aaiituuuj
score = 1454


[(991, 1, 'emblazoners', 209),
 (1079, 1, 'cabinetworkingsides', 264),
 (1345, 1, 'codevelopers', 236),
 (1517, 1, 'prequalifying', 210),
 (1564, 1, 'foreshadowing', 207),
 (2949, 1, 'overtaxations', 165),
 (3011, 1, 'methylated', 163)]

In [343]:
clusters['solo'] = wordlist_ss[:3000]
scores['solo'] = scores_ss[:3000]
sol = knapsack( clusters, scores, tiles_ss)

tiles_used = ''.join( [t[2] for t in sol] )

tmp = list(tiles_ss)
for tile in tiles_used:
    tmp.remove(tile)
tiles_remaining = ''.join(tmp)
    
print("remaining:", tiles_remaining)
print("score =", sum( t[3] for t in sol ))
sol

remaining: aaiituuuj
score = 1454


[(1079, 1, 'cabinetworkingsides', 264),
 (1345, 1, 'codevelopers', 236),
 (1517, 1, 'prequalifying', 210),
 (1546, 1, 'emblazoners', 209),
 (1564, 1, 'foreshadowing', 207),
 (2949, 1, 'overtaxations', 165),
 (3011, 1, 'methylated', 163)]

In [170]:
tiles_remain = 'aaiituuuj'
wordlist_remain = cull_wordlist( WORDLIST, tiles_remain )
scores_remain = [ wordscore(w) for w in wordlist_remain ]

In [172]:
full_optimize( tiles_remain, wordlist_remain, scores_remain )

jutauaiiu
22.0


In [182]:
wlist = [ t[2] for t in sol ]
wlist.append('jutauaiiu')
wlist

['emblazoners',
 'cabinetworkingsides',
 'codevelopers',
 'prequalifying',
 'foreshadowing',
 'overtaxations',
 'methylated',
 'jutauaiiu']

In [366]:
words_used = ['thingamajigsawed', 'emblazoners', 'decarboxylated', 'codevelopers', 'afforestations', 'unthinkingly', 'previewers']
tiles_remain = 'aiiiootuuuq'
wordlist_remain3 = cull_wordlist( WORDLIST, tiles_remain )
scores_remain3 = [ wordscore(w) for w in wordlist_remain3 ]

In [368]:
full_optimize( tiles_remain, wordlist_remain3, scores_remain3 )

iquaitoouiu
37.0


In [373]:
bestperm(words_used)

'unthinkinglycodevelopersemblazonersafforestationspreviewersdecarboxylatedthingamajigsawed'

In [407]:
s =  ['codevelopers','unthinkingly',  'emblazoners', 'afforestations','ou', 'previewers', 'qua', 'decarboxylated','o', 'thingamajigsawed','it', 'u','ii']
stmp = ''.join(s)
print(stringscore_filledblanks(stmp, WORDLIST))

1490


In [315]:
s = ['foreshadowing', 'overtaxations', 'u','methylated']
# s = ['foreshadowing', 'u','methylated', 'overtaxations']
stmp = ''.join(s)
print(stringscore_filledblanks(stmp,WORDLIST))

560


In [340]:
s = [
     'jut','emblazoners','cabinetworkingsides','i', 'codevelopers', 'u', 'prequalifying',
     'a','foreshadowing', 'overtaxations', 'a','methylated','ui',
    ]
stmp = ''.join(s)
print(stringscore_filledblanks(stmp, WORDLIST))

1509


In [341]:
sfinal = remove_ss(stmp)

jutemblazonerScabinetworkingsideSicodevelopersuprequalifyingaforeshadowingovertaxationsamethylatedui
1501


In [342]:
isvalid(sfinal)

True

In [188]:
remove_ss(stmp)

codevelopersovertaxationsmethylatedcabinetworkingsideSprequalifyingforeshadowingjutauaiiuemblazonerS
1440


'codevelopersovertaxationsmethylatedcabinetworkingsideSprequalifyingforeshadowingjutauaiiuemblazonerS'

In [468]:
# cull the wordlist (do not use blanks)
wordlist_remaining = cull_wordlist( WORDLIST, tiles_remaining )

# gather the top 200 clusters from each of four highest-scoring tiles 
letts = 'igmfvwy'

clusters2 = dict()
scores2 = dict()
tremain2 = dict()

for lett in letts:
    (ctmp, stmp, ttmp) = eliminate_letter( tiles_remaining, wordlist_remaining, lett, k=500 )
    clusters2[lett] = ctmp
    scores2[lett] = stmp
    tremain2[lett] = ttmp

20 i words
22 g words
25 m words
8 f words
10 v words
11 w words
17 y words


In [469]:
sol2 = knapsack( clusters2, scores2, tiles_remaining)

tiles_used = ''.join( [t[2] for t in sol2] )

tmp = list(tiles_remaining)
for tile in tiles_used:
    tmp.remove(tile)
tiles_remaining2 = ''.join(tmp)
    
print(tiles_remaining2)
sol2

ifv


[(76, 1, 'gamayaw', 78)]

In [472]:
[t[2] for t in sol]

['bluejackets',
 'autobiographically',
 'inadequatenesses',
 'preformatting',
 'deoxidizers',
 'uninformatively',
 'overshadowing']

In [510]:
'wover' in WORDLIST

False

In [474]:
sum( [t[3] for t in sol] )

1413

In [525]:
s = 'bluejackets' + 'owe' + 'deoxidizers' + 'autobiographically' + 'inadequatenesses' + 'preformatting'  + 'uninformatively' + 'overshadowing'
print(stringscore_filledblanks(s, WORDLIST), c)

1417 overshadowing


In [511]:
s = 'bluejackets' + 'ow' + 'overshadowing' + 'e' + 'deoxidizers' + 'uninformatively' + 'autobiographically' + 'inadequatenesses' + 'preformatting'
print(stringscore_filledblanks(s, WORDLIST), c)

1409 overshadowing


In [526]:
best_str = remove_ss(s)

bluejacketsowedeoxidizerSautobiographicallyinadequatenesseSpreformattinguninformativelyovershadowing
1410


In [517]:
%%time
maxscore = 0
wlist = ['bluejackets','owe','deoxidizers','autobiographically','inadequatenesses','preformatting','uninformatively','overshadowing']
# while True:
#     combo = ''.join(np.random.permutation(wlist))
for wl in itertools.permutations(wlist):
    combo = ''.join(wl)
    score = stringscore( combo , WORDLIST )
    if score > maxscore:
        maxscore = score
        print(maxscore)
        print(combo)
        
        scores = [ stringscore(''.join(wl),WORDLIST) for wl in itertools.permutations(wlist) ]

1417
bluejacketsowedeoxidizersautobiographicallyinadequatenessespreformattinguninformativelyovershadowing


KeyboardInterrupt: 

In [524]:
all_tiles = list(TILES[:-2]+'SS')
for t in best_str:
    all_tiles.remove(t)
print(all_tiles)

[]


In [425]:
s =  'emblazoners' + 'unorthodoxygenated' + 'overbejeweled' + 'i' + 'prequalifications' + 'agamayst' + 'outpolitickingfisherS' + 'i' + 'vawardenS' + 'u'
print(s)
stringscore_filledblanks(s, WORDLIST)

emblazonersunorthodoxygenatedoverbejewelediprequalificationsagamaystoutpolitickingfisherSivawardenSu


1396

In [374]:
s =  'emblazoners' + 'unorthodoxygenated' + 'overbejeweled' + 'getaway' + 'S' + 'aramidivan' + 'S' + 'outpolitickingfishers' + 'u' + 'prequalifications'
print(s)
stringscore_filledblanks(s, WORDLIST)

emblazonersunorthodoxygenatedoverbejeweledgetawaySaramidivanSoutpolitickingfishersuprequalifications


1401

In [364]:
s =  'emblazoners' + 'unorthodoxygenated' + 'overbejeweled' + 'getaway' + 'S' + 'aramidivan' + 'S' + 'outpolitickingfishers' + 'u' + 'prequalifications'
print(s)
stringscore_filledblanks(s, WORDLIST)

emblazonersunorthodoxygenatedoverbejeweledgetawaySaramidivanSoutpolitickingfishersuprequalifications


1401

In [331]:
s = 'unorthodoxygenated' + 'overbejeweled' + 'getaway' + 'S' + 'aramidivan' + 'S' + 'emblazoners' + 'outpolitickingfishers' + 'u' + 'prequalifications'
print(s)
stringscore_filledblanks(s, WORDLIST)

unorthodoxygenatedoverbejeweledgetawaySaramidivanSemblazonersoutpolitickingfishersuprequalifications


1400

In [303]:
s = 'CL' + 'overbejeweledunorthodoxygenationsoutpolitickingfishersiprequalifyinguavawaditamaceratedemblazoners'
print(s)
stringscore_filledblanks(s, WORDLIST)

CLoverbejeweledunorthodoxygenationsoutpolitickingfishersiprequalifyinguavawaditamaceratedemblazoners


1399

In [283]:
s = 'CL' + 'overbejeweled' + 'u' + 'prequalifying' + 'outpolitickingfishers' + 'unorthodoxygenations' + 'i' + 'emblazoners'  + 'a' + 'vawadi' + 'tamacerated'
print(s)
stringscore_filledblanks(s, WORDLIST)

CLoverbejeweleduprequalifyingoutpolitickingfishersunorthodoxygenationsiemblazonersavawaditamacerated


1394

In [299]:
s = 'a' + 'vawadi' + 'SC' + 'overbejeweled' + 'u' + 'prequalifying' + 'outpolitickingfishers' + 'unorthodoxygenations' + 'i' + 'emblazoners' + 'tamacerated'
print(s)
stringscore_filledblanks(s, WORDLIST)

avawadiSCoverbejeweleduprequalifyingoutpolitickingfishersunorthodoxygenationsiemblazonerstamacerated


1398

In [260]:
print(TILES)

eeeeeeeeeeeeaaaaaaaaaiiiiiiiiioooooooonnnnnnrrrrrrttttttllllssssuuuuddddgggbbccmmppffhhvvwwyykjxqz__


In [252]:
s = 'L' + 'overbejeweled' + 'prequalifications' + 'unorthodoxygenated' + 'emblazoners' + 'outpolitickingfishers' + 'wagamay' + 'S' + 'univariated' 
print(s)
stringscore_filledblanks(s, WORDLIST)

LoverbejeweledprequalificationsunorthodoxygenatedemblazonersoutpolitickingfisherswagamaySunivariated


1355

In [190]:
s = 'fumedii' + 'televiewing' + 'svarajaywalkers' + 'paradichlorobenzenes' + 't' + 'unorthodoxygenated' + 'U' + 'prequalifications' + 'gumbootie' + 'S'

stringscore_filledblanks(s, WORDLIST)

fumediiteleviewingsvarajaywalkersparadichlorobenzenestunorthodoxygenatedUprequalificationsgumbootieS


1276

In [165]:
s = 'televiewing' + 'svarajaywalkers' + 'paradichlorobenzenes' + 't' + 'unorthodoxygenated' + '_' + 'prequalifications' + 'gumbootie' + '_' + 'fumedii'

In [178]:
stringscore_filledblanks('televiewingsvarajaywalkersparadichlorobenzenestunorthodoxygenatedUprequalificationsgumbootieSfumedii', WORDLIST)

1274

In [97]:
# cull the wordlist (do not use blanks)
tiles = TILES[:-2]
wordlist = cull_wordlist( WORDLIST, tiles )

# gather the top cluster from each of four starting letters, proceeding recursively
# and eliminating letters as we go
letts = ['j','q','x','z','k']

clusters = []
scores = []
tremain = []

for lett in letts:
    (ctmp, stmp, ttmp) = eliminate_letter( tiles, wordlist, lett, k=1 )
    clusters.append(ctmp[0])
    scores.append(stmp[0])
    tremain.append(ttmp[0])
    
    tiles = ttmp[0]
    wordlist = cull_wordlist(wordlist, tiles)

2380 j words
971 q words
2058 x words
1629 z words
790 k words


In [102]:
# gather the top cluster from each of four starting letters, proceeding recursively
# and eliminating letters as we go
letts = ['w','v']

for lett in letts:
    (ctmp, stmp, ttmp) = eliminate_letter( tiles, wordlist, lett, k=1 )
    clusters.append(ctmp[0])
    scores.append(stmp[0])
    tremain.append(ttmp[0])
    
    tiles = ttmp[0]
    wordlist = cull_wordlist(wordlist, tiles)

45 w words
21 v words


In [103]:
clusters

['disjointednesses',
 'prequalifying',
 'neoorthodoxylographical',
 'emblazoned',
 'reembarkingcraft',
 'avoweet',
 'tavatu']

In [106]:
sum(scores)

1184

In [105]:
tremain

['eeeeeeeeeaaaaaaaaaiiiiiiiooooooonnnnrrrrrrtttttlllluuuuddgggbbccmmppffhhvvwwyykxqz',
 'eeeeeeeeaaaaaaaaiiiiiooooooonnnrrrrrtttttllluuuddggbbccmmpfhhvvwwykxz',
 'eeeeeeeaaaaaaiiiioonnrrrttttluuudgbbcmmfvvwwkz',
 'eeeeeaaaaaiiiionrrrttttuuugbcmfvvwwk',
 'eeeaaaiiiotttuuuvvww',
 'eaaiiittuuuvw',
 'eiiiuuw']

In [240]:
%%time
# create graphs to find the possible word clusters using
# all the singleton letters (J,Q,X,Z,K)

lett = 'j'
(jwords,jscores) = getlettwords(WORDLIST,lett)
print(len(jwords), lett, 'words')
Gj = makegraph(jwords,lett)

lett = 'q'
(qwords,qscores) = getlettwords(WORDLIST,lett)
print(len(qwords), lett, 'words')
Gq = makegraph(qwords,lett)

lett = 'x'
(xwords,xscores) = getlettwords(WORDLIST,lett)
print(len(xwords), lett, 'words')
Gx = makegraph(xwords,lett)

lett = 'z'
(zwords,zscores) = getlettwords(WORDLIST,lett)
print(len(zwords), lett, 'words')
Gz = makegraph(zwords,lett)

lett = 'k'
(kwords,kscores) = getlettwords(WORDLIST,lett)
print(len(kwords), lett, 'words')
Gk = makegraph(kwords,lett)

2429 j words
2519 q words
4578 x words
6627 z words
11995 k words
Wall time: 3min 56s


In [242]:
%%time
# retrieve word clusters and their scores

j_scores,j_words,j_unions = getbestcliques(Gj,'j')
q_scores,q_words,q_unions = getbestcliques(Gq,'q')
x_scores,x_words,x_unions = getbestcliques(Gx,'x')
z_scores,z_words,z_unions = getbestcliques(Gz,'z')
k_scores,k_words,k_unions = getbestcliques(Gk,'k')

Wall time: 6min 38s


In [272]:
Nmeta = 500
tot_lettercount = np.array( [ [TILES.count(lett) for lett in ALPHABET] ] )
j_lettercount   = np.array( [ [w.count(lett) for lett in ALPHABET] for w in j_unions[:Nmeta] ] )
q_lettercount   = np.array( [ [w.count(lett) for lett in ALPHABET] for w in q_unions[:Nmeta] ] )
x_lettercount   = np.array( [ [w.count(lett) for lett in ALPHABET] for w in x_unions[:Nmeta] ] )
z_lettercount   = np.array( [ [w.count(lett) for lett in ALPHABET] for w in z_unions[:Nmeta] ] )
k_lettercount   = np.array( [ [w.count(lett) for lett in ALPHABET] for w in k_unions[:Nmeta] ] )

metawords = np.array([j_unions[:Nmeta] + q_unions[:Nmeta] + x_unions[:Nmeta] + z_unions[:Nmeta] + k_unions[:Nmeta]])
metalettcounts = np.vstack( [j_lettercount, q_lettercount, x_lettercount, z_lettercount, k_lettercount] )
metacounts = np.array([j_scores[:Nmeta] + q_scores[:Nmeta] + x_scores[:Nmeta] + z_scores[:Nmeta] + k_scores[:Nmeta]])

In [273]:
truemetacounts = np.array([[ stringscore(mw,WORDLIST) for mw in metawords[0] ]])

In [270]:
# Solve generalized knapsack problem to find highest-scoring combinations of letters
# that use the singleton tiles (J,Q,X,Z,K)
m = Model("scrabble")
m.setParam( 'OutputFlag', False )

M = metawords.shape[1]
L = 26

# Create variables (how many of each meta-word to use).
w = m.addVars(M, vtype=GRB.INTEGER, lb=0, name="w")

# total number of words used and number of each letter used
tot_score = quicksum(w[i]*truemetacounts[0,i] for i in range(M))
letters_used = [ quicksum(metalettcounts[i,j]*w[i] for i in range(M)) for j in range(L) ]

# Set objective (minimize number of words used)
m.setObjective( tot_score, GRB.MAXIMIZE)

# Constraint: must use as many letters as we have tiles of each sort
m.addConstrs(  (letters_used[j] <= tot_lettercount[0,j] for j in range(L) ) )

m.optimize()

sol = [(ix,int(v.x),metawords[0,ix], truemetacounts[0,ix]) for ix,v in enumerate(m.getVars()) if v.x > 0 ]

In [274]:
print( "total score =", sum( item[3] for item in sol ) )
sol

total score = 1129


[(186, 1, 'hadjointworms', 168),
 (213, 1, 'prequalifying', 210),
 (511, 1, 'decarboxylated', 245),
 (632, 1, 'emblazoners', 209),
 (839, 1, 'outpolitickingfishers', 297)]

In [280]:
# assuming we will use the above list of words, what tiles are left over?
tiles_used = ''.join( s[2] for s in sol )
print(tiles_used)
tmp = list(TILES)
for z in tiles_used:
    tmp.remove(z)
tiles_remaining = ''.join(tmp)
print(tiles_remaining)

hadjointwormsprequalifyingdecarboxylatedemblazonersoutpolitickingfishers
eeeeeeaaaaiioonnrttuudgvvw__


In [306]:
# what words are leftover from the wordlist using only these tiles?
wordlist_remaining = cull_wordlist( WORDLIST, tiles_remaining )
N_remaining = len(wordlist_remaining)
print('The word list contains', N_remaining, 'words.')

The word list contains 56548 words.


In [320]:
# what words are leftover from the wordlist using only these tiles?
wordlist_remaining_noblanks = cull_wordlist( WORDLIST, tiles_remaining[:-2] )
N_remaining_noblanks = len(wordlist_remaining_noblanks)
scores_remaining_noblanks = [ wordscore(w) for w in wordlist_remaining_noblanks ]
print('The word list (blanks excluded) contains', N_remaining_noblanks, 'words.')

The word list (blanks excluded) contains 3199 words.


In [344]:
# now we must figure out what to do with the remaining letters.

# find which letters are present and how many of each we have
lett_present = [ lett for lett in ALPHABET if lett in tiles_remaining ]
lett_counts_remaining = [ tiles_remaining.count(lett) for lett in lett_present ]
print(lett_present)
print(lett_counts_remaining)

['a', 'd', 'e', 'g', 'i', 'n', 'o', 'r', 't', 'u', 'v', 'w']
[4, 1, 6, 1, 2, 2, 2, 1, 2, 2, 2, 1]


In [None]:
%%time
# let's do the best we can using the single W:
# create graphs to find the possible word clusters using W

lett = 'w'
(wwords,wscores) = getlettwords(wordlist_remaining_noblanks,lett)
print(len(wwords), lett, 'words')
Gw = makegraph(wwords,lett)
w_scores,w_words,w_unions = getbestcliques(Gw,'w')


In [None]:
#### WORKING ON THIS ! ###

In [388]:
Z = np.array( [ v.x for ix,v in enumerate(mod.getVars()) if ix < d*p ] ).reshape((d,p))
winstr = ''.join( [ [lett_present[ix] for ix,z in enumerate(Z[:,j]) if z][0] for j in range(p) ] )
print(winstr)
print(stringscore(winstr,wordlist_remaining_noblanks))

uoadnationieeeeaavrvtwgeue
31


In [310]:
z = np.array( [[1,0,0,0,1,0],[0,1,0,0,0,0],[0,0,0,1,0,1],[0,0,1,0,0,0]] )
t = np.array( [[0,1],[0,0],[1,0],[0,0]] )
print(z)
print(t)

[[1 0 0 0 1 0]
 [0 1 0 0 0 0]
 [0 0 0 1 0 1]
 [0 0 1 0 0 0]]
[[0 1]
 [0 0]
 [1 0]
 [0 0]]


In [316]:
j=0
[ np.sum( z[:,j:j+2] * t ) for j in range(0,5) ]

[0, 0, 0, 2, 0]

In [547]:
import itertools

In [550]:
high_roller_wordlist = [ s[2] for s in sol ]

In [555]:
bestss = ''
bestsc = 0
for ss in itertools.permutations(high_roller_wordlist):
    sss = ''.join(ss)
    ssc = stringscore(sss)
    if ssc > bestsc:
        bestsc = ssc
        bestss = sss
print(bestss)
print(bestsc)

prequalifyingemblazonershadjointwormsoutpolitickingfishersdecarboxylated
1141


In [536]:
# remaining letters after high-rolling words used
lett_remain = list(letterlist)
for z in high_rollers:
    lett_remain.remove(z)

''.join(lett_remain)

'eeeaaaaaiiiiiirttllubpvw__'

In [557]:
# find words that will never be usable because there does not exist enough of the right letters
remain_impossible_words = []
remain_lettercount = np.array( [ [lett_remain.count(lett) for lett in alphabet] ] )

In [570]:
for i in range(N):
    if sum( [j for j in remain_lettercount[0]-wordlist_lettercount[i] if j <= 0] ) < -2:
        remain_impossible_words.append( wordlist[i] )
remain_wordlist = list( set(wordlist) - set(remain_impossible_words) )

len(remain_wordlist)

33107

In [294]:
%%time
random.seed(0)
s = TILES[:-2]
n = len(s)
ss = ''.join(random.sample(s,n))

for j in range(5):
    print(ss)
    print(stringscore(ss,WORDLIST))
    swaps = [ stringscore( ss[:i] + s[i+1:] + ss[i], WORDLIST) for i in range(n) ]
    ix = np.argmax(swaps)
    ss = ss[:ix] + s[ix+1:] + ss[ix]

rteoustnsrgiuaofamodcanpensvvtniwljeylodexrqidtgulboiaaedfargoihzncyopiioleieaubhremntsatkweareeei
182
rteoustnsrgiuaofamodcanpensvvtniwljeylodexrqidtgulboiaaedfargoihzncyopiioleieaubhremntsatkweareeze
182
rteoustnsrgiuaofamodcanpensvvtniwljeylodexrqidtgulboiaaedfargoihzncyopiioleieaubhremntsatkweareezz
182
rteoustnsrgiuaofamodcanpensvvtniwljeylodexrqidtgulboiaaedfargoihzncyopiioleieaubhremntsatkweareezz
182
rteoustnsrgiuaofamodcanpensvvtniwljeylodexrqidtgulboiaaedfargoihzncyopiioleieaubhremntsatkweareezz
182
Wall time: 14.7 s


In [318]:
%%time
random.seed(0)
s = tiles_remaining[:-2]
n = len(s)
ss = ''.join(random.sample(s,n))

for j in range(10):
    print(ss)
    print(stringscore(ss,wordlist_remaining_noblanks))
    swaps = [ stringscore( ss[:i] + s[i+1:] + ss[i], wordlist_remaining_noblanks) for i in range(n) ]
    ix = np.argmax(swaps)
    ss = ss[:ix] + s[ix+1:] + ss[ix]

ovoearnwauitedeeugiaeatven
42
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
ovoearnwauitedeeugiaeatvwe
45
Wall time: 249 ms


In [183]:
%timeit stringscore('raulasrakvqhncd_epoegecitiofrpaelilhsaoatjayabnatburzitsiwgrerooxgfdnesoivilnmdnuyomteeiwtnoeueeied', WORDLIST)

47.4 ms ± 9.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [180]:
%timeit stringscore_filledblanks('raulasrakvqhncdEepoegecitiofrpaelilhsaoatjayabnatburzitsiwgrerooxgfdnesoivilnmdnuyomteeiwtnoeueeied', WORDLIST)

47.2 ms ± 3.39 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [182]:
%timeit stringscore_bestblanks('raulasrakvqhncd_epoegecitiofrpaelilhsaoatjayabnatburzitsiwgrerooxgfdnesoivilnmdnuyomteeiwtnoeueeied', WORDLIST)

1.53 s ± 123 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
