# **Scrabble superstring**
## [Riddler Classic, Jun. 28, 2019](https://fivethirtyeight.com/features/whats-your-best-scrabble-string/)

### solution by [Laurent Lessard](https://laurentlessard.com)

---

## Some preliminaries

Here I tabulate the tile distribution and collect the list of admissible words. I used the [ENABLE word list](https://norvig.com/ngrams/enable1.txt), as instructed in the problem statement.

In [408]:
import numpy as np
import pandas as pd
import random
from gurobipy import *
import networkx as nx
import matplotlib.pyplot as plt

In [351]:
"""
0: ?×2
1: E×12 A×9 I×9 O×8 N×6 R×6 T×6 L×4 S×4 U×4
2: D×4 G×3
3: B×2 C×2 M×2 P×2
4: F×2 H×2 V×2 W×2 Y×2
5: K
8: J X
10: Q Z
""";

In [352]:
# Make a list of all the letters
letterlist = 12*'e' + 9*'a' + 9*'i' + 8*'o' + 6*'n' + 6*'r' + 6*'t' + 4*'l' + 4*'s' \
            + 4*'u' + 4*'d' + 3*'g' + 2*'b' + 2*'c' + 2*'m' + 2*'p' + 2*'f' + 2*'h' \
            + 2*'v' + 2*'w' + 2*'y' + 'kjxqz' + 2*'_'

In [353]:
# How much is each letter worth, and how much is a word worth?
scoredic = dict()
for lett in '_':
    scoredic[lett] = 0
for lett in 'eaionrtlsu':
    scoredic[lett] = 1
for lett in 'dg':
    scoredic[lett] = 2
for lett in 'bcmp':
    scoredic[lett] = 3
for lett in 'fhvwy':
    scoredic[lett] = 4
for lett in 'k':
    scoredic[lett] = 5
for lett in 'jx':
    scoredic[lett] = 8
for lett in 'qz':
    scoredic[lett] = 10
    
def wordscore(word):
    """
    Evaluate the Scrabble score of a given word
    """
    return sum( [scoredic[lett] for lett in word] )

In [354]:
# Gather the list of legal words. There are N words total
f = open("enable1.txt","r")
wordlist = sorted( [ w[:-1] for w in f ], key=wordscore, reverse=True )
N = len(wordlist)
print('The word list contains', N, 'words.')

The word list contains 172820 words.


In [409]:
# create array that is N x L of letter occurences for each legal word
alphabet = 'abcdefghijklmnopqrstuvwxyz'
wordlist_lettercount = np.array( [ [w.count(lett) for lett in alphabet] for w in wordlist ] )
tot_lettercount = np.array( [ [letterlist.count(lett) for lett in alphabet] ] )

In [421]:
# find words that will never be usable because there does not exist enough of the right letters
impossible_words = []
for i in range(N):
    if sum( [j for j in tot_lettercount[0]-wordlist_lettercount[i] if j <= 0] ) < -2:
        impossible_words.append( wordlist[i] )
        
print(impossible_words)

['razzamatazzes', 'razzamatazz', 'knickknacks', 'expressionlessnesses', 'possessivenesses', 'possessednesses', 'classlessnesses', 'resistlessnesses', 'stresslessnesses', 'senselessnesses', 'stresslessness']


In [423]:
# update the wordlist to only contain possible words
wordlist = [word for word in wordlist if word not in impossible_words ]

In [355]:
def stringscore(ss):
    """
    Evaluate the total score of a superstring
    """
    scoretot = 0
    for word in wordlist:
        if word in ss:
            scoretot += wordscore(word)
    return scoretot

def dotheymatch(w1,w2,lett):
    """
    Returns True if words w1 and w2 can both match while sharing letter lett.
    We assume w1 and w2 each contain lett exactly once.
    """
    i1 = w1.find(lett)
    i2 = w2.find(lett)
    match_right = (w1[i1:] in w2) or (w2[i2:] in w1)
    match_left  = (w1[:i1+1] in w2) or (w2[:i2+1] in w1)
    return match_right and match_left

In [356]:
# get list of all words containint a particular letter
def getlettwords(lett):
    lwords = []
    lwordscores = []
    for w in wordlist:
        if w.count(lett) == 1:
            lwords.append(w)
            lwordscores.append(wordscore(w))
    return (lwords,lwordscores)

def makegraph(lwords,lett):
    n = len(lwords)
    elist = [ (i,j) for i in range(n) for j in range(i) if dotheymatch(lwords[i],lwords[j],lett) ]
    return nx.Graph(elist)

def getbestcliques(G,lwords,lscores,lett):
    cliques = nx.find_cliques(G)
    cliquescores = []
    cliquewords = []
    cliqueunion = []
    for (i,c) in enumerate(cliques):
        wrds = [ lwords[j] for j in c ]
        cliquewords.append( wrds )
        cliquescores.append( sum( [wordscore(w) for w in wrds] ) )
        cliqueunion.append( unionize(wrds,lett) )
    isrt = np.argsort(cliquescores)[::-1]
    return [ cliquescores[i] for i in isrt ], [ cliquewords[i] for i in isrt ], [ cliqueunion[i] for i in isrt ]
        
def unionize(lwords,lett):
    leftwords = []
    rightwords = []
    for w in lwords:
        ix = w.find(lett)
        leftwords.append( w[:ix] )
        rightwords.append( w[ix+1:] )
    leftwords.sort(key=len)
    rightwords.sort(key=len)
    return leftwords[-1] + lett + rightwords[-1]

In [386]:
## VERY TIME CONSUMING

# find all the J-words
lett = 'j'
(jwords,jscores) = getlettwords(lett)
print(len(jwords), lett, 'words')
Gj = makegraph(jwords,lett)

# find all the Q-words
lett = 'q'
(qwords,qscores) = getlettwords(lett)
print(len(qwords), lett, 'words')
Gq = makegraph(qwords,lett)

# find all the X-words
lett = 'x'
(xwords,xscores) = getlettwords(lett)
print(len(xwords), lett, 'words')
Gx = makegraph(xwords,lett)

# find all the Z-words
lett = 'z'
(zwords,zscores) = getlettwords(lett)
print(len(zwords), lett, 'words')
Gz = makegraph(zwords,lett)

2429 j words
2519 q words
4578 x words
6627 z words


In [460]:
# find all the K-words
lett = 'k'
(kwords,kscores) = getlettwords(lett)
print(len(kwords), lett, 'words')
Gk = makegraph(kwords,lett)

11995 k words


In [462]:
j_scores,j_words,j_unions = getbestcliques(Gj,jwords,jscores,'j')
q_scores,q_words,q_unions = getbestcliques(Gq,qwords,qscores,'q')
x_scores,x_words,x_unions = getbestcliques(Gx,xwords,xscores,'x')
z_scores,z_words,z_unions = getbestcliques(Gz,zwords,zscores,'z')
k_scores,k_words,k_unions = getbestcliques(Gk,kwords,kscores,'k')

In [538]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'
Nmeta = 200
tot_lettercount = np.array( [ [letterlist.count(lett) for lett in alphabet] ] )
j_lettercount   = np.array( [ [w.count(lett) for lett in alphabet] for w in j_unions[:Nmeta] ] )
q_lettercount   = np.array( [ [w.count(lett) for lett in alphabet] for w in q_unions[:Nmeta] ] )
x_lettercount   = np.array( [ [w.count(lett) for lett in alphabet] for w in x_unions[:Nmeta] ] )
z_lettercount   = np.array( [ [w.count(lett) for lett in alphabet] for w in z_unions[:Nmeta] ] )
k_lettercount   = np.array( [ [w.count(lett) for lett in alphabet] for w in k_unions[:Nmeta] ] )

metawords = np.array([j_unions[:Nmeta] + q_unions[:Nmeta] + x_unions[:Nmeta] + z_unions[:Nmeta] + k_unions[:Nmeta]])
metalettcounts = np.vstack( [j_lettercount, q_lettercount, x_lettercount, z_lettercount, k_lettercount] )
metacounts = np.array([j_scores[:Nmeta] + q_scores[:Nmeta] + x_scores[:Nmeta] + z_scores[:Nmeta] + k_scores[:Nmeta]])

In [539]:
truemetacounts = np.array([[ stringscore(mw) for mw in metawords[0] ]])

In [540]:
print( tot_lettercount.shape )
print( metalettcounts.shape )
print( metawords.shape )
print( metacounts.shape )
print( truemetacounts.shape )

(1, 26)
(1000, 26)
(1, 1000)
(1, 1000)
(1, 1000)


In [541]:
m = Model("scrabble")
M = metawords.shape[1]
L = 26

# Create variables (how many of each meta-word to use).
w = m.addVars(M, vtype=GRB.INTEGER, lb=0, name="w")

# total number of words used and number of each letter used
tot_score = quicksum(w[i]*truemetacounts[0,i] for i in range(M))
letters_used = [ quicksum(metalettcounts[i,j]*w[i] for i in range(M)) for j in range(L) ]

# Set objective (minimize number of words used)
m.setObjective( tot_score, GRB.MAXIMIZE)

# Constraint: must use as many letters as we have tiles of each sort
m.addConstrs(  (letters_used[j] <= tot_lettercount[0,j] for j in range(L) ) )

m.optimize()

sol = [(ix,int(v.x),metawords[0,ix]) for ix,v in enumerate(m.getVars()) if v.x > 0 ]

Optimize a model with 26 rows, 1000 columns and 12110 nonzeros
Variable types: 0 continuous, 1000 integer (0 binary)
Coefficient statistics:
  Matrix range     [1e+00, 8e+00]
  Objective range  [1e+02, 4e+02]
  Bounds range     [0e+00, 0e+00]
  RHS range        [1e+00, 1e+01]
Found heuristic solution: objective 760.0000000
Presolve removed 0 rows and 186 columns
Presolve time: 0.06s
Presolved: 26 rows, 814 columns, 9881 nonzeros
Variable types: 0 continuous, 814 integer (814 binary)

Root relaxation: objective 1.174305e+03, 59 iterations, 0.00 seconds

    Nodes    |    Current Node    |     Objective Bounds      |     Work
 Expl Unexpl |  Obj  Depth IntInf | Incumbent    BestBd   Gap | It/Node Time

     0     0 1174.30508    0   13  760.00000 1174.30508  54.5%     -    0s
H    0     0                    1094.0000000 1174.30508  7.34%     -    0s
     0     0 1172.13333    0   15 1094.00000 1172.13333  7.14%     -    0s
H    0     0                    1129.0000000 1172.13333  3.82%   

In [542]:
sol

[(190, 1, 'hadjointworms'),
 (213, 1, 'prequalifying'),
 (507, 1, 'decarboxylated'),
 (631, 1, 'emblazoners'),
 (840, 1, 'outpolitickingfishers')]

In [547]:
import itertools

In [550]:
high_roller_wordlist = [ s[2] for s in sol ]

In [555]:
bestss = ''
bestsc = 0
for ss in itertools.permutations(high_roller_wordlist):
    sss = ''.join(ss)
    ssc = stringscore(sss)
    if ssc > bestsc:
        bestsc = ssc
        bestss = sss
print(bestss)
print(bestsc)

prequalifyingemblazonershadjointwormsoutpolitickingfishersdecarboxylated
1141


In [536]:
# remaining letters after high-rolling words used
lett_remain = list(letterlist)
for z in high_rollers:
    lett_remain.remove(z)

''.join(lett_remain)

'eeeaaaaaiiiiiirttllubpvw__'

In [557]:
# find words that will never be usable because there does not exist enough of the right letters
remain_impossible_words = []
remain_lettercount = np.array( [ [lett_remain.count(lett) for lett in alphabet] ] )

In [570]:
for i in range(N):
    if sum( [j for j in remain_lettercount[0]-wordlist_lettercount[i] if j <= 0] ) < -2:
        remain_impossible_words.append( wordlist[i] )
remain_wordlist = list( set(wordlist) - set(remain_impossible_words) )

len(remain_wordlist)

33107

In [459]:
%%time

s = letterlist[:-2]
ss = ''.join(random.sample(s,len(s)))
print(ss)
print(stringscore(ss))

ateaniovlotgthfyhaaqeecreamnysotetoxeoanndfsceskriniureebirbwjilepgadierludamonouzwiuetplivadgrsio
130
Wall time: 68 ms
