In [1]:
import requests, math
import pandas as pd

In [2]:
# Download the word list and split the response body on whitespace.
WORDLIST_URL = 'https://raw.githubusercontent.com/derekchuank/high-frequency-vocabulary/master/30k.txt'
response = requests.get(WORDLIST_URL, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page as vocabulary.
response.raise_for_status()
words = pd.Series(response.text.split())

In [3]:
# Keep only the entries that are exactly five characters long.
has_five_letters = words.apply(len) == 5
words5 = words[has_five_letters]

Common 5-letter words:

In [4]:
# Preview the first five entries of the filtered list.
words5.head(n=5)

35    about
45    other
56    which
57    their
62    there
dtype: object

In [5]:
# Column i holds the relative frequency of each letter at position i across
# the five-letter words. A letter never seen at a position is missing from that
# column's counts, so aligning the columns leaves NaN there; an unobserved
# letter/position pairing has frequency 0, so fill those cells explicitly.
letter_position_freq = pd.DataFrame({
    i: words5.apply(lambda word: word[i]).value_counts(normalize=True)
    for i in range(5)
}).fillna(0)

Frequency of each letter at each position (rows are letters, columns are positions 0–4), sorted by first-position frequency:

In [6]:
# Rank letters by how often they appear in the first position (column 0).
first_position_ranking = letter_position_freq.sort_values(by=0, ascending=False)
first_position_ranking.head()

Unnamed: 0,0,1,2,3,4
s,0.118569,0.01403,0.041265,0.047043,0.222558
c,0.08088,0.01458,0.033287,0.048143,0.011554
b,0.065475,0.005502,0.024209,0.016781,0.003026
t,0.064924,0.025309,0.058047,0.072627,0.062724
p,0.062999,0.017607,0.020633,0.021458,0.012105


Overall letter frequencies across all 5-letter words:

In [8]:
# Flatten every five-letter word into its individual characters, then take
# each letter's share of all characters.
all_letters = [letter for word in words5.tolist() for letter in word]
word_freqs = pd.Series(all_letters).value_counts(normalize=True)
word_freqs.head()

e    0.108281
a    0.093205
s    0.088693
r    0.070426
o    0.068336
dtype: float64

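We define the following scoring system: a letter in the correct position is worth 1 point, and a letter present anywhere in the word is worth 0.5 points. Weighting these points by the frequencies computed above, a word's score is the sum, over its letters, of the letter's frequency at that position plus 0.5 times its overall frequency: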

In [12]:
def compute_score(word):
    """Score a candidate word from the letter-frequency tables.

    Each letter contributes its frequency at that position (looked up in
    ``letter_position_freq``) plus half of its overall frequency (looked up
    in ``word_freqs``). A repeated letter contributes once per occurrence.

    Parameters
    ----------
    word : str
        Candidate word. Every character must be a row label of both tables,
        and every position a column of ``letter_position_freq``.

    Returns
    -------
    float
        Sum of the per-letter contributions (``0`` for an empty word).

    Raises
    ------
    KeyError
        If a letter or position is absent from the frequency tables.
    """
    score = 0
    for pos, letter in enumerate(word):
        positional = letter_position_freq.loc[letter, pos]
        # A letter never observed at this position may be stored as NaN in
        # the positional table; count it as frequency 0 so a single unseen
        # pairing does not turn the whole score into NaN.
        if pd.isna(positional):
            positional = 0.0
        score += positional + 0.5 * word_freqs.loc[letter]
    return score
# Show the score of a few sample opening guesses.
for sample in ('house', 'adieu', 'table'):
    print(f"{sample}: {compute_score(sample):.3f}")

house: 0.577
adieu: 0.506
table: 0.626


In [11]:
# Score every five-letter word and list the 30 highest-scoring ones.
scores = words5.apply(compute_score)
(
    words5.to_frame('word')
    .assign(score=scores)
    .sort_values(by='score', ascending=False)
    .head(30)
    .set_index('word')
)

Unnamed: 0_level_0,score
word,Unnamed: 1_level_1
sales,0.968088
sores,0.961596
cares,0.937799
canes,0.925722
saves,0.904264
lanes,0.903961
ceres,0.901045
cases,0.89879
tales,0.898459
cores,0.898404
