# PREPARE DATA

In [75]:
import pandas as pd

In [6]:
words_freq = pd.read_csv("./unigram_freq.csv")
words_type = pd.read_csv("./words_pos.csv", index_col=0)

In [7]:
words_freq.head()

Unnamed: 0,word,count
0,the,23135851162
1,of,13151942776
2,and,12997637966
3,to,12136980858
4,a,9081174698


In [8]:
words_type.head()

Unnamed: 0,word,pos_tag
0,aa,NN
1,aaa,NN
2,aah,NN
3,aahed,VBN
4,aahing,VBG


In [9]:
words = pd.merge(words_freq, words_type, on='word', how='outer')
words.head()

Unnamed: 0,word,count,pos_tag
0,a,9081175000.0,
1,aa,30523330.0,NN
2,aaa,10243980.0,NN
3,aaaa,1595769.0,
4,aaaah,52821.0,


Unnamed: 0,word,count,pos_tag,letter_count
0,a,9.081175e+09,,1.0
1,aa,3.052333e+07,NN,2.0
2,aaa,1.024398e+07,NN,3.0
3,aaaa,1.595769e+06,,4.0
4,aaaah,5.282100e+04,,5.0
...,...,...,...,...
607129,zzzk,1.556500e+04,,4.0
607130,zzzt,2.248700e+04,,4.0
607131,zzzz,3.625200e+05,,4.0
607132,,3.073916e+07,,


In [66]:
words["letter_count"] = words.word.str.len()
used_words = words[words["count"] > 10000000]
words_wordle = used_words[(used_words.letter_count >= 4) & (used_words.letter_count <= 8)]
words_wordle


Unnamed: 0,word,count,pos_tag,letter_count
228,aaron,1.169878e+07,NN,5.0
1029,ability,5.200429e+07,NN,7.0
1163,able,1.093890e+08,JJ,4.0
1423,about,1.226734e+09,IN,5.0
1431,above,1.418946e+08,IN,5.0
...,...,...,...,...
604479,zealand,4.167737e+07,NN,7.0
604943,zero,3.073541e+07,NN,4.0
605353,zimbabwe,1.045920e+07,NN,8.0
605997,zone,4.689737e+07,NN,4.0


In [68]:
words_wordle.pos_tag.value_counts(dropna=False)

pos_tag
NN      2410
NNS      750
JJ       279
VBN      193
VBG      166
NaN      121
RB       106
VBD       44
IN        43
VB        31
VBZ       19
JJS       13
JJR       11
DT        10
CD         8
MD         7
PRP        6
RBR        4
PRP$       2
WP         2
WDT        2
WRB        2
CC         1
WP$        1
Name: count, dtype: int64

NN + from 1_000_000

JJ + from 1_000_000

NaN + need manual filtration

RB +

VB +

CD +

In [77]:
words_wordle[words_wordle.pos_tag == "NN"].letter_count.value_counts()

letter_count
6.0    558
4.0    547
5.0    537
7.0    454
8.0    314
Name: count, dtype: int64

# SPLIT INTO GROUPS

## Plane difficulty

In [None]:
common_nouns = ((words.pos_tag == "NN") & (words["count"] > 1_000_000))
length_4_5 = ((words.letter_count == 4) | (words.letter_count == 5))

In [166]:
common_nouns = ((words.pos_tag == "NN") & (words["count"] > 5_000_000))
length_4_5 = ((words.letter_count == 4) | (words.letter_count == 5))
words_plane = words[common_nouns & length_4_5]
words_plane

Unnamed: 0,word,count,pos_tag,letter_count
228,aaron,11698784.0,NN,5.0
1977,abuse,36269685.0,NN,5.0
3161,acer,8160550.0,NN,4.0
3776,acid,30867887.0,NN,4.0
4030,acne,6215449.0,NN,4.0
...,...,...,...,...
603521,yukon,6061476.0,NN,5.0
604943,zero,30735412.0,NN,4.0
605387,zinc,5671140.0,NN,4.0
605997,zone,46897368.0,NN,4.0


In [167]:
prob = 1
num = words_plane.shape[0]
for i in range(100):
    prob *= 1 - (i + 1) / num
"{%.5f}" % (prob * 100)

'{4.60927}'

In [163]:
common_nouns = ((words.pos_tag == "NN") & (words["count"] > 10_000_000))
common_adjectives = ((words.pos_tag == "JJ") & (words["count"] > 1_000_000))
common_adverbs = ((words.pos_tag == "RB") & (words["count"] > 100_000))
common_verbs = ((words.pos_tag == "VB") & (words["count"] > 10_000))
length_5_6 = ((words.letter_count == 5) | (words.letter_count == 6))
words_hill = words[length_5_6 & (common_nouns | common_adjectives | common_adverbs | common_verbs)]
words_hill

Unnamed: 0,word,count,pos_tag,letter_count
228,aaron,11698784.0,NN,5.0
1583,abroad,14837382.0,RB,6.0
1977,abuse,36269685.0,NN,5.0
2408,accept,43171429.0,NN,6.0
2459,access,217986984.0,NN,6.0
...,...,...,...,...
601826,yearly,5822036.0,RB,6.0
601963,yellow,82024459.0,NN,6.0
602351,yield,15022494.0,NN,5.0
603037,young,136341684.0,JJ,5.0


In [185]:
common_nouns = ((words.pos_tag == "NN") & (words["count"] > 1_000_000))
common_adjectives = ((words.pos_tag == "JJ") & (words["count"] > 100_000))
common_adverbs = ((words.pos_tag == "RB") & (words["count"] > 10_000))
common_verbs = ((words.pos_tag == "VB") & (words["count"] > 10_000))
other_types = ((words.pos_tag != "NN") & (words.pos_tag != "JJ") & (words.pos_tag != "RB") & (words.pos_tag != "VB") & (words["count"] > 10_000_000))
length_6_8 = ((words.letter_count >= 6) & (words.letter_count <= 8))
words_mountain = words[length_6_8 & (common_nouns | common_adjectives | common_adverbs | common_verbs | other_types)]
words_mountain

Unnamed: 0,word,count,pos_tag,letter_count
338,abacus,1017068.0,NN,6.0
389,abandon,2747961.0,NN,7.0
580,abbott,3143893.0,NN,6.0
791,abelian,531258.0,JJ,7.0
828,aberdeen,5660658.0,NN,8.0
...,...,...,...,...
605961,zombie,4406565.0,NN,6.0
605982,zonally,21708.0,RB,7.0
606189,zoology,1672374.0,NN,7.0
606250,zoonotic,112699.0,JJ,8.0


In [196]:
def words_save_to_file(file_name, ds):
    with open(file_name, "w") as f:
        f.write("")
    with open(file_name, "a") as f:
        for word in ds["word"]:
            f.write(word + "\n")

In [210]:
words_save_to_file("words_plane", words_plane)
words_save_to_file("words_hill", words_hill)
words_save_to_file("words_mountain", words_mountain)
# kinda fucked up
words_save_to_file("dictionary", words[(words.letter_count >= 4) & (words.letter_count <= 8)])