In [1]:
import preprocessing

In [2]:
# Load the 1990 korpus. Only the second of the three returned values is kept; `sentences` is
# later iterated as a collection of token sequences.
# NOTE(review): the meaning of the two discarded return values is not visible from this notebook.
_, sentences, _ = preprocessing.extract_sentences_from_raw_txt(skip_no_comma=False, korpus="1990")

Extracting sentences from raw data in korpus 1990


In [3]:
# Flatten the per-sentence token sequences into one long list of tokens.
all_tokens = [token for sentence in sentences for token in sentence]

# Show the first 100 tokens as a quick visual check.
all_tokens[:100]

['stå',
 'JuniBevægelsen',
 'for',
 'en',
 'alternativ',
 'inden',
 'for',
 'EF',
 'medlemskab',
 ',',
 'eller',
 'være',
 'realitet',
 'ikke',
 ',',
 'at',
 'JuniBevægelsens',
 'lede',
 'kraft',
 'ønske',
 'Danmark',
 'ud',
 'af',
 'EF',
 ',',
 'men',
 'undlade',
 'at',
 'sige',
 'det',
 'af',
 'taktisk',
 'grund',
 'minut',
 'dessert',
 'smelte',
 'på',
 'tunge',
 ',',
 'men',
 'den',
 'stor',
 'stykke',
 'valnøddefragilité',
 ',',
 'anrette',
 'i',
 'en',
 'abrikoscouli',
 ',',
 'der',
 'overdøve',
 'smag',
 'af',
 'kirsch',
 ',',
 'være',
 'en',
 'kaloriebombe',
 'så',
 'stor',
 'og',
 'mætte',
 ',',
 'at',
 'jeg',
 'måtte',
 'levne',
 'den',
 'blot',
 'mistanke',
 'om',
 'forurening',
 'udgøre',
 'den',
 'tilstrækkelig',
 'retfærdiggørelse',
 'for',
 'ny',
 'felttog',
 'mod',
 'næringsliv',
 'men',
 'sådan',
 'kunne',
 'man',
 'jo',
 'også',
 'indrette',
 'indkomstskat',
 ',',
 'hvis',
 'man',
 'ville',
 'denne',
 'urimelighed',
 'kunne',
 'få',
 'ungarer',
 'til']

In [4]:
# Vocabulary size: building a set removes repetitions. Non-word tokens such as "," are counted too.
print("Number of unique lemmatized words: {}".format(len(set(all_tokens))))

Number of unique lemmatized words: 493668


In [5]:
# Total number of tokens in the korpus, repetitions included.
len(all_tokens)

33983300

In [6]:
import pandas as pd

# Count how often every token occurs; the recorded output shows the counts sorted from most
# to least frequent.
df_dic = dict(token=all_tokens)
df = pd.DataFrame(df_dic)
value_count = df.token.value_counts()

In [8]:
# The five most frequent tokens.
value_count.head(5)

,       2117740
og      1000890
være     920798
i        918440
en       774947
Name: token, dtype: int64

In [9]:
# Number of occurrences covered by the 50,000 most frequent tokens.
value_count.head(50000).sum()

32940014

I want to figure out how many of the most frequent words must be included in a list in order to cover 90 % of all word occurrences in the korpus.

In [27]:
# Number of token occurrences that corresponds to 90 % of the whole korpus.
explain = value_count.sum() * 0.9 # what is 90 % of all words?
explain

30584970.0

In [30]:
# How many of the most frequent unique words do I need to cover a given share of all words?
def get_idx(percentage, counts=None):
    """Return the position of the last entry needed to cover `percentage` of all occurrences.

    Walks through the counts in their stored order and accumulates them until the running
    total reaches `percentage` of the grand total.

    percentage (float) : Share of all word occurrences to cover. Between 0 and 1.
    counts (pandas.Series, optional) : Occurrence counts, expected to be ordered from most to
        least frequent. Defaults to the notebook-level `value_count`.

    Returns the zero-based position at which the target is reached (so position + 1 entries
    are needed), or None if `counts` is empty.
    Raises ValueError if `percentage` is not within [0, 1].
    """
    if counts is None:
        counts = value_count
    # Fail loudly on an impossible target instead of silently falling through to None.
    if not 0 <= percentage <= 1:
        raise ValueError("percentage must be between 0 and 1, got {}".format(percentage))
    explain = counts.sum() * percentage  # number of occurrences that must be covered
    total = 0
    for i, num in enumerate(counts):
        total += num
        if total >= explain:
            print("Index of last word to include in value_count: {}".format(i))
            return i
    # Only reached when `counts` is empty (assuming non-negative counts).
    return None
    

In [31]:
# Find the 90 % cut-off position, then confirm how many occurrences the entries up to and
# including that position cover.
i = get_idx(0.9)
print(value_count.iloc[: i + 1].sum())


Index of last word to include in value_count: 7659
30585149


In [46]:
def percentage_accounted_for(num_unique_words, counts=None):
    """Print and return the share (in percent) of all occurrences covered by the top entries.

    num_unique_words (int) : Position of the last entry to include. The slice runs up to and
        including this position, i.e. it spans num_unique_words + 1 entries.
    counts (pandas.Series, optional) : Occurrence counts, expected to be ordered from most to
        least frequent. Defaults to the notebook-level `value_count`.

    Returns the covered share in percent.
    """
    if counts is None:
        counts = value_count
    # NOTE(review): the extra entry presumably compensates for the comma token that heads the
    # frequency list and is dropped before the word lists are saved - confirm this is intended,
    # since the printed message reports num_unique_words, not num_unique_words + 1.
    # .iloc makes the positional slice explicit, independent of the index labels.
    covered = counts.iloc[:num_unique_words + 1].sum()
    pct = covered / counts.sum() * 100
    print("{} unique words account for {} % of all words in korpus".format(num_unique_words, pct))
    return pct

In [37]:
# Coverage when the frequency list is cut at 8000, slightly past the 90 % cut-off found above.
percentage_accounted_for(8000)

8000 unique words account for 90.2366544744036 % of all words in korpus


In [68]:
# Coverage when the frequency list is cut at 50000.
percentage_accounted_for(50000)

50000 unique words account for 96.93004799416184 % of all words in korpus


96.93004799416184

In [41]:
# The 1000 entries at the bottom of the frequency list.
value_count.tail(1000)

Dyerberg                 1
forvaltningsdistrikts    1
Shysss                   1
342500                   1
894000                   1
                        ..
fornuftslov              1
Hvalfangskommision       1
fangststoppe             1
nyindspille              1
tvangsmæssighed          1
Name: token, Length: 1000, dtype: int64

In [44]:
# Number of distinct tokens that occur more than once.
int((value_count > 1).sum())

221038

In [None]:
# Vocabulary restricted to tokens seen at least twice; tokens that occur only once are dropped.
vocab = value_count.loc[value_count > 1]


In [47]:
# Coverage when the frequency list is cut at 10000.
percentage_accounted_for(10000)

10000 unique words account for 91.3711617176672 % of all words in korpus


91.3711617176672

In [59]:
# Track cumulative coverage in steps of 1000 entries, together with the gain of each step.
percentages, pct_gain, num_words = [], [], []
for n in range(1000, min(len(value_count), 5001), 1000):  # last step processed is n == 5000
    num_words.append(n)
    pct_explained = percentage_accounted_for(n)
    print("pct_explained: {}".format(pct_explained))
    # The first step has no predecessor and is compared with itself, so its gain is 0.0.
    previous_pct = percentages[-1] if percentages else pct_explained
    percentages.append(pct_explained)
    gain = pct_explained - previous_pct
    print("Adding the next 1000 words added a gain of {}".format(gain))
    pct_gain.append(gain)

1000 unique words account for 74.16523115765685 % of all words in korpus
pct_explained: 74.16523115765685
Adding the next 1000 words added a gain of 0.0
2000 unique words account for 80.50416822380404 % of all words in korpus
pct_explained: 80.50416822380404
Adding the next 1000 words added a gain of 6.338937066147196
3000 unique words account for 83.80597528786198 % of all words in korpus
pct_explained: 83.80597528786198
Adding the next 1000 words added a gain of 3.3018070640579396
4000 unique words account for 85.93485918083293 % of all words in korpus
pct_explained: 85.93485918083293
Adding the next 1000 words added a gain of 2.128883892970947
5000 unique words account for 87.45002398236781 % of all words in korpus
pct_explained: 87.45002398236781
Adding the next 1000 words added a gain of 1.5151648015348798


Save list with 50k most frequent words

In [62]:
import pickle

# Keep the 50,000 most frequent entries, leaving out the comma token (punctuation, not a word).
# Filtering by value rather than dropping position 0 stays correct even if the comma is not
# ranked first.
most_frequent_words = value_count.index[value_count.index != ","][:50000]

# The context manager closes the file handle once the list has been written.
with open('most_frequent_words_50k.pkl', 'wb') as pkl_file:
    pickle.dump(list(most_frequent_words), pkl_file)

Save list with 8k most frequent words

In [63]:
import pickle

# Keep the 8,000 most frequent entries, leaving out the comma token (punctuation, not a word).
# Filtering by value rather than dropping position 0 stays correct even if the comma is not
# ranked first.
most_frequent_words = value_count.index[value_count.index != ","][:8000]

# The context manager closes the file handle once the list has been written.
with open('most_frequent_words_8k.pkl', 'wb') as pkl_file:
    pickle.dump(list(most_frequent_words), pkl_file)

In [70]:
import pickle

# Keep the 100,000 most frequent entries, leaving out the comma token (punctuation, not a word).
# Filtering by value rather than dropping position 0 stays correct even if the comma is not
# ranked first.
most_frequent_words = value_count.index[value_count.index != ","][:100000]

# The context manager closes the file handle once the list has been written.
with open('most_frequent_words_100k.pkl', 'wb') as pkl_file:
    pickle.dump(list(most_frequent_words), pkl_file)

In [7]:
import pickle

# Keep the 200,000 most frequent entries, leaving out the comma token (punctuation, not a word).
# Filtering by value rather than dropping position 0 stays correct even if the comma is not
# ranked first.
most_frequent_words = value_count.index[value_count.index != ","][:200000]

# The context manager closes the file handle once the list has been written.
with open('most_frequent_words_200k.pkl', 'wb') as pkl_file:
    pickle.dump(list(most_frequent_words), pkl_file)

In [65]:
# Read the 8k list back from disk as a sanity check of the save step.
# pickle.load is acceptable here only because the file was written by this notebook; never
# unpickle files from an untrusted source.
with open("most_frequent_words_8k.pkl", "rb") as fp: 
    test = pickle.load(fp)

In [66]:
# Compare the length of the reloaded list with the in-memory one.
# NOTE(review): `most_frequent_words` is reassigned by every save cell, so the two lengths only
# agree when the 8k save cell was the last one executed; on a top-to-bottom run the 100k and
# 200k save cells run in between.
print(len(test))
print(len(most_frequent_words))

8000
8000


In [67]:
# Peek at the first entries of the reloaded list.
test[:10]

['og', 'være', 'i', 'en', 'den', 'at', 'til', 'det', 'af', 'på']