# Step 1: Extract top bigrams from the corpus

## Set up

In [1]:
# Filesystem locations: the data folder and the corpus file stored inside it.
PATH_DATA_FOLDER = "../data/"
PATH_WORDS_CORPUS = f"{PATH_DATA_FOLDER}fil9"

In [35]:
# constants
# Minimum number of occurrences a bigram needs in order to survive the
# frequency filter applied to the collocation finder.
# NOTE(review): the filter cell reads this name, so this cell has to run
# first; the recorded execution counts suggest it was last run afterwards --
# confirm with a fresh "Restart & Run All".
BIGRAM_MIN_FREQ = 15

In [2]:
import os
import pickle

import nltk
from nltk.collocations import *
from nltk.corpus import PlaintextCorpusReader

In [3]:
%%time
# Read the entire corpus file into memory as a single string.
with open(PATH_WORDS_CORPUS) as corpus_file:
    corpus = corpus_file.read()

CPU times: user 171 ms, sys: 846 ms, total: 1.02 s
Wall time: 1.7 s


In [4]:
%%time
# Tokenise the corpus on whitespace. The isinstance guard keeps this cell
# idempotent: once `corpus` is already a token list, re-running the cell is a
# no-op instead of raising AttributeError (lists have no .split()).
if isinstance(corpus, str):
    corpus = corpus.split()

CPU times: user 4.68 s, sys: 2.16 s, total: 6.85 s
Wall time: 6.81 s


In [5]:
# Total number of whitespace-separated tokens in the corpus.
print(len(corpus))

124301826


## Score and save bigrams

In [6]:
%%time
# Count the word pairs found in the token list. This is by far the slowest
# step of the notebook (see the timing reported for this cell).
finder = BigramCollocationFinder.from_words(corpus)

CPU times: user 5min 50s, sys: 1.18 s, total: 5min 51s
Wall time: 5min 51s


In [7]:
# Discard bigrams that occur fewer than BIGRAM_MIN_FREQ times. The filter
# modifies `finder` in place, so every scoring cell that follows only sees
# the remaining bigrams.
finder.apply_freq_filter(BIGRAM_MIN_FREQ)

In [24]:
def save_bigrams(scoring_method_name, bigrams_list, data_folder=None):
    """Write the bigrams of a scored list to a pickle file and a text file.

    Two files are created in the output folder:

    * ``bigrams_<scoring_method_name>.pkl`` -- the pickled list of bigrams;
    * ``bigrams_<scoring_method_name>.txt`` -- one bigram per line, its words
      joined by a single space (no trailing newline).

    The order of ``bigrams_list`` is preserved and the scores are dropped.

    Args:
        scoring_method_name: Label inserted into both output file names.
        bigrams_list: Sequence of items whose first element is the bigram
            (a sequence of strings); any further elements, such as the
            score, are ignored.
        data_folder: Directory to write into. Defaults to
            ``PATH_DATA_FOLDER`` when omitted.
    """
    if data_folder is None:
        data_folder = PATH_DATA_FOLDER

    bigrams = [scored[0] for scored in bigrams_list]

    # Build the shared path prefix once with os.path.join so that both files
    # get the same well-formed path whether or not the folder string ends
    # with a separator (the old code produced "<folder>//bigrams_..." for the
    # pickle only).
    base_path = os.path.join(data_folder, 'bigrams_' + scoring_method_name)

    with open(base_path + '.pkl', 'wb') as f:
        pickle.dump(bigrams, f)

    with open(base_path + '.txt', 'w') as f:
        f.write("\n".join(" ".join(bigram) for bigram in bigrams))

In [25]:
# Object exposing the association measures (likelihood_ratio, raw_freq, ...)
# that the scoring cells pass to finder.score_ngrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [26]:
# %%time
# scored_pmi = finder.score_ngrams(bigram_measures.pmi)
# save_bigrams('pmi', scored_pmi)
# NOTE: PMI scoring is disabled. Any timing output shown for this cell is
# presumably left over from a run made before it was commented out.

CPU times: user 4.47 s, sys: 56 ms, total: 4.53 s
Wall time: 4.52 s


In [27]:
%%time
# Score the filtered bigrams with the likelihood-ratio measure, then save the
# bigrams (scores dropped by save_bigrams) under the 'likelihood_ratio' label.
scored_likelihood_ratio = finder.score_ngrams(bigram_measures.likelihood_ratio)
save_bigrams('likelihood_ratio', scored_likelihood_ratio)

CPU times: user 6.07 s, sys: 56 ms, total: 6.13 s
Wall time: 6.13 s


In [28]:
%%time
# Score the filtered bigrams by raw frequency, then save the bigrams (scores
# dropped by save_bigrams) under the 'raw_freq' label.
scored_raw_freq = finder.score_ngrams(bigram_measures.raw_freq)
save_bigrams('raw_freq', scored_raw_freq)

CPU times: user 5.12 s, sys: 67.9 ms, total: 5.19 s
Wall time: 5.18 s


In [29]:
# %%time
# scored_poisson_stirling = finder.score_ngrams(bigram_measures.poisson_stirling)
# save_bigrams('poisson_stirling', scored_poisson_stirling)
# NOTE: Poisson-Stirling scoring is disabled. Any timing output shown for this
# cell is presumably left over from a run made before it was commented out.

CPU times: user 4.53 s, sys: 56 ms, total: 4.58 s
Wall time: 4.58 s


In [30]:
# %%time
# scored_chi_sq = finder.score_ngrams(bigram_measures.chi_sq)
# save_bigrams('chi_sq', scored_chi_sq)
# NOTE: chi-squared scoring is disabled. Any timing output shown for this cell
# is presumably left over from a run made before it was commented out.

CPU times: user 5.14 s, sys: 95.8 ms, total: 5.23 s
Wall time: 5.36 s


In [31]:
# %%time
# scored_dice = finder.score_ngrams(bigram_measures.dice)
# save_bigrams('dice', scored_dice)
# NOTE: Dice scoring is disabled. Any timing output shown for this cell is
# presumably left over from a run made before it was commented out.

CPU times: user 5.55 s, sys: 84.1 ms, total: 5.63 s
Wall time: 5.62 s


In [32]:
# %%time
# scored_jaccard = finder.score_ngrams(bigram_measures.jaccard)
# save_bigrams('jaccard', scored_jaccard)
# NOTE: Jaccard scoring is disabled. Any timing output shown for this cell is
# presumably left over from a run made before it was commented out.

CPU times: user 4.24 s, sys: 80 ms, total: 4.32 s
Wall time: 4.32 s


In [33]:
# Number of bigrams that were scored, i.e. those still held by `finder` after
# the frequency filter.
print(len(scored_likelihood_ratio))

602733


#### `likelihood_ratio` and `raw_freq` look more suitable for our purposes; the other scores strongly prefer non-compositional bigrams, so they might be useful in future work on searching for stable expressions.