In [12]:
import datetime
import itertools
import os
import string

from nltk import BigramCollocationFinder
from nltk.corpus.reader import BNCCorpusReader
from nltk.collocations import BigramAssocMeasures

In [13]:
def get_curr_time():
    """Return the current local wall-clock time formatted as ``HH:MM:SS``."""
    now = datetime.datetime.now()
    return now.strftime("%H:%M:%S")

In [2]:
# Open the British National Corpus.
# The XML texts are expected under ../storage/bnc/raw_data/BNC/Texts,
# relative to the directory the notebook is run from.
bnc_texts_dir = os.path.join(
    '..', 'storage', 'bnc', 'raw_data', 'BNC', 'Texts',
)

# Only files whose relative path matches the pattern below are read.
bnc_reader = BNCCorpusReader(
    root=bnc_texts_dir,
    fileids=r'[A-K]/\w*/\w*\.xml',
)

In [3]:
# Bigram association measures used for scoring.
# Each key is the short measure name that is later embedded in the
# output file name; each value is the scoring function handed to the finder.
bigram_measures = BigramAssocMeasures()

measure_dict = dict(
    pmi=bigram_measures.pmi,
    dice=bigram_measures.dice,
    chi2=bigram_measures.chi_sq,
)

In [4]:
# Build the bigram collocation finder over the BNC corpus.

# Each sentence is treated as its own document (find MWEs inside sentences).
bnc_sentences = bnc_reader.sents()
finder = BigramCollocationFinder.from_documents(bnc_sentences)

# Alternative: treat the whole corpus as one long sentence
# (find MWEs inside and BETWEEN sentences).
# finder = BigramCollocationFinder.from_words(bnc_reader.words())


In [5]:
# check word correctness
def check_word_correctness(word) -> bool:
    """Tell whether ``word`` is acceptable as a component of an MWE.

    A word is rejected when it is empty, when any of its characters is an
    ASCII punctuation mark, one of the extra symbols listed below or a
    digit, or when its first character is uppercase.
    """
    if len(word) == 0:
        return False

    # symbols rejected in addition to string.punctuation
    extra_symbols = ('`', '~', '‘', '—')

    for char in word:
        if char in string.punctuation or char in extra_symbols or char.isdigit():
            return False

    # words starting with a capital letter are rejected as well
    return not word[0].isupper()

In [6]:
# get number of occurrences for MWE
def get_mwe_freq(mwe, freq_list):
    """Return the number of occurrences stored for ``mwe``.

    ``freq_list`` is indexed directly with ``mwe``; a missing entry raises
    whatever the container raises (``KeyError`` for a plain dict).
    """
    occurrences = freq_list[mwe]
    return occurrences

In [7]:
def save_mwe_list(mwe_list, measure_name, dataset_name, dataset_dir):
    """Write a list of MWE rows to a tab-separated file.

    The file is created as
    ``<dataset_dir>/<dataset_name>_<measure_name>_incorrect_mwe.tsv`` and
    starts with the header
    ``first_word  second_word  measure_value  frequency``.

    Args:
        mwe_list: iterable of rows; each row is an iterable of the four
            column values. Values are converted with ``str`` before writing.
        measure_name: name of the association measure, used in the file name.
        dataset_name: name of the dataset, used in the file name.
        dataset_dir: existing directory in which the file is created.
    """
    out_filepath = os.path.join(dataset_dir, f'{dataset_name}_{measure_name}_incorrect_mwe.tsv')
    header = ['first_word', 'second_word', 'measure_value', 'frequency']

    # Explicit UTF-8: corpus tokens may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to be able to encode them.
    with open(out_filepath, 'w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(header) + '\n')

        for mwe_tuple in mwe_list:
            # str() keeps rows with numeric values from breaking the join
            out_file.write('\t'.join(map(str, mwe_tuple)) + '\n')

In [11]:
# get cleaned list of MWEs for specific measures

dataset_name = 'bnc'
dataset_dir = os.path.join('..', 'storage', dataset_name, 'preprocessed_data')

# upper bound on the number of MWEs saved per measure
max_mwe_count = 100000

# bigram frequencies do not depend on the measure, so the lookup table is
# built once instead of being rebuilt on every loop iteration
freq_mwe_list = dict(finder.ngram_fd)

for measure_name, measure_func in measure_dict.items():
    print(f'{get_curr_time()} : Generating incorrect MWEs list for {measure_name}')

    # get list of (bigram, score) tuples for the specified measure
    desc_mwe_list = finder.score_ngrams(measure_func)

    # clean list of MWEs: walk the scored list in reverse order, keep only
    # bigrams whose words all pass check_word_correctness, and stop as soon
    # as max_mwe_count entries were collected (no full filtered copy is built)
    correct_mwes = (
        mwe_tuple
        for mwe_tuple in reversed(desc_mwe_list)
        if all(check_word_correctness(mwe_word) for mwe_word in mwe_tuple[0])
    )
    desc_mwe_list_cleaned = list(itertools.islice(correct_mwes, max_mwe_count))

    # get list with MWE, measure value and frequency
    mwe_with_freq = [
        [mwe[0], mwe[1], str(score), str(get_mwe_freq(mwe, freq_mwe_list))]
        for mwe, score in desc_mwe_list_cleaned
    ]

    # save dataset
    save_mwe_list(mwe_with_freq, measure_name, dataset_name, dataset_dir)

Generating incorrect MWEs list for pmi
Generating incorrect MWEs list for dice
Generating incorrect MWEs list for chi2


In [10]:
# spot check: look up the stored frequency of a single bigram in freq_mwe_list
# (freq_mwe_list is defined in the MWE generation cell, which must have been run first)
get_mwe_freq(('burps', 'last'), freq_mwe_list)

1