In [1]:
import datetime
import os
import string

import nltk
from nltk import BigramCollocationFinder, word_tokenize, pos_tag
from nltk.corpus.reader import BNCCorpusReader
from nltk.collocations import BigramAssocMeasures

In [2]:
# Download the averaged-perceptron PoS tagger model so that nltk.pos_tag
# (called by the helper functions and export cells below) can run.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/netherwulf/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
# read British National Corpus

# location of the BNC XML texts, relative to the notebook directory
bnc_texts_dir = os.path.join(*('..', 'storage', 'bnc', 'raw_data', 'BNC', 'Texts'))

# regular expression selecting the corpus files: top-level folders A-K,
# one nested folder level, then the .xml documents
bnc_fileids_pattern = r'[A-K]/\w*/\w*\.xml'

bnc_reader = BNCCorpusReader(root=bnc_texts_dir, fileids=bnc_fileids_pattern)

In [4]:
# import bigram association measures

bigram_measures = BigramAssocMeasures()

# short measure name -> scoring function handed to finder.score_ngrams
measure_dict = dict(
    pmi=bigram_measures.pmi,
    dice=bigram_measures.dice,
    chi2=bigram_measures.chi_sq,
)

In [5]:
# initialize finder and preprocess BNC corpus

# Build the bigram finder sentence by sentence: each sentence is treated as
# a separate document (find MWEs inside sentences)
finder = BigramCollocationFinder.from_documents(bnc_reader.sents())

# Alternative kept for reference: treat whole corpus as a long sentence
# (find MWEs inside and BETWEEN sentences)
# finder = BigramCollocationFinder.from_words(bnc_reader.words())


In [6]:
def get_curr_time():
    """Return the current time of day formatted as ``HH:MM:SS``."""
    now = datetime.datetime.now()
    return now.strftime("%H:%M:%S")

In [7]:
def get_pos_tag(nltk_pos_tag):
    """Map an NLTK PoS tag to the coarse word class used in the exported files.

    Args:
        nltk_pos_tag: tag string as produced by ``pos_tag`` (e.g. ``'JJ'``,
            ``'NNS'``, ``'VBD'``); only its first character is inspected.

    Returns:
        ``'ADJ'``, ``'NOUN'`` or ``'VERB'`` when the tag starts with ``J``,
        ``N`` or ``V`` respectively; ``None`` for every other tag, including
        an empty string.
    """
    coarse_tags = {'J': 'ADJ', 'N': 'NOUN', 'V': 'VERB'}
    # slicing (not indexing) so that an empty tag yields None instead of IndexError
    return coarse_tags.get(nltk_pos_tag[:1])

In [8]:
# check word correctness
def check_word_correctness(word) -> bool:
    """Decide whether a token is acceptable as part of an exported MWE.

    A token is rejected when it is empty, contains a punctuation, quote/dash
    or digit character, starts with an upper-case letter, or when the PoS tag
    assigned to it in isolation does not start with ``J``, ``N`` or ``V``.

    Args:
        word: the token to validate.

    Returns:
        ``True`` when the token passes every check, ``False`` otherwise.
    """
    if len(word) == 0:
        return False

    # ASCII punctuation plus the typographic quote/dash characters listed here
    banned_chars = set(string.punctuation) | {'`', '~', '‘', '—', '\''}
    for symbol in word:
        if symbol in banned_chars or symbol.isdigit():
            return False

    if word[0].isupper():
        return False

    # the word is tagged on its own, i.e. without any sentence context
    isolated_tag = pos_tag([word])[0][1]
    return isolated_tag[0] in ('J', 'N', 'V')

In [9]:
# get number of occurrences for MWE
def get_mwe_freq(mwe, freq_list):
    """Look up how many times an MWE occurred.

    Args:
        mwe: the key identifying the MWE (the export cells pass the word tuple).
        freq_list: mapping from MWE key to its occurrence count; despite the
            name it is indexed by key, not by position.

    Returns:
        The value stored under ``mwe``; a missing key propagates the mapping's
        own lookup error.
    """
    frequency = freq_list[mwe]
    return frequency

In [10]:
# save list of measures to the TSV file
def save_mwe_list(mwe_list, measure_name, dataset_name, dataset_dir):
    """Write MWE rows to a tab-separated file with a header line.

    The file is created at
    ``<dataset_dir>/<dataset_name>_<measure_name>_incorrect_mwe.tsv`` and any
    existing file with that name is overwritten.

    Args:
        mwe_list: iterable of rows; every row is a sequence of strings in the
            order of the header columns (tags, words, measure value, frequency).
        measure_name: measure label used in the file name.
        dataset_name: dataset label used in the file name.
        dataset_dir: output directory; created when it does not exist yet.

    Raises:
        TypeError: if a row contains a non-string item (raised by ``str.join``).
    """
    column_names = ['first_word_tag', 'second_word_tag', 'first_word',
                    'second_word', 'measure_value', 'frequency']

    # create the target directory up front instead of failing inside open()
    if dataset_dir:
        os.makedirs(dataset_dir, exist_ok=True)

    out_filepath = os.path.join(dataset_dir, f'{dataset_name}_{measure_name}_incorrect_mwe.tsv')

    # explicit UTF-8: corpus tokens may contain non-ASCII characters that the
    # platform default encoding cannot always represent
    with open(out_filepath, 'w', encoding='utf-8') as out_file:
        out_file.write('\t'.join(column_names) + '\n')
        out_file.writelines('\t'.join(mwe_tuple) + '\n' for mwe_tuple in mwe_list)

In [12]:
# get cleaned list of MWEs for list of measures

dataset_name = 'bnc'
dataset_dir = os.path.join('..', 'storage', dataset_name, 'preprocessed_data')

# measures exported by this cell; add 'dice' / 'chi2' to process more of measure_dict
selected_measures = ('pmi',)

# upper bound on the number of MWEs written per measure
max_mwe_count = 100000

for measure_name in selected_measures:
    print(f'{get_curr_time()} : Generating incorrect MWEs list for {measure_name}')

    # get list of MWEs with specified measure
    desc_mwe_list = finder.score_ngrams(measure_dict[measure_name])

    # get frequencies of MWE
    freq_mwe_list = dict(finder.ngram_fd)

    # clean list of MWEs: walk the scored list back to front and stop as soon as
    # enough entries were accepted, so the costly per-word PoS tagging inside
    # check_word_correctness is not run over the entire scored list
    desc_mwe_list_cleaned = []
    for mwe_tuple in reversed(desc_mwe_list):
        if len(desc_mwe_list_cleaned) >= max_mwe_count:
            break
        # cheap score threshold first, then short-circuiting word validation
        if mwe_tuple[1] > 1 and all(check_word_correctness(mwe_word) for mwe_word in mwe_tuple[0]):
            desc_mwe_list_cleaned.append(mwe_tuple)

    # get list with MWE, measure value and frequency
    mwe_with_freq = []
    for mwe_words, measure_value in desc_mwe_list_cleaned:
        first_word, second_word = mwe_words[0], mwe_words[1]
        mwe_with_freq.append([
            get_pos_tag(pos_tag([first_word])[0][1]),
            get_pos_tag(pos_tag([second_word])[0][1]),
            first_word,
            second_word,
            str(measure_value),
            str(get_mwe_freq(mwe_words, freq_mwe_list)),
        ])

    # save dataset
    save_mwe_list(mwe_with_freq, measure_name, dataset_name, dataset_dir)

22:20:20 : Generating incorrect MWEs list for pmi


In [12]:
# get cleaned list of MWEs for single measure

measure_name = 'chi2'

dataset_name = 'bnc'

dataset_dir = os.path.join('..', 'storage', dataset_name, 'preprocessed_data')

# number of MWEs taken directly below the 3rd-quartile position
quartile_window_size = 100000

print(f'{get_curr_time()} : Generating incorrect MWEs list for {measure_name}')

# get list of MWEs with specified measure
desc_mwe_list = finder.score_ngrams(measure_dict[measure_name])

# get frequencies of MWE
freq_mwe_list = dict(finder.ngram_fd)

# clean list of MWEs (the scored list is traversed back to front; the whole list
# has to be filtered here because the quartile position depends on its length)
desc_mwe_list_cleaned = [
    mwe_tuple
    for mwe_tuple in reversed(desc_mwe_list)
    if all(check_word_correctness(mwe_word) for mwe_word in mwe_tuple[0])
]

# get last values from the end of the 3rd quartile
quartile_end = int(0.75 * len(desc_mwe_list_cleaned))
# clamp at 0: a negative start would be interpreted as an offset from the end
# of the list and silently select the wrong (possibly empty) range
quartile_start = max(0, quartile_end - quartile_window_size)
desc_mwe_list_cleaned = desc_mwe_list_cleaned[quartile_start:quartile_end]

# get list with MWE, measure value and frequency
mwe_with_freq = []
for mwe_words, measure_value in desc_mwe_list_cleaned:
    first_word, second_word = mwe_words[0], mwe_words[1]
    mwe_with_freq.append([
        get_pos_tag(pos_tag([first_word])[0][1]),
        get_pos_tag(pos_tag([second_word])[0][1]),
        first_word,
        second_word,
        str(measure_value),
        str(get_mwe_freq(mwe_words, freq_mwe_list)),
    ])

# save dataset
save_mwe_list(mwe_with_freq, f'{measure_name}_end_of_3rd_quartile', dataset_name, dataset_dir)

23:31:14 : Generating incorrect MWEs list for chi2


In [None]:
# sanity check: tag a single word and map the result to its coarse class
get_pos_tag(pos_tag(['banana'])[0][1])

'NOUN'

In [None]:
# number of scored bigrams held from the most recent finder.score_ngrams call
len(desc_mwe_list)

12374827