In [4]:
from praw.models import MoreComments
from pymongo import MongoClient
from tqdm.autonotebook import tqdm

import praw

In [5]:
# For crawling reddit corpus
REDDIT_CLIENT_ID = '[REDACTED]'
REDDIT_CLIENT_SECRET = '[REDACTED]'
REDDIT_SOURCES = ['wordplay']

# For finding heteronyms from wiktionary
WIKTIONARY_PATH = './dataset/enwiktionary-20200520-pages-articles.xml.bz2'

# For save items in mongodb
# NOTE: If it doesn't connect, please change here
MONGO_URL = 'mongodb://localhost:27017/'
MONGO_DATABASE = 'nenwnlp3-v1'

# Stages which will be done
STAGES = [
    # 'CrawlReddit',
    'PreprocessReddit',
    'PreprocessWiktionary'
]

In [6]:
client = MongoClient(MONGO_URL)
db = client[MONGO_DATABASE]

In [7]:
def crawl_reddit(subreddit_name):
    coll_submission = db['%s-submissions' % subreddit_name]
    coll_comments = db['%s-comments' % subreddit_name]
    
    reddit = praw.Reddit(
        client_id = REDDIT_CLIENT_ID,
        client_secret = REDDIT_CLIENT_SECRET,
        user_agent = 'NenwNLP/3.0'
    )

    reddit.read_only = True

    subreddit = reddit.subreddit(subreddit_name)
    submission_count = 0
    comment_count = 0

    for submission in tqdm(subreddit.new(limit = 1000), total=1000):
        # Add submission to the database
        coll_submission.insert_one({
            'id': submission.id,
            'title': submission.title,
            'selftext': submission.selftext,
            'selftext_html': submission.selftext_html
        })

        # Crawling comments
        forests = [submission.comments]
        while len(forests) > 0:
            comments = forests.pop()
            comment_list = []
            for comment in comments:
                if isinstance(comment, MoreComments):
                    # If some comment is omitted, expand it
                    forests.append(comment.comments())
                    continue

                comment_list.append({
                    'id': comment.id,
                    'body': comment.body
                })
                comment_count += 1

                if comment.replies and len(comment.replies) > 0:
                    # If there are replies to the comments, expand it
                    forests.append(comment.replies)
            
            # Batch add crawled comments to the database
            coll_comments.insert_many(comment_list)

        submission_count += 1

        if submission_count % 50 == 1:
            tqdm.write("%d Submission, %d Comments have been crawled from %s!" % (
                submission_count,
                comment_count,
                subreddit_name
            ))


if REDDIT_CLIENT_ID is not None and 'CrawlReddit' in STAGES:
    for source in REDDIT_SOURCES:
        crawl_reddit(source)

In [8]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize.casual import TweetTokenizer

import re

# Regular expressions / tokenizer to preprocess text
#     reddit_identifier_regex: matches r/subredditname or u/username
#     link_regex: matches URL
#     word_regex: matches non-word character
#     word_tokenizer: Used TweetTokenizer to deal well with internet data
reddit_identifier_regex = re.compile(r'\/?(?:ru)\/[a-zA-Z0-9-_]+')
link_regex = re.compile(r'https?:\/\/[a-zA-Z0-9@.]+\.[a-zA-Z0-9]{1,6}(?:\:\d{5})?(?:[\/#?&%=a-zA-Z0-9._~+-]+)?')
word_regex = re.compile('[^a-zA-Z0-9.,\'"!?-]')
word_tokenizer = TweetTokenizer()


# preprocess_text: preprocesses text data
#     Substitute reddit identifier with 'it'
#     Substitute url with 'link'
#     Tokenize
#     Substitute non-word with ''
def preprocess_text(text):
    text = re.sub(reddit_identifier_regex, 'it', text)
    text = re.sub(link_regex, 'link', text)
    sents = sent_tokenize(text)
    sentences = []
    
    for sent in sents:
        words = [
            re.sub(word_regex, '', word) for word in word_tokenizer.tokenize(sent)
        ]
        
        words = [
            word for word in words if len(word) > 0
        ]
        
        sentences.append({
            'sent': words
        })
    
    return sentences


def preprocess_submission(submission):
    return preprocess_text(
        submission['title'] + '\n' + submission['selftext']
    )


def preprocess_comment(comment):
    return preprocess_text(
        comment['body']
    )


# preprocess_rows_to_sents: Find items from dataset and preprocess
#     Extract items from database collection
#     Preprocess it using given function
#     Save items to the output collection
def preprocess_rows_to_sents(coll, output_coll, preprocess_fn):
    buffer = []
    counts = coll.count_documents({})
    sentence_counts = 0
    
    for i, item in tqdm(enumerate(coll.find(batch_size = 50)), total = counts):
        sentences = preprocess_fn(item)
        sentence_counts += len(sentences)
        buffer.extend(sentences)

        if i % 50 == 0:
            output_coll.insert_many(buffer)
            buffer = []

    if len(buffer) > 0:
        output_coll.insert_many(buffer)
    
    return sentence_counts


def preprocess_subreddit(subreddit_name):
    coll_sentences = db['sentences']
    coll_submissions = db['%s-submissions' % subreddit_name]
    coll_comments = db['%s-comments' % subreddit_name]
    
    sentence_counts = preprocess_rows_to_sents(
        coll_submissions, coll_sentences, preprocess_submission
    )
    print("%d Sentences have been imported from %s-submissions" % (
        sentence_counts, subreddit_name
    ))

    sentence_counts = preprocess_rows_to_sents(
        coll_comments, coll_sentences, preprocess_comment
    )
    print("%d Sentences have been imported from %s-comments" % (
        sentence_counts, subreddit_name
    ))


if 'PreprocessReddit' in STAGES:
    for source in REDDIT_SOURCES:
        preprocess_subreddit(source)

HBox(children=(FloatProgress(value=0.0, max=974.0), HTML(value='')))


2510 Sentences have been imported from wordplay-submissions


HBox(children=(FloatProgress(value=0.0, max=2506.0), HTML(value='')))


4530 Sentences have been imported from wordplay-comments


In [9]:
from collections import Counter
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# lemmatizer = WordNetLemmatizer()
stopwords_eng = stopwords.words('english')
word_filtered = Counter()

coll_sentences = db.sentences
counts = coll_sentences.count_documents({})

# Find candidate of heteronyms
#     Which tag is noun, adv, adj, or verb
#     Which is consisted with alphabet
#     Which is not 1-letter word
#     Which is not stopword
for item in tqdm(coll_sentences.find(batch_size = 50), total = counts):
    sent = item['sent']
    tag_sets = pos_tag(sent)
    valid_words = []
    
    for (word, tag) in tag_sets:
        word = word.lower()
        
        if tag not in ('NN', 'RB', 'JJ', 'VB'):
            continue
        
        if not word.isalpha():
            continue
        
        if len(word) == 1:
            continue
            
        if word in stopwords_eng:
            continue
        
        valid_words.append(word)
    
    word_filtered.update(valid_words)


HBox(children=(FloatProgress(value=0.0, max=7040.0), HTML(value='')))




In [10]:
# No filtering since using local db
min_appearance = 1
word_querying = {word: count for word, count in word_filtered.items() if count >= min_appearance}

print(len(word_querying), word_querying.keys())



In [11]:
from wiktextract import parse_wiktionary, WiktionaryConfig

coll_vocabularies = db['vocabs']

# Find whether a word is heteronym candidate or not
def capture_callback(title, text):
    if title.lower() in word_querying:
        return True
    
    return False

# Encodes a word output into database item
def encode_word(word, meaning_count):
    for pronunciation in word['pronunciations'].values():
        pronunciation['pos'] = list(pronunciation['pos'])
        pronunciation['sense'] = list(pronunciation['sense'])
        pronunciation['glosses'] = list(pronunciation['glosses'])
        pronunciation['indexes'] = list(pronunciation['indexes'])
    
    word['pronunciations'] = list(word['pronunciations'].values())
    word['meaning_count'] = meaning_count
    
    return word

previous_word = {
    'name': None,
    'pronunciations': {}
}
word_index = 0
word_count = 0

# word_callback: Called when wiktextract finds a word
#     Determine if it is an entirely new word
#         If it is a new word, add it to the database and reset previous_word
#         If it is not a new word (Same Word, Different Definition), add entries to previous_word
#     Per pronunciations, accumulate IPA, POS, senses (not used), glosses and indexes
#         Word Index is used to determine how much word this pronunciation can represent
def word_callback(item):
    try:
        global previous_word, word_index, word_count

        if 'pronunciations' not in item:
            return

        if len(item['pronunciations']) == 0:
            return

        if previous_word['name'] != item['word']:
            if previous_word['name'] is not None:
                coll_vocabularies.insert_one(encode_word(previous_word, word_index))
                word_count += 1
                
                if word_count % 100 == 0:
                    print("%d words have been processed." % word_count)

            previous_word = {
                'name': item['word'],
                'pronunciations': {}
            }
            word_index = 0

        for pronunciation in item['pronunciations']:
            # Use only RP pronunciation. Does not use pronunciations in other locale
            if 'accent' in pronunciation and 'RP' not in pronunciation['accent']:
                continue

            if 'ipa' not in pronunciation:
                continue

            # If IPA is a list, take the first item
            pronunciation['ipa'] = pronunciation['ipa'][0] \
                if isinstance(pronunciation['ipa'], list) \
                else pronunciation['ipa']

            if pronunciation['ipa'] not in previous_word['pronunciations']:
                previous_word['pronunciations'][pronunciation['ipa']] = {
                    'ipa': pronunciation['ipa'],
                    'pos': set(),
                    'sense': set(),
                    'glosses': set(),
                    'indexes': set()
                }

            # Find which things this pronunciation can represent
            pronunciation_spec = previous_word['pronunciations'][pronunciation['ipa']]
            pronunciation_spec['pos'].add(item['pos'])
            pronunciation_spec['indexes'].add(word_index)

            if 'sense' in pronunciation:
                pronunciation_spec['sense'].add(pronunciation['sense'])

            for sense in item['senses']:
                if 'glosses' not in sense:
                    continue

                pronunciation_spec['glosses'].update(sense['glosses'])
        
        word_index += 1
    
    except BaseException as error:
        if 'word' in item:
            print("Exception while processing %s: %s" % (item['word'], str(error)))
        
        else:
            print("Exception while processing an item: %s" % str(error))
    
    except:
        if 'word' in item:
            print("Unknown exception while processing %s!" % (item['word']))
        
        else:
            print("Unknown exception while processing an item!")
    

# Parse Wiktionary
if 'PreprocessWiktionary' in STAGES:
    config = WiktionaryConfig(
        capture_languages = ['English'],
        capture_pronunciation = True
    )

    parse_wiktionary(
        WIKTIONARY_PATH,
        config,
        word_callback,
        capture_callback
    )

100 words have been processed.
thou IN English/pron: ERROR: Unrecognized work form specifier: 'Nominative singular'
200 words have been processed.
300 words have been processed.
400 words have been processed.
500 words have been processed.
600 words have been processed.
700 words have been processed.
800 words have been processed.
900 words have been processed.
1000 words have been processed.
1100 words have been processed.
1200 words have been processed.
1300 words have been processed.
1400 words have been processed.
1500 words have been processed.
1600 words have been processed.
1700 words have been processed.
1800 words have been processed.
1900 words have been processed.
2000 words have been processed.
2100 words have been processed.
2200 words have been processed.
2300 words have been processed.
past IN English/noun: ERROR: unrecognized template in inside gloss: {{Daniel Webster}}
past IN English/noun: ERROR: unrecognized template in inside gloss: {{Daniel Webster}}
2400 words hav

In [12]:
# Find potential heteronyms from the database
#     More or equal than 2 pronunciations
#     More or equal than 2 meanings
potential_heteronyms = coll_vocabularies.find({
    '$and': [
        {'$expr': {'$gt': [{'$size': "$pronunciations"}, 1]}},
        {'meaning_count': {'$gt': 1}}
    ]
})

heteronyms = {}
# Find heteronyms from the potential heteronyms
#     Has different meaning (glosses) per pronunciations
for potential_heteronym in potential_heteronyms:
    pronunciations = potential_heteronym['pronunciations']
    first_meaning = pronunciations[0]['glosses']
    is_heteronym = False
    
    for other_item in pronunciations:
        other_meaning = other_item['glosses']
        
        if other_meaning != first_meaning:
            is_heteronym = True
            break
    
    if is_heteronym:
        heteronyms[potential_heteronym['name']] = potential_heteronym

print("Found %d heteronyms" % len(heteronyms))
print(heteronyms.keys())

Found 75 heteronyms
dict_keys(['number', 'second', 'beer', 'thou', 'abuse', 'accent', 'close', 'thee', 'subject', 'ai', 'tear', 'read', 'lead', 'dig', 'grind', 'prove', 'put', 'wind', 'low', 'house', 'heavy', 'liver', 'river', 'live', 'crown', 'mare', 'peer', 'bass', 'em', 'produce', 'pure', 'bow', 'affect', 'cos', 'process', 'row', 'nun', 'discord', 'apply', 'singer', 'fine', 'ay', 'aye', 'math', 'tush', 'reg', 'yea', 'resort', 'prime', 'ere', 'champ', 'digest', 'hinder', 'redress', 'toot', 'nope', 'evening', 'gallant', 'irony', 'windy', 'gin', 'console', 'ya', 'punt', 'implement', 'sith', 'underage', 'repetition', 'meta', 'misuse', 'submission', 'commune', 'gravely', 'refill', 'nachos'])


In [14]:
from collections import Counter
from nltk.corpus import wordnet as wn

# generator_key: A function which takes generator function,
# and makes it into a key function, which is used in sort
#
# Compare 1st yield, 2nd yield, ... as a lexicographical order
def generator_key(gen):
    def comp_round(gen1, gen2):
        v1 = None
        try :
            v1 = next(gen1)

        except StopIteration:
            pass

        v2 = None
        try:
            v2 = next(gen2)

        except StopIteration:
            pass

        if v1 is None and v2 is None:
            return None

        elif v1 is None:
            return 1

        elif v2 is None:
            return -1

        if v1 == v2:
            return 0

        elif v1 > v2:
            return -1

        else:
            return 1
    
    class Key:
        def __init__(self, obj):
            self.obj = obj
        
        def comp(self, other):
            g1 = gen(self.obj)
            g2 = gen(other.obj)
            
            while True:
                res = comp_round(g1, g2)
                if res is None:
                    return 0
                
                if res != 0:
                    return res
        
        def __lt__(self, other):
            return self.comp(other) > 0
        
        def __gt__(self, other):
            return self.comp(other) < 0
        
        def __eq__(self, other):
            return self.comp(other) == 0
        
        def __le__(self, other):
            return self.comp(other) >= 0
        
        def __ge__(self, other):
            return self.comp(other) <= 0
        
        def __ne__(self, other):
            return self.comp(other) != 0
    
    return Key


# merge_round: takes two sorted array, and make a sorted array by doing an mergesort
# cutlen: return the top n value
def merge_round(sorted_arr1, sorted_arr2, key, reverse = False, cutlen = None):
    result = []
    arr2_idx = 0
    top_item = sorted_arr2[arr2_idx]
    top_key = key(top_item)
    
    for item in sorted_arr1:
        item_key = key(item)
        
        while (
            top_key is not None and
            (
                (not reverse and item_key > top_key) or
                (reverse and item_key < top_key)
            )
        ):
            result.append(top_item)
            if cutlen is not None and len(result) >= cutlen:
                return result
            
            arr2_idx += 1
            if arr2_idx not in sorted_arr2:
                top_key = None
                break
            
            top_item = sorted_arr2[arr2_idx]
            top_key = key(top_item)
        
        result.append(item)
        if cutlen is not None and len(result) >= cutlen:
            return result
    
    result += sorted_arr2[arr2_idx:]
    
    if cutlen is not None:
        return result[:cutlen]
    
    return result


# Find heteronyms in sentence, and return its indexes
def find_heteronyms(sent):
    sent_heteronym_idx = []
    
    for idx, word in enumerate(sent):
        if word.lower() in heteronyms:
            sent_heteronym_idx.append(idx)
    
    return sent_heteronym_idx


# Takes a sentence and indexes of heteronyms,
# and return homograph and its appearance count
def count_homographs(sent, idx):
    counter = Counter()
    
    for i in idx:
        counter.update((sent[i], ))
    
    return {
        key: value for key, value in counter.items() if value > 1
    }


# Takes a tagged sentence and disambiguation result
# and return how much homographs with same POS are there
def count_samepos_homograph(tagged_sent, disambiguation):
    counter_dict = {}
    
    for idx, pronunciation in disambiguation.items():
        if pronunciation is None:
            continue
        
        word, tag = tagged_sent[idx]
        word_pos = '%s/%s' % (word.lower(), tag)
        
        if word_pos not in counter_dict:
            counter_dict[word_pos] = {}
        
        counter_dict[word_pos][pronunciation['ipa']] = 1
    
    # If a word with POS have multiple pronunciations, accumulate one
    return sum([1 for item in counter_dict.values() if len(item) > 1])


# Convert PENN POS tag into Wiktionary POS tag
def penn_to_wiktionary(pos):
    if pos in ('JJ', 'JJR', 'JJS'):
        return 'adj'
    
    elif pos in ('NN', 'NNP', 'NNS'):
        return 'noun'

    elif pos in ('RB', 'RBR', 'RBS'):
        return 'adv'

    elif pos in ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'):
        return 'verb'
    
    return None


disambiguation_ngram = 5
# disambiguate_word: Find pronunciation of idx-th word of tagged sents
def disambiguate_word(tagged_sent, idx):
    word, pos = tagged_sent[idx]
    wiktionary_pos = penn_to_wiktionary(pos)
        
    dict_word = coll_vocabularies.find_one({
        'name': word.lower()
    })
    
    # Find available pronunciations based on its POS and POS that a pronunciation can represent
    #     If there's only one pronunciation available, return that
    first_pronunciation = None
    avail_pronunciations = []
    for pronunciation in dict_word['pronunciations']:
        if first_pronunciation is None:
            first_pronunciation = pronunciation

        if wiktionary_pos in pronunciation['pos']:
            avail_pronunciations.append(pronunciation)
    
    if len(avail_pronunciations) == 0:
        return first_pronunciation
    
    if len(avail_pronunciations) == 1:
        return avail_pronunciations[0]
    
    # Find context from n-gram
    #     Alphabetical word from +- 2 words
    context = set()
    
    window_size = (disambiguation_ngram - 1) // 2
    for i in range(
        max(0, idx - window_size),
        min(len(tagged_sent), idx + window_size + 1)
    ):
        ctx_word = tagged_sent[i][0]
        if ctx_word.isalpha():
            context.add(ctx_word.lower())
    
    max_score = None
    # For glosses, extract words
    # Find maximum matching pronunciation by intersection of its glosses and context
    for pronunciation in avail_pronunciations:
        gloss_words = set()
        for gloss in pronunciation['glosses']:
            gloss_words = gloss_words | set([
                gloss_word.lower()
                for gloss_word in word_tokenizer.tokenize(gloss)
                if gloss_word.isalpha()
            ])
        
        current_score = len(context & gloss_words)
        if max_score is None:
            max_score = {
                'pronunciation': pronunciation,
                'score': current_score
            }
        
        if max_score['score'] > current_score:
            max_score['pronunciation'] = pronunciation
            max_score['score'] = current_score
    
    return max_score['pronunciation']


disambiguation_cache = {}
# For every heteronyms, does an disambiguation
# The result is memoized
def disambiguate_sent(sent, sent_heteronym_idx = None):
    if sent_heteronym_idx is None:
        sent_heteronym_idx = find_heteronyms(sent)
    
    sent_hash = hash(tuple(sent))
    if sent_hash in disambiguation_cache:
        return disambiguation_cache[sent_hash]
    
    tagged_sent = pos_tag(sent)
    result = {}    
    for idx in sent_heteronym_idx:
        result[idx] = disambiguate_word(tagged_sent, idx)
    
    disambiguation_cache[sent_hash] = result
    return result


# A key function to sort sentences, sorted in lexicographical order and lazy-evaluated
# 1. Count of heteronyms
# 2. Count of homographs
# 3. Count of homographs which have same POS
def key_sentence(sent):
    sent_heteronym_idx = find_heteronyms(sent)
    yield len(sent_heteronym_idx)
    
    sent_homograph_cnt = count_homographs(sent, sent_heteronym_idx)
    yield len(sent_homograph_cnt)

    if len(sent_homograph_cnt) == 0:
        yield 0
    
    else:
        disambiguation = disambiguate_sent(sent, sent_heteronym_idx)
        yield count_samepos_homograph(pos_tag(sent), disambiguation)


coll_sentences = db.sentences
counts = coll_sentences.count_documents({})
buffer = []
top_sents = []
sort_key = generator_key(key_sentence)
top_n = 30
batch_size = min(50, top_n)

# Batch fetch from the sentence collections
# Then sort its result, does 1 round of merge sort and cut top 30 sentences
for i, item in tqdm(enumerate(
    coll_sentences.find(batch_size = batch_size)
), total = counts):
    
    buffer.append(item['sent'])
    
    if i % batch_size == 0:
        buffer_sorted = sorted(buffer, key = sort_key, reverse = True)
        top_sents = merge_round(top_sents, buffer_sorted, key = sort_key, cutlen = top_n, reverse = True)
        buffer = []

if len(buffer) > 0:
    buffer_sorted = sorted(buffer, key = sort_key, reverse = True)
    top_sents = merge_round(top_sents, buffer_sorted, key = sort_key, cutlen = top_n, reverse = True)

# Print results
results = []
for sent in top_sents:
    disambiguation = disambiguate_sent(sent)
    pronunciations = [
        "%s%s" % (sent[key], value['ipa'])
        for key, value in sorted(
            list(disambiguation.items()),
            key=lambda item: item[0]
        )
    ]
    
    print(tuple(key_sentence(sent)), ' '.join(sent))
    print(', '.join(pronunciations))
    
    results.append((' '.join(sent), *pronunciations))


import csv

with open('./results.csv', 'w', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    
    for result in results:
        writer.writerow(result)


HBox(children=(FloatProgress(value=0.0, max=7040.0), HTML(value='')))


(5, 1, 0) The following poem is taken from here link " A bass might eat a bass when dining on the town A bass might eat the bass if the singer chanced to drown .
bass/bæs/, bass/bæs/, bass/bæs/, bass/bæs/, singer/ˈsɪŋə/
(4, 2, 1) However , read does not rhyme with lead , nor does lead with read .
read/ɹiːd/, lead/lɛd/, lead/lɛd/, read/ɹɛd/
(4, 2, 0) Rhyming with Heteronyms In a book you may have read about the toxic traits of lead it says to each word , with care , read for to better health it aims to lead .
read/ɹiːd/, lead/lɛd/, read/ɹiːd/, lead/lɛd/
(4, 1, 0) - - - another thing I just noticed , S resembles number 5 , and Z partially resembles number 7 , when saying number of letters long more commonly uses third letter that resembles number of letters long for the more common third letters of each variant , coincidence ?
number/ˈnʌmbə/, number/ˈnʌmbə/, number/ˈnʌmbə/, number/ˈnʌmbə/
(4, 0, 0) Neither does Read past tense of read rhymes with Lead to lead from the front .
Read/ɹɛd/,