In [None]:
from nltk.corpus import wordnet as wn
from collections import defaultdict
from nltk.corpus import stopwords
from datasets import load_dataset
from tqdm import tqdm
from utils import *
import json

# Dataset

Source: https://github.com/thunlp/MultiRD/tree/master/EnglishReverseDictionary/data — definitions come from the following dictionaries:

- Wordnet
- The American Heritage Dictionary
- The Collaborative International Dictionary of English
- Wiktionary
- Webster’s

In [None]:
# Accumulators: word -> set of definitions, and word -> set of example sentences.
data_5d = defaultdict(set)
examples_5d = defaultdict(set)

# wantwords
for split in ['train', 'dev', 'test']:
    # Read the JSON explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(f'data/data_5d_{split}.json', 'r', encoding='utf-8') as f:
        for entry in json.load(f):
            word = entry['word']
            defi = entry['definitions']
            data_5d = add_new_data(data_5d, word, defi)

# wordnet (additional 54558 words or phrases)
for syn in wn.all_synsets():
    # Synset names have the form '<lemma>.<pos>.<nn>' and the lemma part may
    # itself contain dots, so strip only the last two fields instead of
    # cutting at the first dot (which would truncate such lemmas).
    word = syn.name().rsplit('.', 2)[0]
    defi = syn.definition()
    data_5d = add_new_data(data_5d, word, defi)
    for example in syn.examples():
        examples_5d = add_new_data(examples_5d, word, example)

# Persist definitions and examples together in one file.
save_data_to_json('data/data_5d.json', data_5d, examples_5d)

In [None]:
# Reload the dataset from 'data/data_5d.json' with the loader's default options
# and show its statistics. This rebinds `data_5d`, which later cells use.
data_5d = load_data_from_json('data/data_5d.json')
show_data_stats(data_5d)

In [None]:
# Load the same file with use_examples=True and show its statistics.
# NOTE(review): presumably this folds the stored example sentences into each
# word's entries -- confirm against load_data_from_json in utils.
data_5d_with_examples = load_data_from_json('data/data_5d.json', use_examples=True)
show_data_stats(data_5d_with_examples)

# Data Supplement

- Oxford English Dictionary (https://developer.oxforddictionaries.com)

In [None]:
# Load the Oxford English Dictionary data and show its statistics.
# 'data/data_oed.json' is not written by any cell visible in this notebook,
# so it must already exist on disk.
data_oed = load_data_from_json('data/data_oed.json')
show_data_stats(data_oed)

In [None]:
# Load the OED file again with use_examples=True and show its statistics.
# NOTE(review): presumably this also includes the stored example sentences --
# confirm against load_data_from_json in utils.
data_oed_with_examples = load_data_from_json('data/data_oed.json', use_examples=True)
show_data_stats(data_oed_with_examples)

- Wikipedia (https://huggingface.co/datasets/wikipedia)

In [None]:
# Build a word -> {first paragraph} mapping from the English Wikipedia dump,
# then keep only the words that also appear in `data_5d`.
wikipedia = load_dataset('wikipedia', '20200501.en', split='train')
data_wiki = defaultdict(set)

for article in tqdm(wikipedia):
    title_word = process_word_name(article['title'])
    # only use the first paragraph
    first_paragraph = article['text'].split('\n')[0]
    data_wiki = add_new_data(data_wiki, title_word, first_paragraph)

# remove words that are not in dictionary
# (collect the keys first so the mapping is not modified while iterating it)
words_not_in_dictionary = [w for w in data_wiki.keys() if w not in data_5d.keys()]
for title_word in words_not_in_dictionary:
    del data_wiki[title_word]

save_data_to_json('data/data_wiki.json', data_wiki)

In [9]:
# Reload the filtered Wikipedia data from 'data/data_wiki.json' and show its statistics.
data_wiki = load_data_from_json('data/data_wiki.json')
show_data_stats(data_wiki)

num of words: 51664
num of docs: 56065
max doc length: 359
min doc length: 1
mean doc length: 45.67
median doc length: 38.0


# Merge

In [None]:
# Union the definitions of every source, word by word, into a single mapping.
data_merged = defaultdict(set)

for source in (data_5d, data_oed, data_wiki):
    for word, definitions in source.items():
        # Skip empty entries so that no key is created for a word without definitions.
        if definitions:
            data_merged[word].update(definitions)

save_data_to_json('data/data.json', data_merged)

In [10]:
# Reload the merged data from 'data/data.json' and show its statistics.
# This rebinds `data_merged`, replacing any in-memory mapping of that name.
data_merged = load_data_from_json('data/data.json')
show_data_stats(data_merged)

num of words: 103874
num of docs: 1007305
max doc length: 359
min doc length: 1
mean doc length: 12.88
median doc length: 9.0


In [None]:
# Same merge as for the plain data, but using the with-examples variants of
# the 5d and OED data; the Wikipedia data is used as is.
data_merged_with_examples = defaultdict(set)

for source in (data_5d_with_examples, data_oed_with_examples, data_wiki):
    for word, entries in source.items():
        # Skip empty entries so that no key is created for a word without content.
        if entries:
            data_merged_with_examples[word].update(entries)

save_data_to_json('data/data_with_examples.json', data_merged_with_examples)

In [None]:
# Reload the merged with-examples data from 'data/data_with_examples.json'
# and show its statistics.
data_merged_with_examples = load_data_from_json('data/data_with_examples.json')
show_data_stats(data_merged_with_examples)

# Data Augmentation

In [None]:
def augment_data(data, out_name):
    """Augment `data` in place in two stages, saving a snapshot after each.

    Stage 1 adds a back-translated variant of each definition; stage 2 adds
    synonym-substituted variants produced by ``naw.SynonymAug``.

    Args:
        data: mapping of word -> set of definition strings. The sets are
            mutated in place.
        out_name: stem of the output files; snapshots are written to
            ``data/{out_name}_trans.json`` and
            ``data/{out_name}_trans_synonym.json``.

    Returns:
        The same `data` object, after both stages.
    """
    # Stage 1: back translation.
    # NOTE(review): 60000 is forwarded to words_to_aug; its meaning is not
    # visible here -- confirm in utils.
    for target in tqdm(words_to_aug(data, 60000)):
        # Iterate over a snapshot because the set grows inside the loop.
        for text in data[target].copy():
            translated = back_translate(text)
            if not translated:
                continue
            data[target].add(translated)

    save_data_to_json(f'data/{out_name}_trans.json', data)

    # Stage 2: synonym augmentation, applied to the output of stage 1.
    english_stopwords = stopwords.words('english')
    augmenter = naw.SynonymAug(stopwords=english_stopwords)
    for target in tqdm(words_to_aug(data)):
        # Snapshot again: new variants must not be augmented in this same pass.
        for text in data[target].copy():
            data[target].update(augmenter.augment(text, n=10))

    save_data_to_json(f'data/{out_name}_trans_synonym.json', data)

    return data

In [None]:
# Run both augmentation stages on the merged data, then reload each saved
# snapshot to show its statistics. augment_data adds to the sets of
# `data_merged` in place, so re-running this cell augments already-augmented data.
augment_data(data_merged, 'data_augmented')
show_data_stats(load_data_from_json('data/data_augmented_trans.json'))
show_data_stats(load_data_from_json('data/data_augmented_trans_synonym.json'))

In [12]:
# The augmentation call for the with-examples data is commented out, so this
# cell only shows statistics for the two snapshot files, which must already
# exist on disk.
# augment_data(data_merged_with_examples, 'data_augmented_with_examples')
show_data_stats(load_data_from_json('data/data_augmented_with_examples_trans.json'))
show_data_stats(load_data_from_json('data/data_augmented_with_examples_trans_synonym.json'))

num of words: 103874
num of docs: 1239899
max doc length: 359
min doc length: 1
mean doc length: 11.96
median doc length: 8.0
