In [102]:
import pandas as pd

# Preprocess
We leverage Google Research's synthetic word generator (published as part of the WinoDict project) to generate a large number of synthetic words. We then combine this with a high quality public dataset that contains words with their associated definition and a few usage examples. We use join the synthetic words with `word_def_usage` dataset to replace the real words with synthetic words to create a new dataset `word_def_usage_synthetic` that contains synthetic words with their associated definition and usage examples.

In [103]:
word_def_usage_path = "../data/word-meaning-examples.csv"
synthetic_words_path = "../data/words_300k.tsv"

In [104]:
# Load data
word_def_usage_df = pd.read_csv(word_def_usage_path)

word_def_usage_df = word_def_usage_df.dropna(
    subset=["Word", "Meaning", "Examples/0"]    # Drop rows with missing values (without at least one example), reset index
).reset_index(drop=True)
word_def_usage_df.shape

(13143, 12)

In [105]:
# 'Word's that are not unique
word_def_usage_df[word_def_usage_df['Word'].duplicated(keep=False)].shape

(866, 12)

In [106]:
synthetic_words_df = pd.read_csv(synthetic_words_path, sep='\t', names=['Word', 'Score', 'Rules'])
synthetic_words_df = synthetic_words_df.dropna().reset_index(drop=True)
synthetic_words_df.shape

(55084, 3)

In [107]:
# replace words in word_def_usage_df with words from synthetic_words_df
word_def_usage_synthetic_df = word_def_usage_df.copy()


def replace_word(i: int):
    # replace word
    word_def_usage_synthetic_df.Word[i] = synthetic_words_df.Word[i]

    # replace word in meaning
    word_def_usage_synthetic_df.Meaning[i] = str(
        word_def_usage_synthetic_df.Meaning[i]
    ).replace(
        str(word_def_usage_df.Word[i]),
        str(synthetic_words_df.Word[i]),
    )

    # replace word in examples
    for ei in range(10):
        example_col_id = f"Examples/{ei}"
        if word_def_usage_synthetic_df[example_col_id][i] != "nan":
            word_def_usage_synthetic_df[example_col_id][i] = str(
                word_def_usage_synthetic_df[example_col_id][i]
            ).replace(
                str(word_def_usage_df.Word[i]),
                str(synthetic_words_df.Word[i]),
            )


word_def_usage_df.index.map(lambda i: replace_word(i))
word_def_usage_synthetic_df.head()

Unnamed: 0,Word,Meaning,Examples/0,Examples/1,Examples/2,Examples/3,Examples/4,Examples/5,Examples/6,Examples/7,Examples/8,Examples/9
0,undes,a phrase used to reference a situation that is...,The poor orphan’s life hasn’t been a bed of ro...,"Because Mark studied for many weeks, taking th...",The stressed business owner quickly realized t...,,,,,,,
1,undel,something that is believed to be true without ...,Religious people have the a priori belief that...,The jaded woman made a priori assumptions that...,Christopher Columbus had the a priori belief t...,People make a priori assumptions that the sun ...,Lawyers use a priori arguments about their def...,,,,,
2,undef,"a large, long-eared nocturnal animal that uses...",The long-eared aardvark slept all day and ate ...,"With its rabbit like ears and piglike snout, t...","Living a solitary nightlife, the African aardv...",The aardvark is a snouty mammal that dines hea...,"Though it is a small mammal, the closest relat...",,,,,
3,overy,in amazement; shocked,I was taken aback by the server’s rude comment...,"While Amy enjoyed the film as a whole, as a mi...",Ken believed he had a great marriage so he was...,"At seventy years of age, Wanda was taken aback...",Although I had been exercising daily and eatin...,,,,,
4,overb,toward the stern; behind,The wind was abaft which made the ship glide f...,They placed the lifeboats in the rear part of ...,My first sailing lesson was learning that abaf...,"The sea spray was coming from abaft, making my...",I was told to sit abaft to lessen my seasickne...,,,,,


In [108]:
# split data into train, test and prompt sets
train = 0.5
test = 0.2
prompt = 0.3

train_df = word_def_usage_synthetic_df.sample(frac=train, random_state=42)
test_df = word_def_usage_synthetic_df.drop(train_df.index).sample(frac=test / (1 - train), random_state=42)
prompt_df = word_def_usage_synthetic_df.drop(train_df.index).drop(test_df.index).sample(frac=prompt / (1 - train - test), random_state=42)

train_df.shape, test_df.shape, prompt_df.shape

((6572, 12), (2628, 12), (3943, 12))

In [109]:
# verify that there are no duplicates in train, test and prompt sets
print(train_df[train_df.duplicated(subset=['Word', 'Meaning'], keep=False)].shape)
print(test_df[test_df.duplicated(subset=['Word', 'Meaning'], keep=False)].shape)
print(prompt_df[prompt_df.duplicated(subset=['Word', 'Meaning'], keep=False)].shape)

(0, 12)
(0, 12)
(0, 12)


In [110]:
# write to csv
train_df.to_csv("../out/word_def_usage_synthetic_train.csv", index=False)
test_df.to_csv("../out/word_def_usage_synthetic_test.csv", index=False)
prompt_df.to_csv("../out/word_def_usage_synthetic_prompt.csv", index=False)