# Evaluation of Vocabulary Assistance Method

In [1]:
import pandas as pd
from simplertimes.vocab import find_diffs, find_potential_matches, eliminate_dupes, get_word_definition


[nltk_data] Downloading package perluniprops to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hansg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Step 1: Testing on Synthetic Data

Test the system using some hand-crafted samples and samples from the published outputs of ACCESS.

In [2]:
# Input: hand-crafted / ACCESS example sentence pairs. Output: CSV written to the working directory.
infile = 'docs/dev_files/test_sample/vocab_assistance_input_example_data.csv'
outfile = 'vocab_assistance_output_example_data.csv'
in_df = pd.read_csv(infile)
# itertuples() returns a one-shot iterator; materialise it as a list of plain
# tuples so the processing cell can be re-run without re-running this cell
# (an exhausted iterator would silently yield an empty result).
input_data = list(in_df.itertuples(index=False, name=None))

In [3]:
# Build one output row per (difficult word, easy word) pair, keyed by a running
# row number so the next cell can load it with orient='index'.
final_dict = {}
i = 0
for comp_sentence, simp_sentence in input_data:
    # Compare the two sentences, propose candidate pairings, then drop duplicates.
    deleted, replaced = find_diffs(comp_sentence, simp_sentence)
    potential_pairs = find_potential_matches(deleted, replaced)
    final_pairs = eliminate_dupes(potential_pairs, replaced)
    for pair in final_pairs:
        difficult_word = pair[0]
        easy_word = pair[1]
        # The definition is looked up from the difficult word's `.text` attribute.
        meaning = get_word_definition(difficult_word.text)
        final_dict[i] = {
            'complicated_sentence': comp_sentence,
            'simplified_sentence': simp_sentence,
            'difficult_word': difficult_word,
            'easy_word': easy_word,
            'meaning_of_difficult_word': meaning,
        }
        i += 1


In [4]:
# Turn the row-number -> record mapping into a table (one row per key),
# save it without the index column, and display it as the cell output.
out_df = pd.DataFrame.from_dict(data=final_dict, orient="index")
out_df.to_csv(path_or_buf=outfile, index=False)
out_df

Unnamed: 0,complicated_sentence,simplified_sentence,difficult_word,easy_word,meaning_of_difficult_word
0,i love cats,i like dogs,love,like,a strong positive emotion of regard and affection
1,i love cats,i like dogs,cats,dogs,feline mammal usually having thick soft fur an...
2,"Theia, a one-year-old bully breed mix, was hit...","Theia , a one-year-old breed mix , was hit by ...",bully,,a cruel and brutal fellow
3,one side of the armed conflicts is composed ma...,one side of the armed conflict is made up of t...,composed,,form the substance of
4,one side of the armed conflicts is composed ma...,one side of the armed conflict is made up of t...,mainly,,for the most part
...,...,...,...,...,...
83,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",lamenting,said,express grief verbally
84,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",fact,,a piece of information about circumstances tha...
85,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",fine,best,money extracted as a penalty
86,gable also earned an academy award nomination ...,gable also won an academy award nomination whe...,earned,won,earn on some commercial or business transactio...


## Step 2: Testing on ACCESS-Simplified ST Dataset

Test the system using the ACCESS-simplified Straits Times dataset.

In [5]:
# Input: ACCESS-simplified sentence pairs. Output: CSV written to the working directory.
infile = 'results/system_outputs/access_doc_sent_pair_simp.csv'
outfile = 'vocab_assistance_output_access_sent_pairs.csv'
in_df = pd.read_csv(infile)
# itertuples() returns a one-shot iterator; materialise it as a list of plain
# tuples so the processing cell can be re-run without re-running this cell
# (an exhausted iterator would silently yield an empty result).
input_data = list(in_df.itertuples(index=False, name=None))

In [6]:
# Build one output row per (difficult word, easy word) pair, keyed by a running
# row number so the next cell can load it with orient='index'.
final_dict = {}
i = 0
# Each input row has four fields; only the last two (the sentence pair) are used.
for _, _, comp_sentence, simp_sentence in input_data:
    # Compare the two sentences, propose candidate pairings, then drop duplicates.
    deleted, replaced = find_diffs(comp_sentence, simp_sentence)
    potential_pairs = find_potential_matches(deleted, replaced)
    final_pairs = eliminate_dupes(potential_pairs, replaced)
    for pair in final_pairs:
        difficult_word = pair[0]
        easy_word = pair[1]
        # The definition is looked up from the difficult word's `.text` attribute.
        meaning = get_word_definition(difficult_word.text)
        final_dict[i] = {
            'complicated_sentence': comp_sentence,
            'simplified_sentence': simp_sentence,
            'difficult_word': difficult_word,
            'easy_word': easy_word,
            'meaning_of_difficult_word': meaning,
        }
        i += 1


  candidate_word[repl_w] = del_w.similarity(repl_w)


In [7]:
# Turn the row-number -> record mapping into a table (one row per key),
# save it without the index column, and display it as the cell output.
out_df = pd.DataFrame.from_dict(data=final_dict, orient="index")
out_df.to_csv(path_or_buf=outfile, index=False)
out_df

Unnamed: 0,complicated_sentence,simplified_sentence,difficult_word,easy_word,meaning_of_difficult_word
0,The Johor Bahru-Singapore Rapid Transit System...,The Johor Bahru-Singapore Rapid Transit System...,progressing,work,develop in a positive way
1,45 per cent of the work on the Singapore side ...,45% of the work on the Singapore side has been...,cent,,a fractional monetary unit of several countries
2,Transport Minister S. Iswaran provided the upd...,Transport Minister S. Iswaran gave the update ...,provided,gave,give something useful or necessary to
3,Transport Minister S. Iswaran provided the upd...,Transport Minister S. Iswaran gave the update ...,visited,went,"go to see a place, as for entertainment"
4,He said: We are on track to achieve the comple...,He said: We are on track to get the completion...,achieve,,to gain with effort
...,...,...,...,...,...
111,His return to the platforms gives him access t...,His return to the platforms gives him a good w...,key,good,metal device shaped in such a way that when it...
112,His return to the platforms gives him access t...,His return to the platforms gives him a good w...,vehicles,,a conveyance that transports people or objects
113,Mr Trump powered his improbable 2016 president...,Mr Trump powered the 2016 presidential campaig...,improbable,,not likely to be true or to occur or to have o...
114,YouTube banned Mr Trump in 2021 for violating ...,YouTube banned Mr Trump in 2021 for being allo...,violating,allowed,fail to agree with; be in violation of; as of ...
