In [1]:
import pandas as pd
import difflib
import spacy
import operator
from nltk.corpus import wordnet as wn
nlp = spacy.load("en_core_web_lg")

In [2]:
infile = 'vocab_assistance_input_example_data.csv'
outfile = 'vocab_assistance_output_example_data.csv'
in_df = pd.read_csv(infile)
input_data = list(in_df.itertuples(index=False, name=None))

In [3]:
def find_diffs(comp_sentence, simp_sentence):
    ''' find list of diffs'''
    doc_comp = nlp(comp_sentence)
    doc_simp = nlp(simp_sentence)
    d = difflib.Differ()
    diff = d.compare([w.text for w in doc_comp], [w.text for w in doc_simp])
    deleted = []
    replaced = []
    for i,s in enumerate(diff):
        if s[0]=='-':
            word = s[2:]
            for w in doc_comp:
                if w.text==word and not w.is_stop and not w.is_punct:
                    deleted.append(w)
        if s[0]=='+':
            word = s[2:]
            for w in doc_simp:
                if w.text == word and not w.is_stop and not w.is_punct:
                    replaced.append(w)
    return deleted,replaced 

In [4]:
def find_potential_matches(deleted,replaced):
    '''find potential word matches'''
    replaced_pos = [w.pos_ for w in replaced]
    potential_pairs = []
    for del_w in deleted:
        candidate_word = {}
        to_skip = False
        for repl_w in replaced:
            if del_w.lemma_ == repl_w.lemma_:
                #skip this word
                to_skip = True
            elif replaced_pos.count(repl_w.pos_) == 1 and del_w.pos_ == repl_w.pos_:
                potential_pairs.append([del_w, repl_w])
                break
            elif del_w.pos_ == repl_w.pos_: 
                candidate_word[repl_w] = del_w.similarity(repl_w)
        if candidate_word:
            top_similar = sorted(candidate_word.items(), key=operator.itemgetter(1),reverse=True)[0][0]
            potential_pairs.append([del_w, top_similar])
        if not to_skip and not del_w in [item for pair in potential_pairs for item in pair]:
            potential_pairs.append([del_w, None])
    return potential_pairs
    

In [5]:
def eliminate_dupes(potential_pairs):
    ''' eliminate duplicate matches'''
    matched = [item[1] for item in potential_pairs]
    dupes = {}
    for word in matched:
        indices = [index for index, item in enumerate(matched) if item == word]
        if len(indices) > 1 and not word is None:
            dupes[word] = indices
    for word, indices in dupes.items():
        similarity_words = [potential_pairs[i][0] for i in indices]
        similarity_scores={}
        for sim_w in similarity_words:
            similarity_scores[sim_w] = sim_w.similarity(word)
        top_similar = sorted(similarity_scores.items(), key=operator.itemgetter(1),reverse=True)[0][0]
        for i in indices:
            if potential_pairs[i][0] != top_similar:
                potential_pairs[i][1] = None
    return potential_pairs
        
    

In [6]:
final_dict = {}
i = 0
for pair in input_data:
    comp_sentence,simp_sentence = pair
    
    deleted,replaced = find_diffs(comp_sentence, simp_sentence)
    potential_pairs = find_potential_matches(deleted,replaced)
    final_pairs = eliminate_dupes(potential_pairs)
    dict_pairs = []
    
    for pair in final_pairs:  
        syns = wn.synsets(pair[0].text)
        meaning = None
        if syns:
            meaning = syns[0].definition()
        final_dict[i] = {'complicated_sentence': comp_sentence, 'simplified_sentence': simp_sentence, 'difficult_word': pair[0], 'easy_word': pair[1], 'meaning_of_difficult_word': meaning}
        i = i + 1


In [7]:
out_df = pd.DataFrame.from_dict(final_dict, orient='index')
out_df.to_csv(outfile, index=False)
out_df

Unnamed: 0,complicated_sentence,simplified_sentence,difficult_word,easy_word,meaning_of_difficult_word
0,i love cats,i like dogs,love,like,a strong positive emotion of regard and affection
1,i love cats,i like dogs,cats,dogs,feline mammal usually having thick soft fur an...
2,"Theia, a one-year-old bully breed mix, was hit...","Theia , a one-year-old breed mix , was hit by ...",bully,,a cruel and brutal fellow
3,one side of the armed conflicts is composed ma...,one side of the armed conflict is made up of t...,composed,,form the substance of
4,one side of the armed conflicts is composed ma...,one side of the armed conflict is made up of t...,mainly,,for the most part
...,...,...,...,...,...
83,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",lamenting,,express grief verbally
84,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",fact,,a piece of information about circumstances tha...
85,"during an interview, edward gorey mentioned th...","during an interview, edward gorey said that ba...",fine,best,money extracted as a penalty
86,gable also earned an academy award nomination ...,gable also won an academy award nomination whe...,earned,won,earn on some commercial or business transactio...
