#### NOTES

#### Objective: 
This code extracts synonyms for a given list of words from the Merriam-Webster Thesaurus through its API.
#### Synonyms extraction with given word lists:
Update the word lists (restricted to 2k words in total, 1k per API key).
#### Analysis on extracted words:
Calculate the number of seed words, the number of expanded words (excluding seed words), and the expansion rate.


In [36]:
import requests
import json
from collections import defaultdict
import pandas as pd 
import numpy as np
import json 
import os
from datetime import datetime
from collections import Counter
import re




# Merriam-Webster API keys. Each key is read from the environment when it is
# set (MW_API_KEY_1 / MW_API_KEY_2) so real credentials do not have to be
# written into this file; the original placeholders remain as fallbacks.
API_KEY_1 = os.environ.get("MW_API_KEY_1", "YOUR_API_KEY1")
API_KEY_2 = os.environ.get("MW_API_KEY_2", "YOUR_API_KEY2")


# extract the raw synonyms using API calls
def extract_raw_syn_list(query_list,only_oneword_syn=True):
    """Fetch thesaurus entries for every word in ``query_list``.

    Each word is requested from the Merriam-Webster thesaurus endpoint, first
    with ``API_KEY_1`` and, if that request fails or its body is not JSON,
    again with ``API_KEY_2``.  Only entries whose headword equals the queried
    word are kept; their synonym lists are grouped by part of speech.

    Args:
        query_list: Words to look up.
        only_oneword_syn: Forwarded to ``clean_syn_list``; when true,
            synonyms containing a space are dropped.

    Returns:
        A ``(word_list, not_found_list)`` tuple.  ``word_list`` maps every
        queried word to ``{part_of_speech: [synonym_list, ...]}`` (an empty
        dict for words that failed).  ``not_found_list`` holds the words for
        which both requests failed or whose response did not have the
        expected entry structure.
    """
    from urllib.parse import quote  # local import: only needed by this function

    url_template = "https://www.dictionaryapi.com/api/v3/references/thesaurus/json/{}?key={}"
    word_list = defaultdict(dict)
    not_found_list = []
    for count, query in enumerate(query_list, start=1):
        # Percent-encode the word so characters such as '/', '?' or '#'
        # cannot change the path or query string of the request.
        encoded_query = quote(str(query), safe="")

        query_js = None
        for api_key in (API_KEY_1, API_KEY_2):
            try:
                response = requests.get(
                    url_template.format(encoded_query, api_key), timeout=30
                )
                query_js = response.json()
            except (requests.RequestException, ValueError):
                # Network error, timeout or non-JSON body: try the next key.
                continue
            break

        word_attr = defaultdict(list)
        try:
            # If both keys failed, query_js is still None and iterating it
            # raises TypeError, which records the word as not found instead of
            # aborting the whole run.  A response made of plain strings rather
            # than entry dicts fails the same way on the subscript below.
            for item in query_js:
                if item['hwi']['hw'] == query:  # keep exact headword matches only
                    pos = item['fl']  # part of speech of this entry
                    synset = clean_syn_list(item['meta']['syns'], only_oneword_syn)
                    word_attr[pos] += synset
        except (KeyError, TypeError, AttributeError):
            # Keep track of the words that could not be extracted.
            not_found_list.append(query)
            word_list[query] = {}
        else:
            word_list[query] = word_attr

        if count % 100 == 0:
            print(f"count==>{count}")
    print(f'len of not_found:{len(not_found_list)}')
    print(f'len of total valid seed words:{len(query_list)-len(not_found_list)}')
    return word_list,not_found_list

### keep only single-word synonyms
def clean_syn_list(synset,only_oneword_syn=True):
    """Drop multi-word synonyms from every synonym list in ``synset``.

    Args:
        synset: A list of synonym lists (lists of strings).  The inner lists
            are modified in place.
        only_oneword_syn: When true, every synonym containing a space is
            removed; when false, ``synset`` is returned untouched.

    Returns:
        The same ``synset`` object that was passed in.
    """
    if only_oneword_syn:
        for syn_list in synset:
            # Rebuild the list through slice assignment instead of calling
            # remove() while iterating: removing during iteration skips the
            # element after each removal, so consecutive multi-word synonyms
            # were left behind.  Slice assignment keeps the in-place update.
            syn_list[:] = [item for item in syn_list if ' ' not in item]
    return synset


"""
Save the raw extraction to a txt file for future use
"""
def save_raw_extraction_as_json(raw_word_list,not_found_list, output_name="raw_extraction"):
    """Write the raw extraction results to two timestamped ``.txt`` files.

    Both files are created in the current directory and contain JSON:
    ``<output_name><timestamp>.txt`` receives ``raw_word_list`` and
    ``<output_name>_not_found<timestamp>.txt`` receives ``not_found_list``.
    The timestamp is the current local time as day_month_hour-minute-second.
    """
    stamp = datetime.now().strftime("%d_%m_%H-%M-%S")
    targets = (
        ("./{}{}.txt".format(output_name, stamp), raw_word_list),
        ('./{}_not_found{}.txt'.format(output_name, stamp), not_found_list),
    )
    # Files are written one after the other, extraction results first.
    for path, payload in targets:
        with open(path, "w+") as handle:
            json.dump(payload, handle)

    
    
"""
output word_list to csv ==> 3 columns
word 
pos_sub= pos_num (indicate the pos and number of meaning of that pos )
synlist = the syn list of that pos_sub

"""
def output_word_list(word_list,output_name="word_list"):
    """Flatten the nested synonym mapping into a three-column CSV file.

    Every synonym list of a (word, part of speech) pair becomes one row:

    * ``word``    -- the queried word
    * ``pos_sub`` -- ``"<word>_<pos>_<NN>"`` where ``NN`` is the 1-based,
      two-digit position of the synonym list within that part of speech
    * ``synlist`` -- the synonym list itself (written as its ``str`` form)

    Args:
        word_list: Mapping ``{word: {pos: [synonym_list, ...]}}``.
        output_name: Output path without the ``.csv`` extension.

    Returns:
        Whatever ``DataFrame.to_csv`` returns for a path target (``None``).
    """
    rows = []
    for word, pos_map in word_list.items():
        for pos, syn_set in pos_map.items():
            for count, syn_list in enumerate(syn_set, start=1):
                index_string = "{}_{}_{:02d}".format(word, pos, count)
                rows.append((word, index_string, syn_list))
    # Build the frame from row records rather than np.array([...]).T: the
    # synonym lists differ in length, and recent NumPy versions refuse to
    # build an array from such ragged nested sequences.
    new_pf = pd.DataFrame(rows, columns=['word','pos_sub','synlist'])
    return new_pf.to_csv("{}.csv".format(output_name),index=False)


def main(query_list,only_oneword_syn=True):
    """Run the whole pipeline for ``query_list``.

    Extracts the synonyms, saves the raw extraction (results plus failed
    words) as JSON, then writes the flattened word list to CSV.  Both output
    steps use their default file names.
    """
    extracted, missing = extract_raw_syn_list(query_list, only_oneword_syn)
    save_raw_extraction_as_json(extracted, missing)
    output_word_list(extracted)
    
    

# if __name__=="__main__":
#     ### update your query_list here
#     pd = 
#     main(query_list)
    

### Synonyms extraction with given word lists

In [29]:
# Load the spreadsheet whose 'rank_imdb/rank_ngram' column is later passed to
# main() as the list of words to query.
query_list = pd.read_excel('./syn/rank_imdb_rank_ngram.xlsx')

In [33]:
# Run the extraction pipeline on every value of the 'rank_imdb/rank_ngram'
# column (presumably the seed words selected by that ratio -- verify against
# the spreadsheet).
main(query_list['rank_imdb/rank_ngram'].tolist())

len of not_found:128
len of total valid seed words:872


### Analysis on the words

In [38]:
# Load the word-list CSV for the first batch of seed words (presumably a saved
# copy of output_word_list()'s output -- confirm where this file comes from).
first1k = pd.read_csv("./syn/first1k/word_list_1st.csv")

In [64]:
# Every distinct alphabetic token found in any of the stringified synonym lists.
full_expand = {
    token
    for raw_list in first1k['synlist']
    for token in re.findall('[a-zA-Z]+', raw_list)
}
# Distinct seed words present in the word column.
seed_word = set(first1k['word'].unique())

In [79]:
# Turn each stringified synonym list back into a list of words by extracting
# runs of ASCII letters.
# NOTE(review): this regex splits hyphenated or apostrophised synonyms into
# separate tokens -- confirm that is acceptable for the analysis.
first1k['synlist_formatted']=first1k['synlist'].apply(lambda x: re.findall('[a-zA-Z]+',x))

In [86]:
first1k.head()

Unnamed: 0,word,pos_sub,synlist,synlist_formatted
0,indifference,indifference_noun_01,"['apathy', 'casualness', 'complacence', 'disin...","[apathy, casualness, complacence, disintereste..."
1,false,false_adjective_01,"['erroneous', 'inaccurate', 'incorrect', 'inex...","[erroneous, inaccurate, incorrect, inexact, in..."
2,false,false_adjective_02,"['artificial', 'bogus', 'dummy', 'ersatz', 'fa...","[artificial, bogus, dummy, ersatz, factitious,..."
3,false,false_adjective_03,"['bogus', 'counterfeit', 'fake', 'forged', 'in...","[bogus, counterfeit, fake, forged, inauthentic..."
4,false,false_adjective_04,"['affected', 'artificial', 'assumed', 'bogus',...","[affected, artificial, assumed, bogus, contriv..."


In [65]:
# Synonyms that are not themselves seed words; computed once for both lines below.
expanded_only = full_expand - seed_word
print(f'number of seed words : {len(seed_word)}')
print(f'number of expanded words (exclude seedwords) : {len(expanded_only)}')
print(f'expansion rate {len(expanded_only)/len(seed_word)}')

number of seed words : 819
number of expanded words (exclude seedwords) : 12182
expansion rate 14.874236874236875


In [122]:
# Flatten all parsed synonym lists into one list, then tally how often each
# synonym occurs across the whole table.
total = [syn for syn_list in first1k['synlist_formatted'] for syn in syn_list]
cnt = Counter(total)

In [129]:
# Number of distinct synonyms that occur more than once across all synonym lists.
sum(1 for occurrences in cnt.values() if occurrences > 1)

6281

In [144]:
# Sum 'syn_len' per seed word, then show only the words whose total exceeds 1.
# NOTE(review): no visible cell creates the 'syn_len' column -- presumably it is
# the length of each row's synlist_formatted; confirm before re-running.
groupby_word_syn_count = first1k[['word','syn_len']].groupby('word').sum()
groupby_word_syn_count.loc[groupby_word_syn_count['syn_len']>1]

Unnamed: 0_level_0,syn_len
word,Unnamed: 1_level_1
abruptly,2
absolute,82
abuse,51
abyss,5
acceptable,11
...,...
write,4
yea,16
yesterday,4
yet,33


In [148]:
# Count the non-null 'syn_len' entries per seed word, i.e. the number of rows
# each word has in the table.
# NOTE(review): `second1k` is not defined in any visible cell -- presumably it is
# loaded the same way as first1k; confirm before re-running.
second1k[['word','syn_len']].groupby('word')['syn_len'].count()

word
abruptly      1
absolute      5
abuse         6
abyss         1
acceptable    1
             ..
write         2
yea           3
yesterday     1
yet           5
zip           5
Name: syn_len, Length: 827, dtype: int64