In [60]:
import collections
import re

In [61]:
import pandas as pd
import nltk.tokenize 
import numpy as np

## Step 1

In [115]:
# Raw synonym lines: the first line of the file is skipped and no header row is
# expected, so the columns are labelled 0, 1, ...
synonyms_df = pd.read_csv("synonyms.csv", header=None, skiprows=1)

# Only the first two columns of the vocabulary file are used.
middle_vocabulary_df = pd.read_csv("middle_vocabulary.csv", usecols=[0, 1])
middle_vocabulary_df.columns = ["word", "pos"]

# Lower-cased vocabulary words, kept as a set for fast membership tests.
middle_vocabulary = {word.lower() for word in middle_vocabulary_df["word"]}

In [116]:
def is_word(token):
    """Return True when every character of ``token`` is alphabetic.

    An empty token has no offending character and therefore yields True.
    """
    return all(char.isalpha() for char in token)

In [117]:
# One entry of the synonym map.
#   word        -- the (lower-cased) word the entry belongs to
#   synonyms    -- list of synonyms collected for that word
#   is_original -- True when the word was the first vocabulary word on a line
SynonymList = collections.namedtuple(
    "SynonymList", ["word", "synonyms", "is_original"]
)


def extract_synonyms(line, middle_vocabulary, synonym_map):
    """Record the synonym group found on one line into ``synonym_map``.

    The line is tokenized with NLTK; every token is lower-cased and stripped,
    and only tokens contained in ``middle_vocabulary`` are kept. A word that
    occurs more than once on the same line is kept only at its first
    position, so no word is ever listed as its own synonym. Lines with fewer
    than two distinct vocabulary words are ignored.

    The first kept word is stored as an "original" entry whose synonyms are
    the remaining words; this replaces any entry already stored for it. Each
    remaining word gets a non-original entry listing the other words of the
    line. If it already has a non-original entry, the new synonyms are
    appended (repeats across lines are kept, so de-duplicate when consuming).
    Existing original entries of the remaining words are left untouched.

    Args:
        line: Text containing a word followed by its synonyms.
        middle_vocabulary: Container of lower-case words, tested with ``in``.
        synonym_map: Dict mapping word -> SynonymList; updated in place.

    Returns:
        None.
    """
    words = []
    for token in nltk.tokenize.word_tokenize(line):
        token = token.lower().strip()
        # Keep first occurrences only: a repeated word would otherwise end up
        # in its own synonym list.
        if token in middle_vocabulary and token not in words:
            words.append(token)

    if len(words) < 2:
        return

    head = words[0]
    synonym_map[head] = SynonymList(head, words[1:], True)

    for i in range(1, len(words)):
        word = words[i]
        synonyms = words[:i] + words[i + 1:]
        if word not in synonym_map:
            synonym_map[word] = SynonymList(word, synonyms, False)
        elif not synonym_map[word].is_original:
            synonym_map[word].synonyms.extend(synonyms)


def test__extract_synonyms():
    """Check ``extract_synonyms`` on two lines that share one synonym map."""
    synonym_map = {}

    def check(word, expected_synonyms, expected_is_original):
        # Same two assertions the test makes for every word it inspects.
        assert synonym_map[word].synonyms == expected_synonyms
        assert synonym_map[word].is_original == expected_is_original

    # "bare" is outside the vocabulary here, so only blank/empty are recorded.
    extract_synonyms("Blank —– Empty, bare", {"blank", "empty"}, synonym_map)
    check("blank", ["empty"], True)
    check("empty", ["blank"], False)

    extract_synonyms("Bare —– blank, empty", {"blank", "empty", "bare"}, synonym_map)
    # An original entry is not modified when the word shows up as a synonym.
    check("blank", ["empty"], True)
    # A non-original entry accumulates synonyms across lines, repeats included.
    check("empty", ["blank", "bare", "blank"], False)
    check("bare", ["blank", "empty"], True)


test__extract_synonyms()

In [118]:
# Build the synonym map from every raw line (column 0 of synonyms_df).
synonym_map = {}
for line in synonyms_df[0]:
    extract_synonyms(line, middle_vocabulary, synonym_map)

# One record per word, in alphabetical order of the word.
result = []
for word in sorted(synonym_map):
    synonym_list = synonym_map[word]
    if synonym_list.is_original:
        # Original entries keep the order in which the synonyms were listed.
        synonyms = synonym_list.synonyms
    else:
        # Non-original entries may hold repeats gathered from several lines.
        # sorted() makes the de-duplicated order deterministic; plain
        # list(set(...)) changes between interpreter runs because string
        # hashing is randomized per process.
        synonyms = sorted(set(synonym_list.synonyms))
    result.append(
        {
            "word": synonym_list.word,
            "synonyms": ", ".join(synonyms),
            "is_original": synonym_list.is_original,
        }
    )

In [119]:
# Write the collected records to disk with a fixed column order and no index column.
pd.DataFrame(result, columns=["word", "synonyms", "is_original"]).to_csv(
    "processed_synonyms.csv", index=False
)

## Step 2

In [139]:
# skiprows=1 drops the first line of the file; the line after it supplies the
# column names (read_csv's default header handling).
# NOTE(review): presumably a hand-edited copy of processed_synonyms.csv — confirm.
edited_synonyms = pd.read_csv("edited_synonyms.csv", skiprows=1)

In [150]:
# Look up the is_original flag of every word recorded in synonym_map; words
# that are not in the map end up as missing values in the new column.
is_original_by_word = {
    word: entry.is_original for word, entry in synonym_map.items()
}
edited_synonyms["is_original"] = edited_synonyms["word"].map(is_original_by_word)

In [228]:
# Preview the first rows to check the is_original column added above.
edited_synonyms.head()

Unnamed: 0,word,synonyms,is_original
0,able,capable,True
1,absent,away,False
2,accept,"agree, admit",False
3,accident,crash,False
4,account,consider,False


## Step 3

### Applying a Correction

In [211]:
def _first_tuple_item(raw):
    """Return the first comma-separated item of ``raw`` without decoration.

    Surrounding whitespace, parentheses and single quotes are stripped, e.g.
    "('able', 'adj')" -> "able".
    """
    unwrapped = raw.strip().strip("()")
    first_item = unwrapped.split(",")[0]
    return first_item.strip("'")


# Rows with any missing value are discarded before the columns are named.
corrected_synonyms_df = pd.read_csv("corrected_synonyms.csv", header=None).dropna()
corrected_synonyms_df.columns = ["word", "synonyms"]
corrected_synonyms_df["word"] = corrected_synonyms_df["word"].apply(_first_tuple_item)

In [217]:
# Left join on "word": every edited row is kept. Both frames carry a
# "synonyms" column, so pandas suffixes them as synonyms_x (edited) and
# synonyms_y (corrected).
final_synonyms = edited_synonyms.merge(corrected_synonyms_df, how="left", on="word")

In [219]:
# Start from the edited synonyms (synonyms_x) and overwrite them with the
# corrected ones (synonyms_y) wherever a correction exists.
final_synonyms["synonyms"] = final_synonyms["synonyms_x"]
synonyms_y_not_null_mask = final_synonyms["synonyms_y"].notnull()
final_synonyms.loc[synonyms_y_not_null_mask, "synonyms"] = final_synonyms.loc[
    synonyms_y_not_null_mask, "synonyms_y"
]
# The axis must be passed by keyword: the positional form drop(labels, 1) was
# deprecated in pandas 1.3 and raises TypeError in pandas 2.x.
final_synonyms.drop(["synonyms_x", "synonyms_y"], axis=1, inplace=True)

### Excluding Some Words

In [176]:
# header=None makes pandas treat every line as data and label the columns 0, 1, 2, ...
exclude_df = pd.read_csv("words_to_exclude.csv", header=None)

In [197]:
# Collect the lower-cased first token of every non-missing cell in columns 0-3.
words_to_exclude = []

for i in range(4):
    words_to_exclude += [cell.split()[0].lower() for cell in exclude_df[i].dropna()]

### Creating a CSV File

In [227]:
# Keep only original words that are not on the exclusion list, and export
# just the word and its synonyms.
keep_mask = final_synonyms["is_original"] & ~final_synonyms["word"].isin(
    set(words_to_exclude)
)
final_synonyms.loc[keep_mask, ["word", "synonyms"]].to_csv(
    "filtered_edited_synonyms.csv", index=False
)