In [None]:
# Building the Golden Set!

# Our "golden set" takes the form of a JSON dictionary that maps each English
# word to a list of Chinese words:
# {eng word: [chn word, ...], eng word: [chn word, ...], ... } etc.
# We'll be taking this golden set from 4 different dictionaries
# Yabla, Cambridge, MDBG, and the Facebook AI set

# First we compile a list of ALL (English word, translations) pairs from every data set
import json

def _load_dictionary(path):
    """Return the JSON object stored at *path*.

    The file is decoded as ``utf-8-sig``, so a leading byte-order mark is
    skipped if one is present.
    """
    with open(path, encoding="utf-8-sig") as handle:
        return json.load(handle)


# Each source is kept both as the loaded dictionary (for lookups by English
# word) and as a list of its (key, value) pairs.
cdict = _load_dictionary("../Cambridge/JSON Data/full cdict three senses.json")
cdictWords = list(cdict.items())

mdbgdict = _load_dictionary("../MDBG/JSON Data/full mdbg dict three senses.json")
mdbgWords = list(mdbgdict.items())

ydict = _load_dictionary("../Yabla/JSON Data/yabla dict jsons/full yabla dict.json")
ydictWords = list(ydict.items())

facebookDict = _load_dictionary("../processed enzhDict.json")
facebookWords = list(facebookDict.items())

# One flat list of pairs, in the order Cambridge, MDBG, Yabla, Facebook.
allWords = [*cdictWords, *mdbgWords, *ydictWords, *facebookWords]

# with open("allWords.txt", "w", encoding="utf-8-sig") as out_file:
#       out_file.write("\n".join(allWords))

In [2]:
# Then we check all four to see the text equivalents
# If a word has the same translation in AT LEAST three datasets:
# add to golden set dictionary
# else, continue
# We could easily relax this to add a word to the golden set when at least two
# datasets share a translation, but that would require handling possible ties

from collections import Counter


def build_golden_set(dictionaries, min_agreement=3):
    """Collect the translations that enough dictionaries agree on.

    Args:
        dictionaries: Sequence of mappings from an English word to an
            iterable of Chinese translations. It is iterated twice, so it
            must not be a one-shot generator. Values are assumed to be
            lists of strings as loaded from the JSON files -- TODO confirm
            against the data. ``None`` values and ``None`` entries inside
            a value are ignored.
        min_agreement: Minimum number of distinct dictionaries that must
            list a translation under a given English word for the pair to
            be kept.

    Returns:
        A dict mapping each English word to the list of its accepted
        translations, without duplicates. Words and translations appear in
        the order they are first met when walking the dictionaries in the
        given order.
    """
    # First pass: for every English word, count how many dictionaries list
    # each translation.
    support = {}
    for dictionary in dictionaries:
        for key, senses in dictionary.items():
            if senses is None:
                continue
            # A set, so that a translation repeated inside one dictionary
            # still counts as a single vote from that dictionary.
            distinct = {sense for sense in senses if sense is not None}
            support.setdefault(key, Counter()).update(distinct)

    # Second pass: keep the translations that reached the threshold.
    golden = {}
    for dictionary in dictionaries:
        for key, senses in dictionary.items():
            if senses is None:
                continue
            for sense in senses:
                # Counter returns 0 for translations it has never seen.
                if sense is None or support[key][sense] < min_agreement:
                    continue
                accepted = golden.setdefault(key, [])
                if sense not in accepted:
                    accepted.append(sense)
    return golden


# Adding an ENG-ZH pair to the golden set if it occurs in 3 or more dictionaries
goldenSet = build_golden_set([cdict, mdbgdict, ydict, facebookDict])

with open("full golden set.json", "w", encoding="utf-8-sig") as out_file:
    json.dump(goldenSet, out_file, indent=4, ensure_ascii=False)