In [2]:
#Imports and initialization of HISCO vocabulary.

import gzip
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk.metrics import distance

hisco_vocab = None 
with gzip.open('./hisco.json.gz', 'r') as jsonfile:
    hisco_vocab = json.load(jsonfile)
    
# limit to desired language's occupations only
lang = input("Please type the desired language for the HISCO dataset (i.e., English, Dutch, etc.): ")
hisco_vocab = {k:v for k,v in hisco_vocab.items() if "Language" in v.keys() and v['Language'] == lang}

Please type the desired language for the HISCO dataset (i.e., English, Dutch, etc.):  Dutch


In [3]:
#Load test data.
data = None 
with gzip.open('./biocorpus.json.gz', 'r') as jsonfile:
    data = json.load(jsonfile)

In [4]:
#Create output file

try:
    f = open("Matches.txt", "x")
except IOError:
    print("Please remove the current 'Matches.txt' file from the directory before continuing.")

Please remove the current 'Matches.txt' file from the directory before continuing.


In [None]:
#Go through each entry in data, format it, and print any potential matches.
lang = lang.lower()
correct = 0 #Test variable for amount of correct matches.
iter = 0 #Test variable for finite loop iteration. #FOR TESTING
for key, value in data.items(): #Parse through each item within the textual dataset.
    sample = data[key]
    sample_tokens_cleaned = word_tokenize(sample, language=lang)

    iter_2 = 0 #FOR TESTING
    index = 0 #Index used for contextual overview.
    for token in sample_tokens_cleaned:  #Parse through each token in a given text item.
        if token not in string.punctuation and token not in stopwords.words(lang): #Ignores stopwords and punctuation, but keeps them for context.
            iter_3 = 0
            match = 100
            title_m = ''
            code_m = ''
            context = ''
            for hisco_concept in hisco_vocab.values(): #Parse through HISCO occupation codes.
                title = hisco_concept['Occupational title']
                code = hisco_concept['Hisco code']
                levenshtein = nltk.edit_distance(token, title) #Levenshtein distance calculation.
                if match > levenshtein: #If the match is closer than the previous one, overwrite it.
                    match = levenshtein
                    title_m = title
                    code_m = code
                if match == 0: #If exact match, break loop and proceed with analysis.
                    break

                iter_3 += 1 #FOR TESTING
                if iter_3 > 500:#FOR TESTING
                    break#FOR TESTING

            if match < 2: #NB: this value may change in the coming days based on research into the distance between compounded words which are common in Dutch. Will most likely be adaptable based on language
                cont = 1
                while cont < 20: #Add pre-context.
                    if (index + cont < len(sample_tokens_cleaned) and index - cont >= 0):
                        context = sample_tokens_cleaned[index - cont] + " " + context
                    cont += 1
                    if sample_tokens_cleaned[index - cont] == '.' or sample_tokens_cleaned[index - cont] == '?' or sample_tokens_cleaned[index - cont] == '!' or sample_tokens_cleaned[index - cont] == ':':
                        cont = 21
                context = context + token + " "
                cont = 1
                while cont < 20: #Add post context.
                    if (index + cont < len(sample_tokens_cleaned) and index - cont >= 0):
                        context = context + sample_tokens_cleaned[index + cont] + " "
                    cont += 1
                    if sample_tokens_cleaned[index + cont] == '.' or sample_tokens_cleaned[index - cont] == '?' or sample_tokens_cleaned[index - cont] == '!' or sample_tokens_cleaned[index - cont] == ':':
                        cont = 21
                print("Match found!") #Prints necessary information for the user to judge the match.
                print("Currently analyzing the biography of: " + key)
                print("Word with potential occupational match: " + token)
                print("Potential HISCO occupation match: " + title_m)
                print("In-text contextualization of the word: ")
                print("\"" + context + "\"")
                print("HISCO code for potential occupation: " + code_m)
                print("Levenshtein distance between word and potential HISCO occupation: " + str(match))
                done = False
                while done == False:
                    answer = input("Do you agree with this matching? y/n") #NB: final version will not be a keyboard prompt, but a button in the interface.
                    if answer == 'y': #If match accepted, it is recorded.
                        print("Match stored.")
                        done = True
                        f.write("Biography in question: " + key + ".\n")
                        f.write("Match between token '" + token + "' and HISCO occupation '" + title_m + "', with HISCO code " + code_m + ".\n")
                        f.write("Levenshtein distance of: " + str(match) + ".\n")
                        f.write("Contextual overview: '" + context + "\n")
                        f.write("\n")
                    elif answer == 'n': #If unnaccepted, match is discarded.
                        print("Match removed.")
                        done = True
                    else:
                        print("Please enter y or n.")
        index += 1
        if index + 1 > len(sample_tokens_cleaned):
            index = 0

            iter_2 += 1 #FOR TESTING
            if iter_2 > 500:#FOR TESTING
                break#FOR TESTING
    iter += 1 #FOR TESTING
    if iter > 5:#FOR TESTING
        break#FOR TESTING

Match found!
Currently analyzing the biography of: Petronella Voûte
Word with potential occupational match: diepe
Potential HISCO occupation match: dieper
In-text contextualization of the word: 
"Mogelijkerwijs heeft de diepe indruk van een ontmoeting met een prostituee in haar jeugd bij dit besluit een rol gespeeld "
HISCO code for potential occupation: 97400
Levenshtein distance between word and potential HISCO occupation: 1


Do you agree with this matching? y/n n


Match removed.
Match found!
Currently analyzing the biography of: Antoine Louis des Tombe
Word with potential occupational match: burgemeester
Potential HISCO occupation match: Burgemeester
In-text contextualization of the word: 
"Des Tombe , lid van de Christelijk Historische Unie , werd in oktober 1934 benoemd tot burgemeester van Abcoude "
HISCO code for potential occupation: 20110
Levenshtein distance between word and potential HISCO occupation: 1


Do you agree with this matching? y/n n


Match removed.
Match found!
Currently analyzing the biography of: Antoine Louis des Tombe
Word with potential occupational match: burgemeester
Potential HISCO occupation match: Burgemeester
In-text contextualization of the word: 
"Zijn benoeming tot burgemeester betekende voor de hele familie een verhuizing "
HISCO code for potential occupation: 20110
Levenshtein distance between word and potential HISCO occupation: 1
