# Named Entity Recognition (NER) French tests

## Function section

The cells in this section must be run before any of the other cells.


In [3]:
!pip3 install -U nltk

[0m[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.1 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [5]:
import nltk

AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

In [4]:
from pathlib import Path
import requests
from time import sleep
import json
import csv
import os
from fuzzywuzzy import fuzz # fuzzy logic matching
from copy import deepcopy
from langdetect import detect
from langdetect import detect_langs
import datetime

import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

import spacy
from spacy import displacy
from collections import Counter

# Load the en_core_web_sm spaCy pipeline once; `nlp` is the callable applied to label text in the Spacy cells below.
# NOTE(review): the notebook title mentions French tests while this package name suggests an English model — confirm.
import en_core_web_sm
nlp = en_core_web_sm.load()


# ----------------
# Configuration settings
# ----------------

# Path to the home directory as a plain string; supposed to work for both Win and Mac
home = str(Path.home())
accept_media_type = 'application/json'

# Reference "date retrieved" value shared by all statements, derived from the current UTC time
whole_time_string_z = datetime.datetime.utcnow().isoformat() # form: 2019-12-05T15:35:04.959311
# Keep only the calendar-date part that precedes the 'T' separator
dateZ = whole_time_string_z.partition('T')[0] # form 2019-12-05
# Midnight of that date, in the form provided by Wikidata, without leading +
ref_retrieved = 'T'.join((dateZ, '00:00:00Z')) # form 2019-12-05T00:00:00Z

# ----------------
# Utility functions
# ----------------

def generate_utc_date():
    """Return the current UTC calendar date as an ISO 8601 string, e.g. '2019-12-05'."""
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # yields the same calendar date without the deprecation warning.
    return datetime.datetime.now(datetime.timezone.utc).date().isoformat()

def remove_parens(string):
    """Return the text that precedes the first '(' in string, stripped of surrounding whitespace.

    When string contains no '(', the whole string is returned stripped.
    """
    before_paren, _separator, _remainder = string.partition('(')
    return before_paren.strip()

def remove_description(string):
    """Return the text inside the first pair of parentheses in string, stripped.

    The result is the segment that follows the first '(' and ends at the next
    ')' or '(' (whichever comes first), or at the end of the string.
    Returns '' when string is not a str or contains no '('.
    """
    # Explicit guards replace the former bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit and hid unrelated programming errors.
    if not isinstance(string, str):
        return ''
    pieces = string.split('(')
    if len(pieces) < 2:
        # No opening parenthesis, so there is no description to extract
        return ''
    return pieces[1].split(')')[0].strip()


AttributeError: module 'numpy' has no attribute 'int'.
`np.int` was a deprecated alias for the builtin `int`. To avoid this error in existing code, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.
The aliases was originally deprecated in NumPy 1.20; for more details and guidance see the original release note at:
    https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations

# Load data

These data need to be loaded before running any of the following cells

In [None]:
filename = 'rawTextSentHolly.txt'
# Decode explicitly as UTF-8 (as the other open() calls in this notebook do) so that
# non-ASCII characters are read identically on every platform instead of depending on the locale default.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open(filename, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Split the text by line
lines = raw_text.split('\n')

# See https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da 
# for codes used in output

## NLTK

NOTE: seems to depend very heavily on capitalization.

In [None]:
output_list = []

# Walk through the first 20 lines of the loaded text, printing every intermediate NLTK stage
for line in lines[:20]:
    print('raw text:', line)
    # https://stackoverflow.com/questions/31836058/nltk-named-entity-recognition-to-a-python-list
    tokens = nltk.word_tokenize(line)
    print('tokens:', tokens)
    tagged_tokens = nltk.pos_tag(tokens)
    print('tagged:', tagged_tokens)
    named_entity_chunks = nltk.ne_chunk(tagged_tokens)
    print('NE chunks:', named_entity_chunks)

    ne_list = []
    for chunk in named_entity_chunks:
        # Items without a label() are not named-entity chunks; skip them
        if not hasattr(chunk, 'label'):
            continue
        # A chunk is an iterable of (word, noun_descriptor) tuples; join the words with spaces
        ne_string = ' '.join(word_and_tag[0] for word_and_tag in chunk)
        ne_list.append({'ne_label': chunk.label(), 'ne_string': ne_string})

        print(chunk.label(), ne_string)
    print('NE list:', json.dumps(ne_list, indent = 2))
    print('------------------------')
    

In [None]:
# Run this cell if you want a diagram of the NER chunks
# It will open in a separate window that must be closed before any cell can be run again.
# Sometimes it opens under other windows and you must click on its icon in the dock to make
# it come to the front.
# `named_entity_chunks` is whatever the NLTK cell above assigned last, i.e. the chunks of the last line it processed.
named_entity_chunks.draw()

## Spacy 

Results were fairly similar between the cleaned up capitalization and the raw label strings

That makes me think that Spacy is relatively insensitive to capitalization.

In [None]:
# NOTE(review): `works` is only assigned in the driver cell of the "Combined code" section
# (works = read_dict(filename)) — confirm it has been loaded before running this cell.
output_list = []
for work in works[3361:3362]:
    label_en = work['label_en']
    print(label_en)
    work_dict = {'qid': work['qid'], 'label_en': label_en, 'inventory_number': work['inventory_number']}

    # Run Spacy twice: first on the label as written, then on its lower-cased form.
    # results were somewhat worse when using all lower case
    # https://www.analyticsvidhya.com/blog/2021/06/nlp-application-named-entity-recognition-ner-in-python-with-spacy/
    for heading, text in ((None, label_en), ('converted to lower case:', label_en.lower())):
        if heading is not None:
            print(heading)
        result = nlp(text)
        for entity in result.ents:
            print(entity.text, entity.label_)
        print()

    print('-----------')
    


## Parallel Dots API

NOTE: Seems to be relatively insensitive to case

Note: If you don't have a Parallel Dots API key, you can't run this test.

Cleaned test set:

In [None]:
# Note: ParallelDots is a commercial product with a free tier. But it requires an account and API key to use.
# If you don't have an account, comment out this section.
# NOTE(review): `load_credential`, `paralleldots` and `dots_sleep` are not defined or imported in any
# cell visible here, and `works` is only assigned in the "Combined code" driver cell — confirm.

key = load_credential('paralleldots_api_key.txt', 'home')
paralleldots.set_api_key(key)

output_list = []
for work in works[3361:3362]:
    print(work['label_en'])
    # Copy the identifying fields, then attach the raw API response under 'named_entities'
    work_dict = {field: work[field] for field in ('qid', 'label_en', 'inventory_number')}
    named_entities = paralleldots.ner(work['label_en'])
    work_dict['named_entities'] = named_entities
    output_list.append(work_dict)
    # Pause between requests
    sleep(dots_sleep)
out_text = json.dumps(output_list, indent = 2)
print(out_text)


In [None]:
# lower case test
# Same request as the cleaned test, except that the label is lower-cased before it is sent to the API.
# NOTE(review): relies on `paralleldots`, `works` and `dots_sleep` already existing in the kernel — confirm.
output_list = []
for work in works[3361:3362]:
    print(work['label_en'])
    work_dict = {field: work[field] for field in ('qid', 'label_en', 'inventory_number')}
    lowered_label = work['label_en'].lower()
    named_entities = paralleldots.ner(lowered_label)
    work_dict['named_entities'] = named_entities
    output_list.append(work_dict)
    # Pause between requests
    sleep(dots_sleep)
out_text = json.dumps(output_list, indent = 2)
print(out_text)


Try Stanza, NER by Stanford

## Combined code 

This script is the final product putting together the test scripts above. The configuration cell must be run before this one, but that's the only one.

In [None]:
# WARNING: there is a daily limit for the Parallel Dots free tier, so don't run this script over and over
# or with a large dataset without commenting out the Parallel Dots part of the code
# Free tier daily limits are 2000 hits, rate limit 60 hits per minute. NOTE: the limit is
# hits, not API calls. So there is no easily predictable way to know how many iterations will be run.

def _nltk_entities(text):
    """Return the NLTK named entities found in text as a list of {'label', 'string'} dicts."""
    # https://stackoverflow.com/questions/31836058/nltk-named-entity-recognition-to-a-python-list
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))
    entities = []
    for chunk in nltk.ne_chunk(tagged_tokens):
        # Only named-entity chunks have a label(); each one iterates as (word, noun_descriptor) tuples
        if hasattr(chunk, 'label'):
            entities.append({'label': chunk.label(), 'string': ' '.join(leaf[0] for leaf in chunk)})
    return entities


def _spacy_entities(text):
    """Return the Spacy named entities found in text as a list of {'label', 'string'} dicts."""
    # See https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
    # for codes used in output by Spacy
    return [{'label': entity.label_, 'string': entity.text} for entity in nlp(text).ents]


def perform_ner_analysis(works):
    """Run NLTK, Spacy and Parallel Dots NER over works, resuming from the saved checkpoint.

    The index of the first work to process is read from 'start_work.txt' (0 when the
    file is missing or unusable). Processing stops at the end of the list or as soon as
    the Parallel Dots response cannot be converted (e.g. an error message returned once
    the daily limit is exceeded). Results are written to
    'named_er_<start>_<end>.json' and the checkpoint file is updated to <end>.

    Parameters:
        works: sequence of dicts, each with the keys 'qid', 'label_en' and 'inventory_number'.

    Returns:
        True when at least one work was processed and the output was written, otherwise False.
    """
    try:
        with open('start_work.txt', 'rt', encoding='utf-8') as fileObject:
            start_work = int(fileObject.read())
    except (OSError, ValueError):
        # No usable checkpoint (missing/unreadable file or non-integer content): start at the first work
        start_work = 0

    output_list = []
    # work_number counts up from the session's starting record. After the loop it is the index of
    # the next work to process, which is what gets stored as the checkpoint.
    work_number = start_work
    for work in works[start_work:]:
        work_number += 1
        print('raw text:', work['label_en'])
        work_dict = {'qid': work['qid'], 'label_en': work['label_en'], 'inventory_number': work['inventory_number']}

        work_dict['nltk'] = _nltk_entities(work['label_en'])
        work_dict['spacy'] = _spacy_entities(work['label_en'])

        # Parallel Dots API
        try:
            named_entities = paralleldots.ner(work['label_en'])
        except Exception:
            # avoid script crashes when there's an error with the API; the work is recorded
            # with an error marker and is not retried
            print('API error for', work['qid'], work['label_en'])
            work_dict['parallel_dots'] = [{'label': 'error', 'string': '', 'confidence_score': ''}]
            output_list.append(work_dict)
            continue # go to start of loop and do next item

        # Convert the Parallel Dots JSON structure to match the others
        try: # fails if the API data is an error message instead of having an 'entities' key (daily limit)
            work_dict['parallel_dots'] = [
                {'label': entity['category'], 'string': entity['name'], 'confidence_score': entity['confidence_score']}
                for entity in named_entities['entities']
            ]
        except Exception:
            # This work was not completed, so it must not be counted: the next session resumes with it.
            # The decrement happens only here; doing it unconditionally after the loop dropped the
            # last work whenever the whole list was processed without hitting the limit.
            work_number -= 1
            break # the loop terminates and goes on to the output code

        sleep(dots_sleep) # This is a delay to prevent hitting the API too frequently and getting blocked

        # Append all of the collected data to the accumulation list
        output_list.append(work_dict)

    if work_number > start_work: # Do not output unless at least one work was processed.
        output_text = json.dumps(output_list, indent = 2)

        # the file name includes the first record number and the record number after the last record (i.e. Python range style)
        out_file_path = 'named_er' + '_' + str(start_work) + '_' + str(work_number) + '.json'
        with open(out_file_path, 'wt', encoding='utf-8') as file_object:
            file_object.write(output_text)

        with open('start_work.txt', 'wt', encoding='utf-8') as fileObject:
            fileObject.write(str(work_number)) # Next time, start with the record after the last successfully analyzed record
        success = True
        print('done')
    else:
        print('No works analyzed.')
        success = False

    return success


In [None]:
# Note: ParallelDots is a commercial product with a free tier. But it requires an account and API key to use.
# NOTE(review): `load_credential`, `read_dict` and `paralleldots` are not defined or imported in any
# cell visible here — confirm they are provided before running this cell.
key = load_credential('paralleldots_api_key.txt', 'home')
paralleldots.set_api_key(key)

filename = 'works_multiprop.csv'
works = read_dict(filename)

# Check once an hour; the analysis is attempted only when the UTC date is later than the date
# stored by the last successful run. This cell never returns: interrupt the kernel to stop it.
while True: # infinite loop
    print('Time checked:', datetime.datetime.utcnow().isoformat())

    try:
        # Look to see the last date the script was run
        with open('last_run.txt', 'rt', encoding='utf-8') as fileObject:
            # strip() tolerates a trailing newline left by hand-editing the file
            date_last_run = fileObject.read().strip()
    except (OSError, ValueError):
        # Only a missing/unreadable file falls back to the default; a bare `except:` here would
        # also swallow KeyboardInterrupt and make the loop harder to stop.
        date_last_run = '2021-01-01' # a date in the past
    print('Date last run:', date_last_run)

    date_now_utc = generate_utc_date()
    print('UTC date now is:', date_now_utc)

    # ISO dates (YYYY-MM-DD) sort chronologically when compared as strings
    if date_now_utc > date_last_run:
        success = perform_ner_analysis(works)
        
        if success:
            with open('last_run.txt', 'wt', encoding='utf-8') as fileObject:
                # If analysis occurred successfully, update the last_run date in the file with today's date
                fileObject.write(generate_utc_date())

    print()
    # wait an hour before checking again
    sleep(3600)
