# Code to set up NLTK

On a local Jupyter notebook, these are one-time actions that need to be done to install packages and data. On Colab, you probably need to do them every time you use the notebook in a new session.

In [None]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

!pip install langdetect

print('Done!')


# Fine arts gallery chunking tests

Based on Chapter 7 of Natural Language Processing with Python
https://www.nltk.org/book/ch07.html

NOTE: the NLTK setup must be done before running this notebook!

## Function section

This needs to be run before any of the later cells

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import RegexpParser

import csv
import json
import requests

from langdetect import detect_langs

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Read a UTF-8 CSV file. Input is a file path, output is a list with one dictionary per data row, keyed by the header row."""
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows while the file is still open; DictReader reads lazily.
        return list(csv.DictReader(csv_file))

# Write list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write dictionaries to a UTF-8 CSV file. Inputs are the rows (an iterable of dictionaries), the output path,
    and the list of column names, which sets both the header row and the column order."""
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_writer.writeheader()
        csv_writer.writerows(table)

# read a CSV from a URL into a list of dictionaries
def url_csv_to_list_of_dicts(url):
    """Download a CSV file and parse it. Input is the URL string, output is a list with one dictionary per
    data row, keyed by the header row.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status.
    """
    response = requests.get(url)
    # Without this check the body of an error page (e.g. a 404) would be parsed as if it were CSV data.
    response.raise_for_status()
    # DictReader accepts any iterable of lines, so the downloaded text is split into lines first.
    return list(csv.DictReader(response.text.splitlines()))

def words_in_phrase(piece):
    """Tokenizes a phrase, then removes non-word punctuation, etc. and counts words. Input a string, output an integer.

    Every '.', 's' and '’' token is excluded from the count, not just the first one of each kind, so phrases
    with several periods or possessives are not over-counted.
    """
    # Tokens that are not counted as words. The 's' and '’' entries are presumably the pieces the tokenizer
    # produces for a possessive written with a curly apostrophe.
    non_word_tokens = {'.', 's', '’'}
    return sum(1 for token in nltk.word_tokenize(piece) if token not in non_word_tokens)

def detect_language(string):
    """Runs language detection with langdetect's detect_langs. Input is a string, output is a tuple of
    (language code, confidence).

    The code and confidence come from the first candidate that detect_langs returns. The code is returned
    exactly as reported, so codes longer than two letters (e.g. 'zh-cn') are preserved.
    When detection fails, the result is ('zxx', 0.0).
    """
    try:
        best_guess = detect_langs(string)[0]
        # Read the attributes directly instead of slicing the 'code:probability' text form at fixed
        # positions, which only works for two-letter codes.
        lang = best_guess.lang
        confidence = float(best_guess.prob)
    except Exception: # exceptions occur when no info to decide, e.g. numbers
        lang = 'zxx'
        confidence = float(0)
    return lang, confidence

def treat_as_single_string(pieces):
    """Decide whether a pair of phrases should be kept together as one string instead of being treated as
    text in two different languages. Input a list of two phrase strings, output a boolean (True = keep together).

    Reads the module-level variables precision_cutoff and phrase_length_cutoff, which must be assigned
    before this function is called. Prints a message for every screen that fails, followed by the list of
    detected language codes.
    """
    keep_together = False
    detected_codes = []
    for phrase in pieces:
        code, confidence = detect_language(phrase.strip())
        detected_codes.append(code)

        # A detection below the minimum confidence cannot support a "different languages" verdict.
        if confidence < precision_cutoff:
            print('fail precision cutoff')
            keep_together = True
        # Neither can a phrase with fewer words than the minimum.
        if words_in_phrase(phrase) < phrase_length_cutoff:
            print('fail word length cutoff')
            keep_together = True

    # Whatever the screens above decided, two phrases assigned the same language are not a translation pair.
    first_code, second_code = detected_codes[0], detected_codes[1]
    if first_code == second_code:
        print('fail same language test')
        keep_together = True
    print(detected_codes)
    return keep_together


# Printed once the cell has run to the end, i.e. after all the definitions above have been executed.
print('Done!')

## Load data

Loads gallery metadata (Wikidata Q ID, label, description).

In [None]:
# Use this code to load a CSV table from a URL
#url = 'https://gist.githubusercontent.com/baskaufs/f76c243a4a4ad94d0dd00cdcaca6d8df/raw/3410f020df72cdbdf65d81fed8d0c344c66e7e5b/gallery_works.csv'
#works = url_csv_to_list_of_dicts(url)

# Use this code to load a CSV table from local file
#filename = 'works_multiprop.csv'
#works = read_dict(filename)

# Read preprocessed JSON data from secret Gist
url = 'https://gist.githubusercontent.com/baskaufs/054662389fcd08f107c01b06cd024338/raw/bfe474d67708da64033ae61e95d113cc3cbc2ba6/3d_parts.json'
response_object = requests.get(url)
# Stop on a 4xx/5xx reply; otherwise the error page would reach json.loads and fail with a misleading parse error.
response_object.raise_for_status()
file_text = response_object.text
works = json.loads(file_text)

# Show the first record as a quick check of what was loaded
print(works[0])

print('Done!')


Select a work and examine the data structure

In [None]:
# Index of the record to inspect and the field whose text the next cells tokenize
field_name = 'object_description'
work_number = 1

# Whole record as indented JSON, then a blank line, then just the selected field
print(json.dumps(works[work_number], indent=2), end='\n\n')
print(works[work_number][field_name])

## Perform tokenization and tagging

See https://www.nltk.org/book/ch03.html about tokenization.

See https://www.nltk.org/book/ch05.html about tagging.

In [None]:
# Split the selected field of the selected work into word and punctuation tokens
work = works[work_number]
description_text = work[field_name]
tokens = word_tokenize(description_text)
print('tokens:', tokens)


In [None]:
# Pair every token from the previous cell with its part-of-speech tag: a list of (token, tag) tuples
tagged_tokens = pos_tag(tokens)
print('tagged:', tagged_tokens)


## Extract nouns from object_description

This script puts together bits from the code above and encloses them in a loop.

In [None]:
# We discovered text segments beginning with these words that did not belong in the main subject description.
# So we will remove any text following these words.
splitter_words = [' of ', ' with ', ' in ', ' (2 parts)']

# Value stored in the 'noun' field when the tagger finds no noun in the phrase.
noun_not_detected = '* Noun not detected!'

field_name = 'object_description'

# Distinct nouns found across all works. A set keeps the "already seen?" check cheap however many works there are.
distinct_nouns = set()

# The loop uses its own variable names (record, phrase_tokens, phrase_tagged) so that it does not overwrite
# work_number, tokens and tagged_tokens from the single-work cells above, which later cells still read.
for record in works:
    # Start from the object_description phrase previously extracted based on some rules.
    # Convert to lower case, since NLTK seems overly dependent on capitalization to categorize parts of speech
    phrase = record[field_name].lower()

    # Check for each of the possible words that begin unwanted text segments and keep only the text before it.
    # If the subject description does not contain the split_word, nothing happens.
    for split_word in splitter_words:
        phrase = phrase.split(split_word)[0].strip() # Strip extra whitespace if any from ends.

    # Often a parenthetical expression translates a non-English term to an English one. So this code extracts the
    # text inside the first set of parentheses and discards other text.
    # This is sometimes wrong when the non-English translation is in the parentheses or if it's just an English
    # parenthetical expression following an English phrase. These will have to be detected and corrected manually.
    pieces = phrase.split('(') # Split label into parts before and after each left parenthesis
    if len(pieces) >= 2: # Don't do anything if no parentheses.
        # pieces[1] is the text after the first left parenthesis (anything from a second set of parentheses
        # is ignored); keep only what comes before its right parenthesis.
        phrase = pieces[1].split(')')[0]

    phrase_tokens = nltk.word_tokenize(phrase)
    phrase_tagged = nltk.pos_tag(phrase_tokens)

    # Danni observed that if there were consecutive nouns, the last one is nearly always what we want.
    # NLTK frequently mis-identifies adjectives as nouns, but since adjectives usually precede nouns in English,
    # using the rule of extracting the last "noun" (according to NLTK) nearly always results in the real noun.
    # So the strategy here is to step through the tagged tokens and look at the part of speech (item 1 in the tuple)
    # to see if its first two letters are "NN", which is used for all noun tags. If so, the actual word (item 0,
    # the token) is saved in a list of nouns. Finally, the last noun in the list is selected as the primary noun.
    noun_list = [token for token, tag in phrase_tagged if tag[:2] == 'NN']
    if noun_list:
        result = noun_list[-1] # list item -1 is the last item.
    else:
        result = noun_not_detected
    record['noun'] = result

    if result != noun_not_detected:
        distinct_nouns.add(result)

# Alphabetical list of the distinct nouns, one per line
summary_noun_list = sorted(distinct_nouns)
with open('noun_list.txt', 'wt', encoding='utf-8') as file_object:
    file_object.write('\n'.join(summary_noun_list))

# Each column is listed once; repeating 'noun' would write the same column twice.
fieldnames = ['qid', 'type', 'noun', 'label', 'object_description', 'form_description', 'design_description', 'includes']
write_dicts_to_csv(works, '3d_parts.csv', fieldnames)

print('done')

# REMAINDER NOT USED

The remaining cells weren't used in the project but were kept here for legacy reasons.

## Language detection test

This was a good idea, but the phrases are too short for accurate detection. Usually neither of the detected languages was English.

In [None]:
test_string = 'Chashaku (tea scoop)'

# Thresholds that treat_as_single_string() reads as module-level variables
precision_cutoff = 0.5
phrase_length_cutoff = 1

translations_list = [] # Create a list to hold the translations

# Split at the left parentheses and keep at most two parts: the text before the first parenthesis and the text
# that follows it. Anything from a second set of parentheses onward is thrown away.
pieces = test_string.split('(')[:2]

# Code to decide whether parenthetical text is a translation
if len(pieces) == 2: # Don't analyze if no parentheses
    before_parenthesis, after_parenthesis = pieces
    # Remove anything after the right parenthesis for the parenthetical phrase
    pieces = [before_parenthesis, after_parenthesis.split(')')[0]]
    if treat_as_single_string(pieces):
        # If it fails the translations screening test, treat it as a single string
        pieces = [test_string]

print(pieces)



## Chunking text

The commented out `grammar` assignments offer alternative sets of rules for doing the chunking.

See https://www.h2kinfosys.com/blog/part-of-speech-tagging-chunking-with-nltk/ for codes used to tag the tokens

In [None]:
# Grammar in use: an NP chunk is an optional determiner, then any number of tags starting with JJ,
# then one or more tags starting with NN.
grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}" # modified NP detection

# Alternative rule sets. To try one, uncomment it and comment out the assignment above.

# grammar= """chunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}""" # test pattern

# grammar = "NP: {<DT>?<JJ>*<NN>}" # noun phrase detection

#grammar = r"""
#  NP: {<DT|PP\$>?<JJ>*<NN>.*}   # chunk determiner/possessive, adjectives and noun
#      {<NNP>+}                # chunk sequences of proper nouns
#"""

# Chinking example:
#grammar = r"""
#  NP:
#    {<.*>+}          # Chunk everything
#    }<VBD|IN>+{      # Chink sequences of VBD and IN
#  """

# Build the chunker from the selected grammar and show its rules
chunker = nltk.RegexpParser(grammar)
print("Chunker summary:", chunker)


In [None]:
# Apply the chunker built in the previous cell to the current tagged_tokens list and print the result.
chunks = chunker.parse(tagged_tokens)
print("chunks:",chunks)


**Does not work on Colab!**

When you run the following cell on a local Jupyter notebook, the diagram pops up in a separate window.  That window must be closed to stop the cell from running, so that you are able to re-run the chunking cell.

Sometimes the popup is below other windows and you may need to click on the "python" icon in the task bar to bring it to the front.

In [None]:
# Draw the chunk tree. Per the notes for this cell, the diagram opens in a separate window and the cell
# keeps running until that window is closed.
chunks.draw()

# Named entity recognition

Seems to be heavily dependent on capitalization, so not that great for titles

Code hacked from https://stackoverflow.com/questions/31836058/nltk-named-entity-recognition-to-a-python-list

In [None]:
# Requires the "Perform tokenization and tagging" cell to be run first.
# Try with row 6293
named_entity_chunks = nltk.ne_chunk(tagged_tokens)
print('NE chunks:', named_entity_chunks)
print()

# Collect one {'ne_label': ..., 'ne_string': ...} dictionary per named entity
ne_list = []
for subtree in named_entity_chunks:
    # Only items that carry a label are treated as named entities; everything else is skipped.
    if not hasattr(subtree, 'label'):
        continue

    # A labeled chunk is some kind of iterable of tuples, each containing (word, noun_descriptor).
    # Join the words of all its tuples, separated by single spaces.
    entity_text = ' '.join(word_and_tag[0] for word_and_tag in subtree)
    ne_list.append({'ne_label': subtree.label(), 'ne_string': entity_text})

    # Print results for humans to see
    print(subtree.label(), entity_text)

# List of dictionaries format for subsequent use or output as a CSV
print()
print('NE list:', ne_list)


**Does not work on Colab!**

In [None]:
# Run this cell if running locally and you want a diagram of the NER chunks.
# It will open in a separate window that must be closed before any cell can be run again.
# Sometimes it opens under other windows and you must click on its icon in the dock to make
# it come to the front.
named_entity_chunks.draw()