# Code to set up NLTK

These are one-time actions that need to be done to install packages and data

In [None]:
import nltk

# One-time download of the NLTK data this notebook relies on (tokenizer,
# part-of-speech tagger, named-entity chunker and its word list).
# The first four ids are the resources older NLTK releases load; the `*_tab` /
# `*_eng` ids are the replacements that NLTK 3.9 and later look up for the same
# components. Requesting both sets keeps the notebook working on either.
NLTK_RESOURCES = (
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
    'maxent_ne_chunker_tab',
)
for resource in NLTK_RESOURCES:
    nltk.download(resource)

# Fine arts gallery chunking tests

Based on Chapter 7 of Natural Language Processing with Python
https://www.nltk.org/book/ch07.html

NOTE: the NLTK setup must be done before running this notebook!

## Function section

This needs to be run before any of the other cells

In [None]:
import csv
import io
import json

import nltk
import requests
from nltk import RegexpParser
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

def read_dict(filename):
    """Load a local CSV file as a list of row dictionaries.

    The file is opened as UTF-8 text and parsed with ``csv.DictReader``, so the
    first row of the file supplies the keys of every dictionary.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    list of dict
        One dictionary per data row, in file order.
    """
    # newline='' lets the csv module do its own line-ending handling
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        return list(csv.DictReader(csv_file))

def url_csv_to_list_of_dicts(url, timeout=30):
    """Download a CSV file and return its rows as a list of dictionaries.

    The response body is parsed with ``csv.DictReader``, so the first row of
    the CSV supplies the keys of every dictionary.

    Parameters
    ----------
    url : str
        Address of the CSV file to retrieve.
    timeout : float, optional
        Value passed to ``requests.get`` as its ``timeout`` argument so that a
        stalled server cannot block the notebook indefinitely.

    Returns
    -------
    list of dict
        One dictionary per data row, in file order.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page as CSV.
    response.raise_for_status()
    # Feed the csv module the raw text with its line endings intact
    # (newline=''); splitting into lines first would drop line breaks that
    # occur inside quoted fields.
    with io.StringIO(response.text, newline='') as text_stream:
        return list(csv.DictReader(text_stream))

# Confirmation that this cell has run and the helper functions are defined
print('Done!')

## Load data

Loads gallery metadata (Wikidata Q ID, label, description).

In [None]:
# Default: load the gallery works table from a raw GitHub gist URL.
# `works` becomes a list with one dictionary per CSV row.
url = 'https://gist.githubusercontent.com/baskaufs/f76c243a4a4ad94d0dd00cdcaca6d8df/raw/3410f020df72cdbdf65d81fed8d0c344c66e7e5b/gallery_works.csv'
works = url_csv_to_list_of_dicts(url)

# Alternative: load the table from a local CSV file instead.
# Uncomment the two lines below (and comment out the two lines above) to use it.
#filename = 'works_multiprop.csv'
#works = read_dict(filename)


Select a work and examine the data structure

In [None]:
# Row index of the work to inspect. Some works to try: 36, 6293, 6560, 6789
work_number = 6293

# Show the complete record first, then just its English label
selected_work = works[work_number]
print(json.dumps(selected_work, indent=2))
print()
print(selected_work['label_en'])

## Perform tokenization and tagging

See https://www.nltk.org/book/ch03.html about tokenization.

See https://www.nltk.org/book/ch05.html about tagging.

In [None]:
# Split the English label of the selected work into tokens
work = works[work_number]
label_text = work['label_en']
tokens = word_tokenize(label_text)
print('tokens:', tokens)


In [None]:
# Attach a part-of-speech tag to every token from the previous cell
tagged_tokens = pos_tag(tokens)
print('tagged:', tagged_tokens)


## Chunking text

The commented out `grammar` assignments offer alternative sets of rules for doing the chunking.

See https://www.h2kinfosys.com/blog/part-of-speech-tagging-chunking-with-nltk/ for codes used to tag the tokens

In [None]:
# Alternative grammars are kept below as comments. To try one, uncomment it
# and comment out the active `grammar` assignment.

# grammar= """chunk:{<NN.?>*<VBD.?>*<JJ.?>*<CC>?}""" # test pattern

# grammar = "NP: {<DT>?<JJ>*<NN>}" # noun phrase detection

# Active grammar: an optional determiner, any number of tags starting with JJ,
# then one or more tags starting with NN.
grammar = "NP: {<DT>?<JJ.*>*<NN.*>+}" # modified NP detection

# NOTE(review): in the first rule below, `.*` sits outside the angle brackets
# (`<NN>.*`); `<NN.*>` or plain `<NN>` may have been intended -- confirm.
#grammar = r"""
#  NP: {<DT|PP\$>?<JJ>*<NN>.*}   # chunk determiner/possessive, adjectives and noun
#      {<NNP>+}                # chunk sequences of proper nouns
#"""
  	
# Chinking example:
#grammar = r"""
#  NP:
#    {<.*>+}          # Chunk everything
#    }<VBD|IN>+{      # Chink sequences of VBD and IN
#  """

# Build the chunk parser from the selected grammar and display it
chunker = RegexpParser(grammar)
print("Chunker summary:", chunker)


In [None]:
# Apply the chunk grammar to the tagged tokens.
# Requires the tagging cell and the grammar cell to have been run first.
chunks = chunker.parse(tagged_tokens)
print("chunks:",chunks)


When you run the following cell in a local Jupyter notebook, the diagram pops up in a separate window.  That window must be closed to stop the cell from running before you can re-run the chunking cell.

Sometimes the popup is below other windows and you may need to click on the "python" icon in the task bar to bring it to the front.

In [None]:
# Draw the chunk tree produced by the chunking cell above
chunks.draw()

# Named entity recognition

Named entity recognition seems to be heavily dependent on capitalization, so it is not that great for titles.

Code hacked from https://stackoverflow.com/questions/31836058/nltk-named-entity-recognition-to-a-python-list

In [None]:
# Requires the "Perform tokenization and tagging" cell to be run first.
# Try with row 6293
named_entity_chunks = nltk.ne_chunk(tagged_tokens)
print('NE chunks:', named_entity_chunks)
print()

# Collect one record per recognized entity. Only the elements that carry a
# `label` attribute are treated as named-entity chunks; the rest are skipped.
ne_list = []
for chunk in named_entity_chunks:
    if not hasattr(chunk, 'label'):
        continue

    entity_type = chunk.label()
    # A chunk is an iterable of (word, noun_descriptor) tuples; join the words
    ne_string = ' '.join(word_and_tag[0] for word_and_tag in chunk)
    ne_list.append({'ne_label': entity_type, 'ne_string': ne_string})

    # Print results for humans to see
    print(entity_type, ne_string)

# List of dictionaries format for subsequent use or output as a CSV
print()
print('NE list:', ne_list)


In [None]:
# Run this cell if running locally and you want a diagram of the NER chunks.
# It will open in a separate window that must be closed before any cell can be run again.
# Sometimes it opens under other windows and you must click on its icon in the dock to
# bring it to the front.
named_entity_chunks.draw()