In [None]:
#! pip install openai

In [3]:
# Version identifier and modification date recorded for this notebook.
script_version, version_modified = "0.0.1", "2024-03-18"

# -------------------
# Imports
# -------------------

import json
import re
import time
from pathlib import Path

import pandas as pd
from openai import OpenAI

# -------------------
# Global variables
# -------------------

# Base prompt prepended to every NER request. It lists the entity types to
# extract, the output keys to use, and one worked example.
# The OBJECT key is described as an *object* entity so that the key legend
# agrees with the entity list and with the example ("OBJECT": ["guitar"]).
# NOTE(review): the prompt says titles are comma-delimited, while the caller
# sends the output of Series.to_csv(index=False, header=False) -- confirm the
# wording matches what is actually sent.
BASE_PROMPT = """Below is a CSV file that contains titles of artworks. Each title is delineated by a comma. For each artwork title, give me the list of:
- object named entity
- location named entity
- person named entity
- genre named entity
- miscellaneous named entity.
For each title, format the output as shown in the example given below with the following keys:
- TITLE for the title of the artwork
- OBJECT for object named entity
- LOCATION for location named entity
- PERSON for person named entity
- GENRE for genre named entity
- MISCELLANEOUS for miscellaneous named entity.
Example below: 
- Title 1: "Portrait of the Artist Pablo Picasso with a Guitar in Paris"
- Output:
{
    "TITLE": "Portrait of the Artist Pablo Picasso with a Guitar in Paris",
    "OBJECT": ["guitar"],
    "LOCATION": ["Paris"],
    "PERSON": ["Pablo Picasso"],
    "GENRE": ["portrait"],
    "MISCELLANEOUS": ["artist"]
}
"""

# Default keyword arguments for the chat-completions request
# (model name, sampling temperature, and response token cap).
OPENAI_QUERY_PARAMS = dict(
    model="gpt-3.5-turbo",
    temperature=0,
    max_tokens=1024,
)

# Name of the text file, located in the user's home directory, that holds the OpenAI API key.
openai_key_filename = 'open_ai_api_key_text_analysis.txt'

# Home directory as a string; it is reused further down to build data paths.
home = str(Path.home())

# Read the key and strip surrounding whitespace / newlines.
with open(Path(home) / openai_key_filename, 'r') as key_file:
    api_key_string = key_file.read().strip()

# Shared OpenAI client used for every request in this notebook.
CLIENT = OpenAI(api_key=api_key_string)


In [5]:
# -------------------
# Function definitions
# -------------------

def ask_openai(prompt: str, base_prompt=BASE_PROMPT, openai_query_params=OPENAI_QUERY_PARAMS) -> str:
    """Run named entity recognition on ``prompt`` through the OpenAI chat-completions API.

    Args:
        prompt: Text to analyse. It is appended directly after ``base_prompt``
            to form the user message.
        base_prompt: Instructions placed in front of ``prompt``.
        openai_query_params: Keyword arguments passed through unchanged to
            ``CLIENT.chat.completions.create``.

    Returns:
        The message content of the first choice in the response. The prompts
        ask the model for JSON-formatted entities.
    """
    system_message = {
        "role": "system",
        "content": "You are a smart and intelligent Named Entity Recognition (NER) system whose job is to extract entities from the title of an artwork. You will look for people, location names, and common objects. You will also look for genres of visual art and any miscellaneous entities. The labels in your output should not include words that are not in the title.",
    }
    user_message = {
        "role": "user",
        "content": base_prompt + prompt,
    }

    completion = CLIENT.chat.completions.create(
        messages=[system_message, user_message],
        **openai_query_params
    )

    first_choice = completion.choices[0]
    return first_choice.message.content


In [6]:
# Smoke test: send a short, non-artwork description through the NER helper.
example_text = (
    "Vanderbilt University is a private research university in Nashville, Tennessee. "
    "It was founded in 1873."
)
example_entities = ask_openai(example_text)
print(example_entities)

{
    "TITLE": "Vanderbilt University is a private research university in Nashville, Tennessee. It was founded in 1873.",
    "OBJECT": ["university"],
    "LOCATION": ["Nashville", "Tennessee"],
    "PERSON": [],
    "GENRE": ["research"],
    "MISCELLANEOUS": ["Vanderbilt University", "1873"]
}


In [None]:
import sys
from SPARQLWrapper import SPARQLWrapper, JSON

# Public Wikidata SPARQL endpoint used for all label lookups.
endpoint_url = "https://query.wikidata.org/sparql"


def _escape_sparql_literal(text):
    """Escape ``text`` so it is safe inside a double-quoted SPARQL string literal."""
    return (
        str(text)
        .replace("\\", "\\\\")  # backslash first, so later escapes are not doubled
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )


def get_results(endpoint_url, tag):
    """Count Wikidata items that carry ``tag`` as an English literal value.

    Only items that are the subject of an English Wikipedia article are
    counted. Note that ``?depicts`` in the query is an unbound SPARQL
    variable, so the literal may be the value of *any* property of the item.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        tag: Text to look up. It is escaped before being placed in the query,
            so quotes, backslashes and line breaks cannot break the query.

    Returns:
        The converted JSON response. The count is found at
        ``result["results"]["bindings"][0]["count"]["value"]``.
    """
    query = """SELECT (count(distinct ?item) as ?count) WHERE{{
      ?item ?depicts "{tag}"@en.
      ?article schema:about ?item .
      ?article schema:inLanguage "en" .
      ?article schema:isPartOf <https://en.wikipedia.org/>.
    }}""".format(tag=_escape_sparql_literal(tag))
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


NER on Artwork Titles

In [None]:
# Load the artwork metadata and the table of image URLs used for NER.
gallery_dir = Path(home) / 'GitHub' / 'vandycite' / 'gallery_buchanan'
title_pd = pd.read_csv(gallery_dir / 'works_multiprop.csv')
url_pd = pd.read_csv(gallery_dir / 'image_analysis' / 'object_localization_image_urls.csv')

# keep only the English label and the inventory number
title_pd = title_pd[['label_en', 'inventory_number']]

# keep only works whose inventory number matches an image row with status 'uploaded'
url_pd = url_pd[url_pd['status'] == 'uploaded']
title_pd = title_pd[title_pd['inventory_number'].isin(url_pd['accession_number'])]

# TODO: only the first 15 titles are sent to OpenAI (the current token limit
# might cut off the output otherwise).
# Rows without an English label are dropped first: a missing value would make
# the "Untitled" mask contain NaN and the `~` negation below would raise.
title_pd_slice = title_pd['label_en'].dropna().head(15)
title_pd_slice = title_pd_slice[~title_pd_slice.str.contains("Untitled", na=False)]

In [None]:
def _extract_json_objects(text):
    """Return every JSON object found in ``text``, in order of appearance.

    The model reply is scanned for ``{`` characters and each candidate is
    decoded with ``json.JSONDecoder.raw_decode``. This tolerates surrounding
    prose (e.g. "- Output:"), code fences, blank lines between objects, and a
    trailing newline -- all of which break a plain split on '}\\n'.
    """
    decoder = json.JSONDecoder()
    text = text or ''
    found = []
    start = text.find('{')
    while start != -1:
        try:
            obj, consumed = decoder.raw_decode(text[start:])
        except json.JSONDecodeError:
            # not a valid object at this brace; try the next one
            start = text.find('{', start + 1)
            continue
        found.append(obj)
        start = text.find('{', start + consumed)
    return found


# Send the titles (one CSV record per title) and parse the reply into a list of dicts.
ner_output = ask_openai(title_pd_slice.to_csv(index=False, header=False))
ner_output_json = _extract_json_objects(ner_output)

Checking NER Output in Wikidata Labels

In [None]:
def _flatten_entity_labels(entities):
    """Collect the labels of every key except 'TITLE' into one flat list.

    A value that is a bare string is kept as a single label (extending a list
    with a string would add its characters one by one); ``None`` is skipped.
    """
    labels = []
    for key, value in entities.items():
        if key == 'TITLE' or value is None:
            continue
        if isinstance(value, str):
            labels.append(value)
        else:
            labels.extend(value)
    return labels


# Build all rows first and create the DataFrame once, instead of enlarging it
# cell by cell. 'wikidata_item' and 'no_item' start as empty lists that the
# Wikidata lookup fills in later.
ner_records = [
    {
        'title': entities['TITLE'],
        'ner_output': _flatten_entity_labels(entities),
        'wikidata_item': [],
        'no_item': [],
    }
    for entities in ner_output_json
]
ner_output_df = pd.DataFrame(ner_records, columns=['title', 'ner_output', 'wikidata_item', 'no_item'])


In [None]:
# Look up every extracted label in Wikidata and file it under 'wikidata_item'
# (non-zero count) or 'no_item' (count of zero) for its row.
for row_idx in range(len(ner_output_df)):
    for entity_label in ner_output_df.at[row_idx, 'ner_output']:
        sparql_response = get_results(endpoint_url, entity_label)
        match_count = sparql_response["results"]["bindings"][0]["count"]["value"]
        target_column = 'no_item' if match_count == '0' else 'wikidata_item'
        ner_output_df.at[row_idx, target_column].append(entity_label)

In [None]:
def _join_ner_labels(labels, unique=False):
    """Turn a list of labels into a ', '-separated string.

    With ``unique=True`` duplicates are dropped, keeping the first occurrence
    and the original order. A value that is already a string is returned
    unchanged, so re-running this cell does not split it into characters.
    """
    if isinstance(labels, str):
        return labels
    if unique:
        labels = dict.fromkeys(labels)
    return ', '.join(str(label) for label in labels)


# keep only unique items in wikidata_item, then convert both list columns to strings
ner_output_df['wikidata_item'] = ner_output_df['wikidata_item'].map(
    lambda labels: _join_ner_labels(labels, unique=True)
)
ner_output_df['no_item'] = ner_output_df['no_item'].map(_join_ner_labels)

ner_output_df.head()
#ner_output_df.to_csv('title_ner_image_wikilabels.csv', index=False)

Object Detection on Google Vision Output

In [None]:

# Question sent to the vision model together with every image (basic prompt).
prompt = "What’s the main subject of this artwork?"
# Alternative, more specific prompt:
#prompt = 'State what is present in this image. Provide only a name for what is depicted, but do not provide details, such as the direction it is oriented.' # specific prompt

# Read the image URL table. Every column is read as a string and empty cells
# are kept as '' instead of being converted to NaN.
object_data = pd.read_csv(
    'object_localization_image_urls.csv',
    dtype=str,
    na_filter=False,
)

object_data.head()

In [None]:

# Ask the vision model about every image in object_data. A failed request is
# retried up to MAX_TRIES times in total; the previous `tries > 5` check
# allowed six attempts while reporting five.
MAX_TRIES = 5            # total attempts per image before it is skipped
RETRY_WAIT_SECONDS = 10  # pause between failed attempts

# The two system messages are the same for every image.
vision_system_messages = [
    {
        "role": "system",
        "content": "You are an intelligent and concise multi-modal model whose job is to identify what is depicted in an artwork."
    },
    {
        "role": "system",
        "content": "I will provide you with artworks that each depict an unknown subject. For each artwork, give me your three best guesses for what the main subject is. Do not consider the type of artwork; only focus on the person or object being depicted in the artwork. Each guess should be two words long."
    },
]

for index, row in object_data.iterrows():
    # NOTE(review): rows that already have a GPT description are NOT skipped;
    # every image is processed again on each run.
    print('Processing image', index + 1, 'of', len(object_data))
    # Get the time at the start of the request (retries and waits are included)
    start_time = time.time()

    image_url = row['image_url']

    response = None  # stays None when every attempt fails
    for attempt in range(1, MAX_TRIES + 1):
        try:
            response = CLIENT.chat.completions.create(
                model="gpt-4-vision-preview",
                messages=vision_system_messages + [
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": image_url,
                                },
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
            break
        except Exception as e:
            # Best effort: report the error and retry rather than aborting the whole run
            print(e)
            if attempt == MAX_TRIES:
                print(f'Error after {MAX_TRIES} tries. Skipping this image.')
                break
            print(f'Error, waiting {RETRY_WAIT_SECONDS} seconds.')
            time.sleep(RETRY_WAIT_SECONDS)
            print('Retrying.')

    if response is None:
        # Only save the response if the request was successful
        continue

    print(image_url)
    # Extract the response from the API
    gpt_description = response.choices[0].message.content
    print(gpt_description)
    total_tokens = response.usage.total_tokens

    # Elapsed time from the first attempt to the successful response
    elapsed_time = time.time() - start_time

    # Add the response to the dataframe
    object_data.at[index, 'gpt_description'] = gpt_description
    object_data.at[index, 'total_tokens'] = total_tokens
    object_data.at[index, 'elapsed_time'] = elapsed_time

    # Save the dataframe to a CSV file after each iteration in case it crashes
    object_data.to_csv('object_localization_image_urls1.csv', index=False)

print('Done.')
    

In [None]:
def _strip_leading_symbols(item):
    """Remove a leading run of non-word characters followed by a space (e.g. a '- ' bullet)."""
    return re.sub(r'^\W+ ', '', item)


# Load the saved GPT-4V responses. Every column is read as a string and empty
# cells stay '' (line breaks inside a description are kept).
gptv_output = pd.read_csv('object_localization_image_urls1.csv', na_filter=False, dtype=str)

# remove rows that have '[' in the gpt_description, reassign index to remaining rows
# ('[' is matched literally; this avoids the invalid '\[' escape sequence)
gptv_output = gptv_output[~gptv_output['gpt_description'].str.contains('[', regex=False)].reset_index(drop=True)

# one fresh, independent list per row for the Wikidata lookup results
gptv_output['wikidata_item'] = [[] for _ in range(len(gptv_output))]
gptv_output['no_item'] = [[] for _ in range(len(gptv_output))]

gptv_output['gpt_description'] = (
    gptv_output['gpt_description']
    # remove numberings like '1. ' -- regex=True is required, because
    # str.replace treats the pattern as a literal string by default in pandas 2
    .str.replace(r'\d+\. ', '', regex=True)
    # make all descriptions lowercase
    .str.lower()
    # replace new line characters with a comma and a space
    .str.replace('\n', ', ', regex=False)
    # split each description into a list of guesses
    .str.split(', ')
    # clean each guess individually (Series.replace does not look inside
    # lists) and drop guesses that end up empty
    .map(lambda items: [guess for guess in map(_strip_leading_symbols, items) if guess])
)

gptv_output.head()


Checking GPT-V Output in Wikidata Labels

In [None]:
def _gptv_label_in_wikidata(label):
    """Return True when the Wikidata count query for ``label`` is non-zero."""
    results = get_results(endpoint_url, label)
    return results["results"]["bindings"][0]["count"]["value"] != '0'


# Look up every guess in Wikidata. When the full guess has no match, its
# second word is tried as a fallback.
for index, row in gptv_output.iterrows():
    print('querying item', index+1, 'of', len(gptv_output))
    # loop through each label in the list
    for label in row['gpt_description']:
        if _gptv_label_in_wikidata(label):
            # add label to wikidata_item list
            gptv_output.at[index, 'wikidata_item'].append(label)
            continue
        # Split on whitespace. A single-word label has no second word, so it
        # goes straight to no_item instead of raising IndexError.
        words = label.split()
        if len(words) > 1 and _gptv_label_in_wikidata(words[1]):
            # add the second word to wikidata_item list
            gptv_output.at[index, 'wikidata_item'].append(words[1])
        else:
            # add label to no_item list
            gptv_output.at[index, 'no_item'].append(label)
   


In [None]:
def _join_gptv_labels(labels, unique=False):
    """Turn a list of labels into a ', '-separated string.

    With ``unique=True`` duplicates are dropped, keeping the first occurrence.
    ``dict.fromkeys`` is used instead of ``set`` so the order is the same on
    every run. A value that is already a string is returned unchanged, so
    re-running this cell does not split it into characters.
    """
    if isinstance(labels, str):
        return labels
    if unique:
        labels = dict.fromkeys(labels)
    return ', '.join(labels)


# keep only unique items in wikidata_item, then convert both list columns to strings
gptv_output['wikidata_item'] = gptv_output['wikidata_item'].map(
    lambda labels: _join_gptv_labels(labels, unique=True)
)
gptv_output['no_item'] = gptv_output['no_item'].map(_join_gptv_labels)

gptv_output.to_csv('object_localization_image_wikilabels.csv', index=False)
gptv_output.head()