# Configuration and functions

Run this cell first: every other cell depends on the imports and functions it defines.

In [None]:
import json
import csv

# Pandas for data frame management
import pandas as pd

# Natural Language Tool Kit
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import RegexpParser

# Language detection
from langdetect import detect
from langdetect import detect_langs

# ---------------
# Utility functions
# ---------------

# read from a CSV file into a list of dictionaries
def read_dict(filename):
    """Load a UTF-8 CSV file and return its rows as a list of dictionaries.

    The first line of the file supplies the dictionary keys
    (``csv.DictReader`` default behavior).
    """
    with open(filename, 'r', newline='', encoding='utf-8') as csv_file:
        # Materialize the rows before the file is closed
        return list(csv.DictReader(csv_file))

# write a list of dictionaries to a CSV file
def write_dicts_to_csv(table, filename, fieldnames):
    """Write ``table`` (an iterable of dictionaries) to a UTF-8 CSV file.

    ``fieldnames`` gives the column names and their order; a header row is
    written first, followed by one line per dictionary.
    """
    with open(filename, 'w', newline='', encoding='utf-8') as out_file:
        dict_writer = csv.DictWriter(out_file, fieldnames=fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(table)
            
# ---------------
# Processing functions
# ---------------

# This function splits a string based on finding the test_string in it and returns info on the pieces
def test_split(target_string, test_string):
    split = False
    pieces = target_string.split(test_string)
    # remove leading and trailing spaces
    for piece_number in range(len(pieces)):
        pieces[piece_number] = pieces[piece_number].strip()
    if len(pieces) != 1:
        split = True
    return split, len(pieces), pieces

# Test all possible splitting strings and return information about whether the target_string was split
def split_string(target_string, test_string_list):
    """Split ``target_string`` on the first entry of ``test_string_list`` found in it.

    The candidates are tried in list order. On success the result of
    ``test_split`` for that candidate is returned: ``(True, n_pieces, pieces)``.
    If no candidate occurs, ``(False, 1, target_string)`` is returned; note
    that the third element is then the original, unstripped string rather
    than a list.
    """
    attempts = (test_split(target_string, candidate) for candidate in test_string_list)
    # The generator is lazy, so candidates after the first hit are never tried
    first_hit = next((attempt for attempt in attempts if attempt[0]), None)
    if first_hit is not None:
        return first_hit
    # None of the test strings worked
    return False, 1, target_string

def detect_language(string):
    """Guess the language of ``string`` using langdetect.

    Returns a tuple ``(lang, confidence)`` where ``lang`` is the language
    code of the first (most probable) guess returned by ``detect_langs`` and
    ``confidence`` is its probability as a float.

    If detection fails (e.g. the text has no usable features, such as a
    string of digits), ``('zxx', 0.0)`` is returned instead of raising.
    """
    try:
        best_guess = detect_langs(string)[0]
        # Read the fields directly instead of slicing "lang:prob" text at
        # fixed offsets: codes such as "zh-cn" are longer than two characters,
        # which made the old float() parse fail and report the text as 'zxx'.
        lang = best_guess.lang
        confidence = float(best_guess.prob)
    except Exception:  # detection is best-effort; do not trap KeyboardInterrupt/SystemExit
        lang = 'zxx'
        confidence = float(0)
    return lang, confidence

# Screening thresholds read by treat_as_single_string():
# a piece whose language-detection confidence is below this value fails the screen
precision_cutoff = 0.5
# a piece with this many words or fewer fails the screen
phrase_length_cutoff = 2
def words_in_phrase(piece):
    """Count the tokens NLTK finds in ``piece``, discounting a few non-words.

    At most one occurrence each of ``'.'``, ``'s'`` and ``'’'`` is left out
    of the count (``list.remove`` drops only the first match).
    """
    tokens = nltk.word_tokenize(piece)
    for ignored in ('.', 's', '’'):
        if ignored in tokens:
            tokens.remove(ignored)
    return len(tokens)

def treat_as_single_string(pieces):
    """Decide whether split label pieces should be kept as one string.

    ``pieces`` is a list of strings, normally the text before a parenthesis
    and the text inside it. Returns True (do not treat the pieces as
    translations of each other) when any of the following holds:

    * fewer than two pieces were supplied,
    * a piece's language-detection confidence is below ``precision_cutoff``,
    * a piece has ``phrase_length_cutoff`` words or fewer,
    * the first two pieces were detected as the same language.

    Otherwise returns False.
    """
    # Without two pieces there is no pair of languages to compare
    if len(pieces) < 2:
        return True

    lang_list = []
    for piece in pieces:
        lang, prec = detect_language(piece.strip())
        # Stop at the first failed screen; later pieces cannot change the answer
        if prec < precision_cutoff:
            return True
        if words_in_phrase(piece) <= phrase_length_cutoff:
            return True
        lang_list.append(lang)

    # Two pieces in the same language are not a translation pair
    return lang_list[0] == lang_list[1]

def generate_translations(label):
    """Break a label into candidate translations and describe each one.

    The label is divided into the text before the first "(" and the text
    inside that first set of parentheses; anything from a second "(" onward
    is ignored. If ``treat_as_single_string`` rejects the pair, the whole
    label is used as a single piece instead.

    Returns a list of dictionaries, one per piece, with the keys ``text``
    (stripped piece), ``n_words``, ``language`` and ``probability``.
    """
    # Keep at most the text before and after the first left parenthesis
    pieces = label.split('(')[:2]

    if len(pieces) == 2:  # the label contains a parenthesis
        # Drop the right parenthesis and anything following it
        pieces[1] = pieces[1].partition(')')[0]
        if treat_as_single_string(pieces):
            # Failed the translation screening test: fall back to the full label
            pieces = [label]

    translations_list = []
    for piece in pieces:
        word_count = words_in_phrase(piece)
        lang, prec = detect_language(piece)
        translations_list.append({
            'text': piece.strip(),
            'n_words': word_count,
            'language': lang,
            'probability': prec,
        })
    return translations_list


## Process 3D works to determine type

In [None]:
classifications = pd.read_csv('works_classification.csv', na_filter=False, dtype=str)
works = pd.read_csv('works_multiprop.csv', na_filter=False, dtype=str)
#test = works.head(100)

# Map each qid to its (dimension, type) once instead of scanning the
# classifications DataFrame twice for every work row. setdefault keeps the
# first row for a repeated qid, which is the row `.values[0]` used to pick.
classification_by_qid = {}
for class_qid, class_dimension, class_type in zip(
        classifications['qid'], classifications['dimension'], classifications['type']):
    classification_by_qid.setdefault(class_qid, (class_dimension, class_type))

# Explicit output columns, so the CSV layout does not depend on which keys the
# first output row happens to have and an empty result still gets a header.
fieldnames = ['qid', 'type', 'label', 'object_description', 'form_description',
              'design_description', 'includes']

# Two-part phrase tests, tried in this order. Each entry is:
# (name used in the error marker, phrases to try, field receiving the text after the phrase)
two_part_tests = [
    ('with a Design',
     ['with a Design', 'with a design', 'with designs', 'with design', 'With a Design', 'With Design', 'depicting'],
     'design_description'),
    ('in the form of',
     ['in the Form of', 'in the form of', 'in the shape of', 'in the design of', 'in the Shape of', 'Representing'],
     'form_description'),
]

output_list_3d = []

# Iterate through the work rows to analyze label text
for index, work in works.iterrows():
    # Look up the dimension and type info for the work.
    # A qid that is absent from works_classification.csv raises KeyError here.
    dimension, work_type = classification_by_qid[work['qid']]

    #print(work['label_en'], dimension, work_type)
    if dimension != '3D':
        continue

    label = work['label_en']
    print(label)
    # Start from the "no split worked" result: the whole label is the object
    # description. Every row therefore carries all of the output keys.
    work_dict = {
        'qid': work['qid'],
        'type': work_type,
        'label': label,
        'object_description': label,
        'form_description': '',
        'design_description': '',
        'includes': '',
    }
    processed = False

    # Test "with a Design/design", then "in the form of/Form of"
    for test_name, test_string_list, target_field in two_part_tests:
        split, n_pieces, pieces = split_string(label, test_string_list)
        if not split:
            continue
        processed = True  # do not perform later tests once a split worked
        if n_pieces > 2:
            # The phrase occurs more than once, which is not expected: flag the
            # row rather than guess. This now applies to both tests; the
            # "in the form of" marker used to be overwritten right after being set.
            work_dict['object_description'] = 'Error on "' + test_name + '"'
        else:
            work_dict['object_description'] = pieces[0]
            work_dict[target_field] = pieces[1]
        break

    # Test "with"
    if not processed:
        split, n_pieces, pieces = split_string(label, ['with', 'With'])
        if split:
            work_dict['object_description'] = pieces[0]
            # Only split at the first "with": re-assemble everything after it
            work_dict['includes'] = ' with '.join(pieces[1:])

    #print(json.dumps(work_dict, indent = 2))
    #print()

    output_list_3d.append(work_dict)

with open('3d_parts.json', 'wt', encoding='utf-8') as fileObject:
    fileObject.write(json.dumps(output_list_3d, indent = 2))

write_dicts_to_csv(output_list_3d, '3d_parts.csv', fieldnames)

print('done')

## Process titles for language

In [None]:
works = pd.read_csv('works_multiprop.csv', na_filter=False, dtype=str)
# Keep only the rows whose title is empty. The label-based column slice keeps
# every column from qid through label_en, in the order they appear in the file.
untitled_rows = works.title == ''
no_titles = works.loc[untitled_rows, 'qid':'label_en'].copy()

output_list = []
for index, work in no_titles.iterrows():
    label = work['label_en']
    print(label)
    translations = generate_translations(label)
    line_dict = {'qid': work['qid'], 'label': label}
    # Number the columns title1/language1/probability1, title2/..., in list order
    for position, translation in enumerate(translations, start=1):
        suffix = str(position)
        line_dict['title' + suffix] = translation['text']
        line_dict['language' + suffix] = translation['language']
        line_dict['probability' + suffix] = translation['probability']
    if len(translations) == 1:
        # Pad single-language labels so every row has the same columns
        line_dict.update({'title2': '', 'language2': '', 'probability2': 0})
    output_list.append(line_dict)
print('done')

In [None]:
# Column names and order are taken from the keys of the first output row
fieldnames = [*output_list[0]]
write_dicts_to_csv(output_list, 'titles_lang.csv', fieldnames)

print('done')

## old code

In [None]:
# Old code: load both CSV files as lists of dictionaries via read_dict().
# Running this cell rebinds `classifications` and `works`, replacing any
# DataFrames that the pandas-based cells stored under the same names.
classifications = read_dict('works_classification.csv')
works = read_dict('works_multiprop.csv')
#print(classifications[0:2])
#print()
#print(works[0:1])

In [None]:
# Old code: earlier version of the 3D label processing. It works on the lists
# of dictionaries loaded by read_dict() and stores the output of
# generate_translations() (a list of dicts) in each description field instead
# of a plain string. Results are collected in `output_list`.
output_list = []
for work in works:
    # Linear search for the classification row with the same qid.
    # NOTE(review): `found` is only assigned inside this inner loop, so with an
    # empty `classifications` list it is never set by this cell.
    for classification in classifications:
        found = False
        if work['qid'] == classification['qid']:
            found = True
            break
    # After a `break`, `classification` still refers to the matching row
    if found:
        #print(work['label_en'], classification['dimension'], classification['type'])
        if classification['dimension'] == '3D':
            label = work['label_en']
            print(label)
            work_dict = {'qid': work['qid'], 'type': classification['type'], 'label': label}

            processed = False

            # -------------
            # with a Design/design
            # -------------

            # Split the phrase by object description and design on object.
            # The phrase variants are tried one after another; the first that occurs wins.
            split = False
            pieces = label.split('with a Design')
            if len(pieces) != 1:
                split = True
            else:
                pieces = label.split('with a design')
                if len(pieces) != 1:
                    split = True
                else:
                    pieces = label.split('with designs')
                    if len(pieces) != 1:
                        split = True
                    else:
                        pieces = label.split('with design')
                        if len(pieces) != 1:
                            split = True

            if split:
                processed = True
                if len(pieces) > 2:
                    pieces = list(pieces[:2]) # limit to two parts, not likely to ever happen with this one
                    print('Error on "with a Design"')
                # After the truncation above this is also true for labels with more than two parts
                if len(pieces) == 2:
                    work_dict['object_description'] = generate_translations(pieces[0].strip())
                    work_dict['form_description'] = ''
                    work_dict['design_description'] = generate_translations(pieces[1].strip())
                    work_dict['includes'] = ''

            # -------------
            # in the form of/Form of
            # -------------

            # Split the phrase by object description and form of object.
            # NOTE(review): this test is not guarded by `if not processed`, so it
            # also runs after a successful "with a Design" split and, if it
            # matches, overwrites the fields set above.
            split = False
            pieces = label.split('in the Form of')
            if len(pieces) != 1:
                split = True
            else:
                pieces = label.split('in the form of')
                if len(pieces) != 1:
                    split = True

            if split:
                processed = True
                if len(pieces) > 2:
                    pieces = list(pieces[:2]) # limit to two parts, not likely to ever happen with this one
                    print('Error on "in the form of"')
                #print(pieces)
                if len(pieces) == 2:
                    work_dict['object_description'] = generate_translations(pieces[0].strip())
                    work_dict['form_description'] = generate_translations(pieces[1].strip())
                    work_dict['design_description'] = ''
                    work_dict['includes'] = ''

            # Plain "with"/"With" test, only when neither test above matched.
            # `split` is still False at this point because the preceding test did not match.
            if not processed:
                pieces = label.split('with')
                if len(pieces) != 1:
                    split = True
                else:
                    pieces = label.split('With')
                    if len(pieces) != 1:
                        split = True

                if split:
                    processed = True
                    if len(pieces) > 2:
                        # re-assemble phrase after first "with"
                        # (the pieces are not stripped yet, so they still carry the
                        # spaces that surrounded each "with")
                        built = pieces[1]
                        for piece in pieces[2:]:
                            built += 'with' + piece
                        pieces = [pieces[0], built]

                    work_dict['object_description'] = generate_translations(pieces[0].strip())
                    work_dict['form_description'] = ''
                    work_dict['design_description'] = ''
                    work_dict['includes'] = generate_translations(pieces[1].strip())

            # If none of the splits worked, the whole label is the object description
            if not processed:
                work_dict['object_description'] = generate_translations(label)
                work_dict['form_description'] = ''
                work_dict['design_description'] = ''
                work_dict['includes'] = ''

            output_list.append(work_dict)

# The JSON dump below is disabled: it is wrapped in a string literal, so nothing is written.
'''
with open('2d_parts.json', 'wt', encoding='utf-8') as fileObject:
    # If analysis occurrred successfully, update the last_run date in the file with today's date
    fileObject.write(json.dumps(output_list, indent = 2))
''' 
print('done')