# Infocard Prototype

This prototype sketches possible interfaces that pull data from name authority databases to supplement the Primary Source Coop.

Cheverus

Links:

SNAC ID -> SNAC Bio, Resources

LCNAF ID -> Wikidata Q ID, VIAF
https://id.loc.gov/authorities/names/n80001490.html

Wikidata -> Image Field (P18), Gender (P21), Occupation (P106), Position held (P39)

Wikidata Query Service: https://query.wikidata.org/

VIAF -> Works

## Linked Open Data

* Making documents discoverable
* Amplify internal Coop links with Wikidata, etc.?
* Reverse link (wikidata, etc. id to xml-id)
* List of Coop references

Query Wikidata for congressmen active during the timeframe and find them in dJQA.

## Notes

* Messy
    * A series of requests is required to get some data: LOC -> VIAF -> VIAF.xml
    * Pro: LOC gets VIAF; Con: slows down responsiveness
    
* To Do
    * Return list of all documents that reference individual persRef

In [1]:
import requests, re, warnings
import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

import dash, dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
from jupyter_dash import JupyterDash

# Suppress DeprecationWarning messages.
warnings.simplefilter('ignore', DeprecationWarning)

# Directory that holds the input data. The path is machine-specific and is
# joined to file names by plain string concatenation, so the trailing slash
# is required.
abs_dir = '/Users/quinn.wi/Documents/Data/JQA_pre-2021-04-14/'

In [2]:
%%time

# Load the DJQA names list from the Excel workbook under abs_dir.
names_data = pd.read_excel(abs_dir + 'DJQA_Names-List_singleSheet.xlsx', index_col = None)

# Replace every whitespace character in the column names with an underscore
# so columns can be referenced as identifiers (e.g. in DataFrame.query).
# The pattern is a raw string and regex=True is explicit: newer pandas
# releases treat the pattern as a literal string by default, which would
# leave the column names unchanged.
names_data.columns = names_data.columns.str.replace(r'\s', '_', regex=True)

# Example filter for a single person:
# names_data = names_data.query('(Last_Name == "Randolph") & (First_Name == "John")')

# Keep only the rows that carry a Library of Congress name authority ID.
names_data = names_data.dropna(subset = ['LC_Name_Authority'])

CPU times: user 1min 7s, sys: 676 ms, total: 1min 8s
Wall time: 1min 9s


In [3]:
%%time

# Display the first row of names_data.
names_data.head(1)

CPU times: user 331 µs, sys: 17 µs, total: 348 µs
Wall time: 345 µs


Unnamed: 0,Last_Name,First_Name,Middle_Name,Maiden_Name,Variant_form_of_name,Title,Suffix,Short-hand_option_for_name,Hyogebated-unique-string-of-characters,Birth_Date,...,notes_for_editorial_team,Notes,Source,URL,LC_Name_Authority,SNAC,Identifier's_initials_and_date,project,Date_First_Mentioned,Second_URL
1419,Adams,Louisa,Catherine,Johnson,,,,,adams-louisa-catherine,1775,...,,wife of JQA,Adams Biographical Sketches at MHS website,http://www.masshist.org/2012/adams/biographies...,n 86022545,,KNB 9/8/2018,,,


## Functions

In [4]:
%%time

def parseLOC(identifier):
    """Fetch a Library of Congress name authority record and extract basic fields.

    Downloads the MADS/XML serialization of the record from id.loc.gov and
    reads the authorized name, the date name-part, and the gender term.

    Args:
        identifier: LC name authority ID, e.g. "n80001490". Whitespace inside
            the ID (the names spreadsheet stores values such as "n 86022545")
            is removed before the URL is built.

    Returns:
        dict with keys 'id' (the identifier as passed in), 'name',
        'birthDeath' and 'genderedTerm'. A field is None when the record
        has no matching element.

    Raises:
        requests.HTTPError: if id.loc.gov answers with an error status.
        ValueError: if the document root carries no XML namespace.
    """
    # Lookup URI.
    lccn = "".join(str(identifier).split())
    url = f"https://id.loc.gov/authorities/names/{lccn}.madsxml.xml"
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse the raw bytes so the XML parser honours the encoding declared in
    # the document instead of an encoding guessed from the HTTP headers.
    root = ET.fromstring(response.content)
    namespace = re.match(r"{(.*)}", str(root.tag))
    if namespace is None:
        raise ValueError(f"No XML namespace found in LOC record for {identifier!r}")
    ns = {"ns": namespace.group(1)}

    def _text(path):
        # Text of the first element matching *path*, or None if absent.
        node = root.find(path, ns)
        return node.text if node is not None else None

    # Gather information.
    return {
        'id': identifier,
        "name": _text('.//ns:name[@authority="naf"]/ns:namePart'),
        "birthDeath": _text('.//ns:name[@authority="naf"]/ns:namePart[@type="date"]'),
        "genderedTerm": _text('.//ns:genderTerm'),
    }

# Read LOC html to get Wikidata & VIAF.
def locSoup(identifier):
    """Scrape the id.loc.gov HTML page of a name authority for external links.

    Args:
        identifier: LC name authority ID, e.g. "n80001490". Whitespace inside
            the ID is removed before the URL is built.

    Returns:
        dict of the form
        {'wiki': {'id': <Wikidata Q-id>, 'url': <entity URL>},
         'viaf': {'url': <VIAF URL>}}.

    Raises:
        requests.HTTPError: if id.loc.gov answers with an error status.
        LookupError: if the page has no Wikidata or VIAF link.
    """
    lccn = "".join(str(identifier).split())
    url = f"https://id.loc.gov/authorities/names/{lccn}.html"
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    page = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): the Wikidata link is looked up on a <span> and the VIAF
    # link on an <a>, as in the original code; confirm against current
    # id.loc.gov markup.
    wiki_node = page.find('span', {'href': re.compile(r'http://www\.wikidata\.org/entity/.*')})
    if wiki_node is None:
        raise LookupError(f"No Wikidata link found on LOC page for {identifier!r}")
    wiki = wiki_node['href']

    wiki_match = re.search(r'/(Q\d+)', wiki)
    if wiki_match is None:
        raise LookupError(f"No Wikidata Q-id found in link {wiki!r}")
    wiki_id = wiki_match.group(1)

    viaf_node = page.find('a', {'href': re.compile(r'http://viaf\.org/viaf/.*')})
    if viaf_node is None:
        raise LookupError(f"No VIAF link found on LOC page for {identifier!r}")
    viaf = viaf_node['href']

    return {'wiki': {'id': wiki_id, 'url': wiki},
            'viaf': {'url': viaf}}
    

def parseSNAC(identifier):
    """Fetch a SNAC constellation and extract the preferred name and life dates.

    Args:
        identifier: SNAC constellation ID, e.g. '84910652'.

    Returns:
        dict with keys 'id', 'name' (the first name entry's 'original'
        string) and 'birthDeath' (the first 'YYYY-YYYY' span found in that
        name, or None when the name carries no such span).

    Raises:
        requests.HTTPError: if snaccooperative.org answers with an error status.
    """
    # Lookup URI & get JSON format.
    url = f"https://snaccooperative.org/download/{identifier}?type=constellation_json"
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    record = response.json()

    namePart = record['nameEntries'][0]['original']

    # Not every name entry ends with a complete birth-death range, so a
    # missing match yields None instead of an AttributeError.
    dates = re.search(r'\d{4}-\d{4}', namePart)
    birthDeathDate = dates.group(0) if dates else None

    return {'id': identifier, 'name': namePart, 'birthDeath': birthDeathDate}

# Use SPARQL to gather Wikidata.
# Run this function after locSoup.
def sparqlWiki(data):
    """Enrich data['wiki'] in place with facts from the Wikidata Query Service.

    Args:
        data: dict whose data['wiki']['id'] holds a Wikidata Q-id (as
            produced by locSoup).

    Side effects:
        Sets data['wiki']['image'], ['genderLabel'], ['birthDate'] and
        ['deathDate'] (each None when the entity lacks the statement) and
        data['wiki']['occupations'] (list of unique occupation labels,
        possibly empty).

    Returns:
        None.

    Raises:
        requests.HTTPError: if the query service answers with an error status.
    """
    wiki_key = data['wiki']['id']
    url = 'https://query.wikidata.org/sparql'

    # Every property is OPTIONAL so that an entity lacking one of them (for
    # example an image) still returns rows. The occupation is bound to
    # ?occupation so that the label service can fill ?occupationLabel with
    # the English label rather than the entity URI.
    query = f"""
    SELECT ?image ?genderLabel ?birthDate ?deathDate ?occupationLabel

    WHERE
    {{
      BIND(wd:{wiki_key} AS ?person)
      OPTIONAL {{ ?person wdt:P18 ?image . }}
      OPTIONAL {{ ?person wdt:P21 ?gender . }}
      OPTIONAL {{ ?person wdt:P569 ?birthDate . }}
      OPTIONAL {{ ?person wdt:P570 ?deathDate . }}
      OPTIONAL {{ ?person wdt:P106 ?occupation . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    """

    response = requests.get(url, params = {'format': 'json', 'query': query}, timeout=60)
    response.raise_for_status()
    rows = response.json()['results']['bindings']

    def _value(row, key):
        # SPARQL JSON omits unbound variables from a row entirely.
        return row.get(key, {}).get('value')

    first = rows[0] if rows else {}
    data['wiki']['image'] = _value(first, 'image')
    data['wiki']['genderLabel'] = _value(first, 'genderLabel')
    data['wiki']['birthDate'] = _value(first, 'birthDate')
    data['wiki']['deathDate'] = _value(first, 'deathDate')

    # Rows are a cross product of all multi-valued properties, so the same
    # occupation can repeat; dict.fromkeys de-duplicates and keeps order.
    occupations = (_value(row, 'occupationLabel') for row in rows)
    data['wiki']['occupations'] = list(dict.fromkeys(o for o in occupations if o is not None))
        
        
# Gather VIAF data.
# Run after locSoup.
def parseVIAF(data):
    """Add the list of work titles from VIAF to data['viaf'] in place.

    Two requests are made: the URL in data['viaf']['url'] is fetched and its
    HTML <title> is taken as the VIAF key; the key is then used to download
    the entity's viaf.xml, from which work titles are read.

    Args:
        data: dict whose data['viaf']['url'] holds the VIAF link (as
            produced by locSoup).

    Side effects:
        Sets data['viaf']['works'] to a list of title strings (possibly empty).

    Returns:
        None.

    Raises:
        requests.HTTPError: if viaf.org answers with an error status.
        LookupError: if the first page has no <title> element.
        ValueError: if the XML root carries no namespace.
    """
    url = data['viaf']['url']
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.text, 'html.parser')

    title_tag = soup.find('title')
    if title_tag is None:
        raise LookupError(f"No <title> element found at {url!r}")
    viaf_key = title_tag.text.strip()

    # Use VIAF Key to parse VIAF entity.
    # Redirects.
    url = f"https://viaf.org/viaf/{viaf_key}/viaf.xml"
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Parse XML with namespace; bytes let the parser honour the declared encoding.
    root = ET.fromstring(response.content)
    namespace = re.match(r"{(.*)}", str(root.tag))
    if namespace is None:
        raise ValueError(f"No XML namespace found in VIAF record {viaf_key!r}")
    ns = {"ns": namespace.group(1)}

    # Gather information: the first <title> of every <work> under <titles>.
    works = []
    for work in root.findall('.//ns:titles/ns:work', ns):
        title = work.find('./ns:title', ns)
        if title is not None:
            works.append(title.text)

    data['viaf']['works'] = works

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 6.91 µs


## Test

In [5]:
%%time

# LC name authority ID for Lydia Maria Child.
locID = "n80001490"

# SNAC constellation ID.
snacID = '84910652'

data = {}

# Get LOC & SNAC data.
data['loc'] = parseLOC(locID)
data['snac'] = parseSNAC(snacID)

# Scrape the LOC page once; it yields both the Wikidata and the VIAF link,
# so a second request for the same page is unnecessary.
_loc_links = locSoup(locID)

# Get Wiki data.
data['wiki'] = _loc_links['wiki']
sparqlWiki(data)

# Get VIAF data.
data['viaf'] = _loc_links['viaf']
parseVIAF(data)

CPU times: user 294 ms, sys: 30.5 ms, total: 324 ms
Wall time: 6.29 s


## App

In [36]:
%%time

# Build the Dash application: SLATE bootstrap theme plus a responsive viewport tag.
app = JupyterDash(
    __name__,
    external_stylesheets=[dbc.themes.SLATE],
    meta_tags=[dict(name="viewport", content="width=device-width, initial-scale=1")],
)

# Allow callbacks to be registered without validating their component ids
# against the layout.
app.config.suppress_callback_exceptions = True

# Page layout: heading, entity picker, source checklist, submit button and
# the container that the callback fills with results.
app.layout = html.Div(
    [
        html.H1(children='Info Card'),
        html.P(children='Description'),

        # Entity to look up; each value is an LC name authority ID.
        dcc.Dropdown(
            id='entityID',
            value='n92060378',
            options=[
                dict(label='Cheverus, Jean-Louis-Aimé- Madeleine Lefebvre de', value='n92060378'),
                dict(label='Child, Lydia Maria', value='n80001490'),
            ],
        ),

        # Data sources to query for the selected entity.
        dcc.Checklist(
            id='selector',
            value=['LOC', 'WIKI'],
            options=[
                dict(label='Library of Congress', value='LOC'),
                dict(label='SNAC', value='SNAC'),
                dict(label='WikiData', value='WIKI'),
                dict(label='VIAF', value='VIAF'),
            ],
        ),

        html.Button(children='Submit', id='submit-button', n_clicks=0),

        html.Div(id='data-information', className='data-information'),
    ],
    className='app-body',
)

###########################
######### Callbacks #######
###########################

def _info_section(title, record):
    """Render one source's record as a heading followed by a 'key: value' list."""
    items = []
    for key, value in record.items():
        if isinstance(value, (list, tuple)):
            value = ', '.join(str(v) for v in value)
        items.append(html.Li(f"{key}: {value}"))
    return html.Div([html.H4(title), html.Ul(items)])


# The dropdown and checklist are State, not Input: the lookup only runs when
# the submit button is clicked, never when a selection merely changes.
@app.callback(Output('data-information', 'children'),
              [Input('submit-button', 'n_clicks')],
              [State('entityID', 'value'), State('selector', 'value')])
def collect_loc_and_derivatives(submit, entityID, selector):
    """Look up the selected entity in every checked source and render the results.

    Args:
        submit: click count of the submit button (0 until first clicked).
        entityID: LC name authority ID chosen in the dropdown.
        selector: list of checked source codes ('LOC', 'SNAC', 'WIKI', 'VIAF').

    Returns:
        A list with one rendered section per source that was queried, in the
        order LOC, WikiData, VIAF. The list is empty when no supported source
        is checked.

    Raises:
        PreventUpdate: before the first click or when no entity is selected.
    """
    if not submit or not entityID:
        raise PreventUpdate

    selected = set(selector or [])
    sections = []

    if 'LOC' in selected:
        sections.append(_info_section('Library of Congress', parseLOC(entityID)))

    # Both the Wikidata and VIAF lookups start from links on the LOC page,
    # so scrape it once for the selected entity and share the result.
    if selected & {'WIKI', 'VIAF'}:
        links = locSoup(entityID)

        if 'WIKI' in selected:
            # A per-request dict keeps results out of module-level state.
            record = {'wiki': links['wiki']}
            sparqlWiki(record)
            sections.append(_info_section('WikiData', record['wiki']))

        if 'VIAF' in selected:
            record = {'viaf': links['viaf']}
            parseVIAF(record)
            sections.append(_info_section('VIAF', record['viaf']))

    # TODO: 'SNAC' is not handled yet; parseSNAC needs a SNAC ID and only
    # the LC ID is available here.

    return sections


if __name__ == "__main__":
    # Start the Dash server in debug mode.
    # Add mode='inline' to the call when running under JupyterDash.
    app.run_server(debug=True)

Dash app running on http://127.0.0.1:8050/
CPU times: user 63.3 ms, sys: 55.5 ms, total: 119 ms
Wall time: 331 ms
