# MHS Data Derivatives in Progress

In [1]:
import operator, random, requests, re, warnings, pprint, json, gensim, geopy, folium
import pandas as pd
import numpy as np
from IPython.display import HTML, display, Javascript

# Info-Card
import xml.etree.ElementTree as ET
from bs4 import BeautifulSoup

# Topic Model
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pyLDAvis, pyLDAvis.sklearn
from pyLDAvis import PreparedData
pyLDAvis.enable_notebook()


# Primary visualizations
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
import seaborn as sns
import plotly.express as px

# PCA visualization
from scipy.spatial.distance import cosine
from sklearn.metrics import pairwise
from sklearn.manifold import MDS, TSNE
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA

# Import (Jupyter) Dash -- App Functionality
import dash, dash_table
from dash.dependencies import Input, Output, State
from dash.exceptions import PreventUpdate
import dash_core_components as dcc
import dash_bootstrap_components as dbc
import dash_html_components as html
from jupyter_dash import JupyterDash

# GeoReferencing
from geopy.extra.rate_limiter import RateLimiter

# Ignore simple warnings.
warnings.simplefilter('ignore')

abs_dir = '/Users/quinn.wi/Documents/Data/'

## Functions

In [2]:
# Toggle/Hide Function
def hide_toggle(for_next=False):
    this_cell = """$('div.cell.code_cell.rendered.selected')"""
    next_cell = this_cell + '.next()'

    toggle_text = 'Toggle show/hide'  # text shown on toggle link
    target_cell = this_cell  # target cell to control with toggle
    js_hide_current = ''  # bit of JS to permanently hide code in current cell (only when toggling next cell)

    if for_next:
        target_cell = next_cell
        toggle_text += ' next cell'
        js_hide_current = this_cell + '.find("div.input").hide();'

    js_f_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    html = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """.format(
        f_name=js_f_name,
        cell_selector=target_cell,
        js_hide_current=js_hide_current, 
        toggle_text=toggle_text
    )

    return HTML(html)

# Info-Card Functions
def parseLOC(identifier):
#     Lookup URI.
    url = f"https://id.loc.gov/authorities/names/{identifier}.madsxml.xml"
    response = requests.get(url).text
    
#     Parse XML with namespace from string.
    root = ET.fromstring(response)
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}
    
#     Gather information.
    loc_xpath = {'Name': './/ns:name[@authority="naf"]/ns:namePart', 
                 'Birth & Death Date': './/ns:name[@authority="naf"]/ns:namePart[@type="date"]', 
                 'Gendered Term': './/ns:genderTerm'}

    loc_data = {}
    loc_data['id'] = identifier
    
    for k, v in loc_xpath.items():
        try:
            loc_data[k] = root.find(v, ns).text
        except:
            loc_data[k] = 'NaN'
    
    return loc_data
    

# Read LOC html to get Wikidata & VIAF.
def locSoup(identifier):
    url = f"https://id.loc.gov/authorities/names/{identifier}.html"
    response = requests.get(url).text
    locSoup = BeautifulSoup(response)

    wiki = locSoup.find('span', {'href': re.compile(r'http://www.wikidata.org/entity/.*')})['href']
    wiki_id = re.search(r'.*/(Q.*)', wiki).group(1)
    
    viaf = locSoup.find('a', {'href': re.compile(r'http://viaf.org/viaf/.*')})['href']
    
    return {'wiki': {'id': wiki_id, 'url': wiki}, 
            'viaf': {'url': viaf}}
    

def parseSNAC(identifier):
#     Lookup URI & get JSON format.
    url = f"https://snaccooperative.org/download/{identifier}?type=constellation_json"
    response = requests.get(url).json()

    namePart = response['nameEntries'][0]['original']
    birthDeathDate = re.search('\d{4}-\d{4}', namePart).group(0)
    
    return {'SNAC ID': identifier, 'Name': namePart, 'Birth & Death Date': birthDeathDate}

# Use SPARQL to gather Wikidata.
# Run this function after locSoup.
def sparqlWiki(data):
    wiki_key = data['wiki']['id']
    url = 'https://query.wikidata.org/sparql'
    
    query = f"""

    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    SELECT ?personLabel ?genderLabel ?birthDate ?deathDate ?occupationLabel

    WHERE 
    {{
      wd:{wiki_key} rdfs:label ?person ;
                wdt:P18 ?image ;
                wdt:P21 ?gender ;
                wdt:P569 ?birthDate ;
                wdt:P570 ?deathDate ;
                wdt:P106 ?occupationLabel .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
      FILTER ( LANG ( ?person ) = 'en' )
    }}
    """
    
    resp = requests.get(url, params = {'format': 'json', 'query': query}).json()
    
#     data['wiki']['image'] = resp['results']['bindings'][0]['image']['value']
    data['wiki']['genderLabel'] = resp['results']['bindings'][0]['genderLabel']['value']
    data['wiki']['birthDate'] = resp['results']['bindings'][0]['birthDate']['value']
    data['wiki']['deathDate'] = resp['results']['bindings'][0]['deathDate']['value']
    
    data['wiki']['occupations'] = []
    for o in resp['results']['bindings']:
        data['wiki']['occupations'].append(o['occupationLabel']['value'])
        
        
# Gather VIAF data.
# Run after locSoup.
def parseVIAF(data):
    url = data['viaf']['url']
    response = requests.get(url).text
    soup = BeautifulSoup(response)

    viaf_key = soup.find('title').text

#     Use VIAF Key to parse VIAF entity.
#     Redirects.
    url = f"https://viaf.org/viaf/{viaf_key}/viaf.xml"
    response = requests.get(url).text

#     Parse XML with namespace from string.
    root = ET.fromstring(response)
    namespace = re.match(r"{(.*)}", str(root.tag))
    ns = {"ns":namespace.group(1)}

#     Gather information.
    titles = root.findall('.//ns:titles', ns)
    
    data['viaf']['works'] = []
    
    for w in titles:
        work = w.findall('./ns:work', ns)
        for t in work:
            title = t.find('./ns:title', ns)
            
            data['viaf']['works'].append(title.text)

hide_toggle()

## Info Card

In [3]:
%%time

# locID for Lydia Child
locID = "n80001490"

# snacID
snacID = '84910652'

data = {}

# Get LOC & SNAC data.
data['loc'] = parseLOC(locID)
print ('Library of Congress')
pprint.pprint(data['loc'])

data['snac'] = parseSNAC(snacID)
print ('\nSNAC')
pprint.pprint(data['snac'])

# Get Wiki data.
data['wiki'] = locSoup(locID)['wiki']
sparqlWiki(data)
print ('\nWiki')
pprint.pprint(data['wiki'])

# Get VIAF data.
data['viaf'] = locSoup(locID)['viaf']
parseVIAF(data)
print ('\nVIAF')
pprint.pprint(data['viaf'])

Library of Congress
{'Birth & Death Date': '1802-1880',
 'Gendered Term': 'female',
 'Name': 'Child, Lydia Maria',
 'id': 'n80001490'}

SNAC
{'Birth & Death Date': '1802-1880',
 'Name': 'Child, Lydia Maria, 1802-1880',
 'SNAC ID': '84910652'}

Wiki
{'birthDate': '1802-02-11T00:00:00Z',
 'deathDate': '1880-10-20T00:00:00Z',
 'genderLabel': 'female',
 'id': 'Q443132',
 'occupations': ['http://www.wikidata.org/entity/Q36180',
                 'http://www.wikidata.org/entity/Q49757',
                 'http://www.wikidata.org/entity/Q520549',
                 'http://www.wikidata.org/entity/Q1930187',
                 'http://www.wikidata.org/entity/Q6625963'],
 'url': 'http://www.wikidata.org/entity/Q443132'}

VIAF
{'url': 'http://viaf.org/viaf/sourceID/LC%7Cn++80001490#skos:Concept',
 'works': ['The American anti-slavery almanac, for ... : calculated for '
           'Boston, New York, and Pittsburgh ...',
           'Anti-slavery catechism',
           'appeal in favor of that class of A

## Topic Model

In [4]:
%%time

# Topic 13: church sermons?
# Topic 19: steam power?
HTML(abs_dir + 'Output/TopicModels/jqa_topics-40_pyLDAvis.html')

CPU times: user 674 µs, sys: 835 µs, total: 1.51 ms
Wall time: 962 µs


### Topic Model Details



## Word Vectors

In [7]:
%%time

# Load model.
model = gensim.models.KeyedVectors.load_word2vec_format(abs_dir + '/Output/WordVectors/jqa_w2v.txt')

print ('Words most similar to "work":\n', [word for word, score in model.most_similar(['work'])], '\n')

print('Similarity of "tariff" to "economy": ', model.similarity('tariff', 'economy'), '\n')

print ('Word most similar to "foreign + policy + congress":\n', 
       model.most_similar(positive = ['foreign', 'policy'], negative = ['congress'], topn = 4), '\n')

Words most similar to "work":
 ['tomb', 'researches', 'incomplete', 'inscription', 'agricola', 'lectures', 'pictures', 'metamorphoses', 'larger', 'paradise'] 

Similarity of "tariff" to "economy":  0.5708187 

Word most similar to "foreign + policy + congress":
 [('european', 0.6353726387023926), ('turkey', 0.6332741379737854), ('southamerican', 0.6127479076385498), ('treatment', 0.6089410185813904)] 

CPU times: user 887 ms, sys: 18.3 ms, total: 906 ms
Wall time: 900 ms


## XQuery

## Networks

http://localhost:8080/index.html

## GeoReferencing

In [5]:
%%time

geo = pd.read_csv(abs_dir + 'Output/Geo/jqa-geoReference.csv', sep = ',')

print (f'Number of places (not unique) mentioned {len(geo.index)}.\n')

geo = geo.groupby(['lon', 'lat'], as_index = False).size().reset_index()

geo.columns = ['lon', 'lat', 'count']

geo = geo.query('count > 20')

m = folium.Map(location=[42.361145, -71.057083])

geo.apply(lambda row: folium.CircleMarker(location = [row["lat"], row['lon']]).add_to(m), axis = 1)

m

Number of places (not unique) mentioned 12353.

CPU times: user 40.5 ms, sys: 8.87 ms, total: 49.4 ms
Wall time: 58.6 ms
