# Art historians' network relations

## Imports

In [None]:
# rdflib
import rdflib
from rdflib import Namespace , Literal , URIRef
from rdflib.namespace import RDF , RDFS
from SPARQLWrapper import SPARQLWrapper, JSON

# utils
import ssl, os.path, json, requests , ast
from collections import defaultdict
import itertools

# spacy
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# data proc 
import pandas as pd
import numpy as np
from apyori import apriori

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules

# graph data
import networkx as nx
from networkx.algorithms import bipartite

# data viz
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# maps
from ipywidgets import HTML
from ipyleaflet import Map, Marker, Popup, LayersControl, AwesomeIcon

## Utils

In [None]:
def wikidata_reconciliation(query, q_class=None):
    """Find the Wikidata QID that best matches a query string.

    The string is sent to the Wikidata ``wbsearchentities`` API (English
    search) and only the first hit is considered.

    Args:
        query: free-text name of the entity to look up.
        q_class: optional QID (e.g. ``"Q5"``) of a class the hit is expected
            to be an instance of. When given, an ASK query checks it.

    Returns:
        ``[qid, status]`` where ``status`` is ``'class_match'``,
        ``'no_class_match'`` or ``'no_class_given'``; or the plain string
        ``'not matched'`` when the search returned no hit.
    """
    params = {
        'action': 'wbsearchentities',
        'format': 'json',
        'language': 'en',
        'search': query
    }
    # query wd API
    API_WD = "https://www.wikidata.org/w/api.php"
    r = requests.get(API_WD, params=params).json()

    if 'search' not in r or len(r['search']) < 1:
        return 'not matched'

    qid = r['search'][0]['title']
    if not q_class:
        return [qid, 'no_class_given']

    # double check if the entity belongs to the right class.
    # Class membership in Wikidata is stated with wdt:P31, and entity IRIs
    # live under the wd: prefix (http://www.wikidata.org/entity/).
    query_string = "ASK { wd:" + qid + " wdt:P31 wd:" + q_class + " . }"
    # NOTE(review): return_sparql_query_results is neither defined nor imported
    # in the visible part of this file - confirm where it comes from.
    res = return_sparql_query_results(query_string)
    print("\nRES", query, query_string, res)

    # The SPARQL JSON results format carries the ASK answer as a JSON boolean;
    # the string form is still accepted in case the helper returns raw text.
    if res["boolean"] in (True, 'true'):
        return [qid, 'class_match']
    return [qid, 'no_class_match']
    
def chunks(lst, n):
    """Lazily split ``lst`` into consecutive slices of at most ``n`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

## Prepare data

### Get the list of historians, names, and biographies from ARTchives.

In [None]:
# SPARQL query sent to the ARTchives endpoint: one row per historian with a
# sampled label and, when present, a description used later as biography.
artists_query = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX wdt: <http://www.wikidata.org/wiki/Property:>
    SELECT ?historian (sample(?names) as ?name) ?bio
    WHERE { 
        ?collection wdt:P170 ?historian . 
        ?historian rdfs:label ?names. 
        OPTIONAL{?historian <http://purl.org/dc/terms/description> ?bio}
    }
    GROUP BY ?historian ?name ?bio"""

# One dict per historian: "uri", "name" and (optionally) "bio".
# Stays empty, or partially filled, if the request or the parsing fails.
art_historians = []

try:
    art_sparql = SPARQLWrapper("http://artchives.fondazionezeri.unibo.it/sparql")
    art_sparql.setReturnFormat(JSON)
    art_sparql.setQuery(artists_query)
    results = art_sparql.query().convert()
    for result in results["results"]["bindings"]:
        historian = {
            "uri": result["historian"]["value"],
            "name": result["name"]["value"],
        }
        # ?bio comes from an OPTIONAL pattern, so it may be unbound
        if "bio" in result:
            historian["bio"] = result["bio"]["value"]
        art_historians.append(historian)
except Exception as e:
    # any failure is only reported; the notebook carries on with what was collected
    print(e)

### Get historians' places of education and activity from Wikidata

In [None]:
# SECURITY NOTE(review): this replaces the default HTTPS context factory with
# the unverified one, i.e. certificate verification is switched off for code
# relying on that default for the rest of the session. Confirm it is needed.
ssl._create_default_https_context = ssl._create_unverified_context
wikidata_endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

if os.path.isfile("historian_places.json"):
    # Reuse a previously saved response; the context manager closes the file
    # handle, which the bare open() call used to leave open.
    with open('historian_places.json', encoding='utf-8') as f:
        results = json.load(f)
else:
    # VALUES list with every historian URI collected from ARTchives
    historians_list = ' '.join(['<'+art_dict["uri"]+'>' for art_dict in art_historians])
    # NOTE(review): both OPTIONAL groups bind the same ?type / ?type_label
    # variables, which couples the two groups - confirm this is intended.
    eduplace_query = """ 
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT ?historian ?workplace ?workplace_label ?coordinates1 ?eduplace ?eduplace_label ?coordinates2
    WHERE {
            VALUES ?historian {"""+historians_list+"""} . 
            
            optional {
                ?historian wdt:P108 ?workplace . 
                ?workplace rdfs:label ?workplace_label ; 
                            wdt:P625 ?coordinates1; 
                            wdt:P31 ?type .
                FILTER (langMatches(lang(?workplace_label), "EN")) 
                ?type rdfs:label ?type_label . 
                FILTER (langMatches(lang(?type_label), "EN"))
                }
            optional {
                ?historian wdt:P69 ?eduplace . 
                ?eduplace rdfs:label ?eduplace_label ;
                            wdt:P625 ?coordinates2; 
                            wdt:P31 ?type . 
                FILTER (langMatches(lang(?eduplace_label), "EN")) 
                ?type rdfs:label ?type_label . 
                FILTER (langMatches(lang(?type_label), "EN")) 
                }
            } 
    GROUP BY ?historian ?workplace ?workplace_label ?coordinates1 ?eduplace ?eduplace_label ?coordinates2 
    """
    try :
        sparql_wd = SPARQLWrapper(wikidata_endpoint)
        sparql_wd.setQuery(eduplace_query)
        sparql_wd.setReturnFormat(JSON)
        results = sparql_wd.query().convert()

        # Uncomment to save the response so later runs can skip the query:
        #with open('historian_places.json', 'w') as f:
        #    json.dump(results, f, indent=4)
    except Exception as e:
        # NOTE(review): on failure `results` keeps whatever an earlier cell
        # assigned to it; the error is only printed.
        print(e)

### Include results in a graph

In [None]:
# create an empty Graph
g = rdflib.ConjunctiveGraph()

# namespaces used to build property / entity URIs
wd = Namespace("http://www.wikidata.org/entity/") # remember that a prefix matches a URI until the last slash (or hashtag #)
wdt = Namespace("http://www.wikidata.org/prop/direct/")
art = Namespace("https://w3id.org/artchives/")
rdfs = Namespace ("http://www.w3.org/2000/01/rdf-schema#")


def _add_institution(graph, historian_uri, link_property, binding, place_key, label_key, coord_key):
    """Add one historian -> institution link plus the institution's details.

    Nothing is added unless the place, its label and its coordinates are all
    bound in ``binding`` (one row of the SPARQL JSON results).
    """
    if not (place_key in binding and label_key in binding and coord_key in binding):
        return
    place = URIRef(binding[place_key]["value"])
    # Drops the first six characters and the last one, then splits on a space;
    # presumably strips the "Point(" ... ")" wrapper of a WKT literal - confirm.
    coords = binding[coord_key]["value"][6:-1].split(" ")
    graph.add(( URIRef(historian_uri) , link_property , place ))
    graph.add(( place , RDFS.label , Literal(binding[label_key]["value"]) ))
    graph.add(( place , RDFS.comment , Literal("institution") ))
    graph.add(( place , wdt.P625 , Literal(coords) ))


# Build the URI -> name lookup once instead of scanning art_historians for
# every result row; setdefault keeps the first name seen for a URI.
historian_name_by_uri = {}
for art_dict in art_historians:
    historian_name_by_uri.setdefault(art_dict["uri"], art_dict["name"])

for result in results["results"]["bindings"]:
    historian_uri = result["historian"]["value"]
    if historian_uri not in historian_name_by_uri:
        # fail with the offending URI instead of a bare "list index out of range"
        raise KeyError("no ARTchives name found for historian " + historian_uri)
    name = historian_name_by_uri[historian_uri]
    g.add(( URIRef(historian_uri) , RDFS.label , Literal(name) ))
    # workplace (wdt:P108) and place of education (wdt:P69) are handled alike
    _add_institution(g, historian_uri, wdt.P108, result, "workplace", "workplace_label", "coordinates1")
    _add_institution(g, historian_uri, wdt.P69, result, "eduplace", "eduplace_label", "coordinates2")

### Retrain spaCy to recognize specific organizations.

In [None]:
# Update the NER component of the loaded spaCy pipeline with extra examples.
# NOTE(review): nlp.update(texts, annotations, ...) looks like the spaCy v2
# training call - confirm it matches the installed spaCy version.

ner = nlp.get_pipe("ner")

# (text, annotations) pairs; characters 11-24 of the sentence are "Villa I Tatti"
TRAIN_DATA = [
    ("Working at Villa I Tatti.", {"entities": [(11, 24, "ORG")]})
]

# make sure the NER pipe knows every label used in the examples
for _, annotations in TRAIN_DATA:
    for span in annotations.get("entities"):
        ner.add_label(span[2])

# every pipe except these is switched off while training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
unaffected_pipes = [pipe_name for pipe_name in nlp.pipe_names if pipe_name not in pipe_exceptions]

# TRAINING THE MODEL
with nlp.disable_pipes(*unaffected_pipes):
    # 30 passes over the training data
    for iteration in range(30):
        # shuffle the examples before every pass
        random.shuffle(TRAIN_DATA)
        losses = {}
        # split the examples into batches with spaCy's minibatch helper
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,  # batch of texts
                annotations,  # batch of annotations
                drop=0.5,  # dropout - make it harder to memorise data
                losses=losses,
            )
            print("Losses", losses)

### Get places and organizations mentioned in the biographies of historians from ARTchives. Reconcile Named Entities with Wikidata.

In [None]:
# One dict per GPE/ORG entity found in a biography:
# {"historian": uri, "name": entity text, "type": entity label, "qid": QID or "not found"}
artchives_places = []

# Reconciliation results keyed by the searched string, so an entity that
# recurs across biographies triggers a single API request.
qid_cache = {}

for historian in art_historians:
    if "bio" not in historian:
        continue
    doc = nlp(historian["bio"])
    print('\n' , historian["name"])
    for ent in doc.ents:
        if ent.label_ not in ('GPE', 'ORG'):
            continue
        # Drop a leading article before searching. The trailing space matters:
        # without it, words that merely begin with "the" lose four characters.
        ent_text = ent.text[4:] if ent.text.startswith("the ") else ent.text
        if ent_text not in qid_cache:
            qid_cache[ent_text] = wikidata_reconciliation(ent_text)
        qid = qid_cache[ent_text]
        place = {
            "historian": historian["uri"],
            "name": ent.text,
            "type": ent.label_,
            # wikidata_reconciliation returns 'not matched' or [qid, status]
            "qid": qid[0] if qid != 'not matched' else "not found",
        }
        print(place)
        artchives_places.append(place)

Disambiguate place names that also appear in the name of the historian: such entities are skipped in the next step.

### Find coordinates in Wikidata. 

In [None]:
# URI -> name lookup built once, instead of scanning art_historians for every
# extracted place; setdefault keeps the first name seen for a URI.
historian_name_by_uri = {}
for hist in art_historians:
    historian_name_by_uri.setdefault(hist["uri"], hist["name"])

# Entity IRIs to look up: reconciled places whose text is not part of the
# historian's own name.
entities_list = []
for place_mention in artchives_places:
    place_mention["historian_name"] = historian_name_by_uri[place_mention["historian"]]
    if place_mention["name"] not in place_mention["historian_name"] and place_mention["qid"] != 'not found':
        entities_list.append("<http://www.wikidata.org/entity/"+place_mention["qid"]+">")

# De-duplicate across the whole list (order-preserving) before chunking, so an
# IRI is not requested again in a later chunk.
unique_entities = list(dict.fromkeys(entities_list))

# words in a type label that mark a place as a geographic location
loc_list = ['country', 'city', 'village', 'capital', 'state', 'region', 'municipality', 'county', 'frazione', 'comune', 'city-state', 'enclave']

for group_entities in chunks(unique_entities, 100):
    entities_tbr = " ".join(group_entities)
    wd_coordinates = """
        PREFIX wdt: <http://www.wikidata.org/prop/direct/>
        SELECT DISTINCT ?place ?place_label ?coord (group_concat(?type_label ; separator="; ") as ?label) 
        WHERE {
            VALUES ?place {"""+entities_tbr+"""} . 
            ?place wdt:P625 ?coord.
            OPTIONAL {?place rdfs:label ?place_label . FILTER (langMatches(lang(?place_label), "EN")) .} 
            OPTIONAL {?place wdt:P31 ?type . ?type rdfs:label ?type_label . FILTER (langMatches(lang(?type_label), "EN"))} 
            
            } 
            group by ?place ?place_label ?coord ?label
        """

    # set the endpoint, the query and the returned format, then run it
    sparql_wd = SPARQLWrapper(wikidata_endpoint)
    sparql_wd.setQuery(wd_coordinates)
    sparql_wd.setReturnFormat(JSON)
    results = sparql_wd.query().convert()

    for result in results["results"]["bindings"]:
        place = result["place"]["value"]
        place_name = result["place_label"]["value"] if "place_label" in result else "no name"
        print(place, place_name)
        # Drops the first six characters and the last one, then splits on a space;
        # presumably strips the "Point(" ... ")" wrapper of a WKT literal - confirm.
        coord = result["coord"]["value"][6:-1].split(" ")
        # only the first of the concatenated type labels is inspected
        type_label = result["label"]["value"].split("; ")[0] if "label" in result else "no name"
        type_label_list = type_label.split(" ")
        loc = "geoloc" if any(item in loc_list for item in type_label_list) else 'institution'

        g.add(( URIRef(place) , RDFS.label , Literal(place_name) ))
        g.add(( URIRef(place) , wdt.P625 , Literal(coord) ))
        g.add(( URIRef(place) , RDFS.comment , Literal(loc) ))

### Link new places to historians

In [None]:
# Attach each reconciled place to its historian, but only when the place
# already has a label in the graph.
for historian in artchives_places:
    if historian["qid"] != 'not found':
        qid = URIRef("http://www.wikidata.org/entity/"+historian["qid"])
    else:
        qid = None
    if not qid:
        continue
    if (qid, RDFS.label, None) in g:
        g.add(( URIRef(historian["historian"]) , wdt.P7153 , qid )) # P7153 significant place

# write the whole graph to disk as N-Quads
g.serialize(destination='RQ1.nq', format='nquads')

### Convert to JSON.

In [None]:
# One dict per place: QID, label, coords, type and the list of
# (historian uri, historian label) pairs linked to it.
data_RQ1 = []

g = rdflib.ConjunctiveGraph()

# parse a local RDF file by specifying the format
result = g.parse("RQ1.nq", format='nquads')

qres = g.query(
    """
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    SELECT DISTINCT ?placeqid ?placename ?coords ?placetype ?historianuri ?historianlabel 
       WHERE {
          ?placeqid wdt:P625 ?coords .
          OPTIONAL {?placeqid rdfs:label ?placename ; rdfs:comment ?placetype .}
          ?historianuri (wdt:P7153 | wdt:P108 | wdt:P69) ?placeqid ; rdfs:label ?historianlabel
       }""")

# Group the rows by place in a single pass (previously every place rescanned
# all rows). Places appear in first-seen row order, not set order.
places_by_qid = {}
for row in qres:
    place_dict = places_by_qid.setdefault(row[0], {})
    # scalar fields are overwritten by later rows of the same place
    place_dict["QID"] = str(row[0])
    place_dict["label"] = str(row[1])
    place_dict["coords"] = str(row[2])
    place_dict["type"] = str(row[3])
    place_dict.setdefault("historians", []).append(( str(row[4]), str(row[5]) ))

unique_places = list(places_by_qid)
data_RQ1.extend(places_by_qid.values())

In [None]:
# One dict per historian: uri, label and the distinct places linked to them,
# each suffixed with " (loc)" or " (inst)".
data_RQ1_hist = []

# Group the rows by historian in a single pass (previously every historian
# rescanned all rows).
historians_by_uri = {}
for row in qres:
    hist_dict = historians_by_uri.setdefault(row[4], {})
    hist_dict["historian"] = str(row[4])
    hist_dict["label"] = str(row[5])
    loc_type = " (loc)" if str(row[3]) == "geoloc" else " (inst)"
    # the long country name is shortened and always tagged as a location
    place = "United States (loc)" if str(row[1]) == "United States of America" else str(row[1])+loc_type
    hist_dict.setdefault("places", []).append(place)

for hist_dict in historians_by_uri.values():
    # drop duplicates but keep first-seen order, so the JSON is reproducible
    hist_dict["places"] = list(dict.fromkeys(hist_dict["places"]))

unique_historians = list(historians_by_uri)
data_RQ1_hist.extend(historians_by_uri.values())

In [None]:
# Write the per-place and per-historian summaries as indented JSON files.
for out_path, payload in (('RQ1.json', data_RQ1), ('RQ1_hist.json', data_RQ1_hist)):
    with open(out_path, 'w') as f:
        json.dump(payload, f, indent=4)