# EDA

## Imports

In [None]:
# rdflib
import rdflib
from rdflib import Namespace , Literal , URIRef
from rdflib.namespace import RDF , RDFS
from SPARQLWrapper import SPARQLWrapper, JSON

# utils
import ssl, os.path, json, requests , ast
from collections import defaultdict
import itertools

# spacy
import spacy
import random
from spacy.util import minibatch, compounding
from pathlib import Path

# Load spaCy's small English pipeline. The model has to be installed first:
#   python3 -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# data proc 
import pandas as pd
import numpy as np
#from apyori import apriori
from efficient_apriori import apriori

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth
from mlxtend.frequent_patterns import association_rules

# graph data
import networkx as nx
from networkx.algorithms import bipartite

# data viz
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

# maps
from ipywidgets import HTML
from ipyleaflet import Map, Marker, Popup, LayersControl, AwesomeIcon

## RQ1. What are the places of education and activity of the selected art historians?


### Map of the places of art historians

In [None]:
# Interactive leaflet map with one marker per place or institution in RQ1.json.
center = (41.080684, -30.683374)
m = Map(center=center, zoom=3, close_popup_on_click=False)

# Open the file unconditionally: a missing RQ1.json now fails right here with a
# FileNotFoundError instead of a NameError (or stale kernel data) further down.
# The context manager also guarantees the handle is closed.
with open("RQ1.json", encoding="utf-8") as f:
    data_RQ1 = json.load(f)

for place in data_RQ1:
    # Drop repeated historian rows while keeping their original order, so the
    # popup text is identical from one run to the next.
    historians = [list(row) for row in dict.fromkeys(tuple(row) for row in place["historians"])]
    # Index 1 of each row is used as the display name
    # (presumably rows are [id, name] pairs - confirm against the data file).
    names = [hist[1] for hist in historians]
    namelist = "; ".join(names)
    coords = ast.literal_eval(place["coords"])

    # Generic locations get a blue pin, everything else a green "bank" icon.
    is_geoloc = place["type"] == "geoloc"
    icon2 = AwesomeIcon(
        name="map-marker" if is_geoloc else "bank",
        marker_color="blue" if is_geoloc else "green",
        icon_color="white",
        spin=False,
    )
    # The two stored coordinates are swapped when building the marker location.
    marker = Marker(icon=icon2, location=(coords[1], coords[0]))
    m.add_layer(marker)

    # Popup: place label in bold, followed by the historians linked to it.
    message = HTML()
    marker.popup = message
    message.description = ""
    message.value = "<b>" + place["label"] + "</b>" + "<br/>" + namelist

m

### Map of the distribution of art historians

In [None]:
# Load the place records (the same RQ1.json used for the map) into a DataFrame
# and preview the first rows.
df = pd.read_json("RQ1.json")
df.head()

In [None]:
# Split the bracketed "long, lat" string of each place into two float columns.
df[['long', 'lat']] = df['coords'].str.strip('[]').str.split(', ', expand=True)
df['long'] = df['long'].str.replace("'", '').astype(float)
df['lat'] = df['lat'].str.replace("'", '').astype(float)

# Display string and head-count of the historians attached to each place.
# NOTE(review): unlike the leaflet map cell, repeated historian rows are NOT
# de-duplicated here, so `count` includes duplicates - confirm this is intended.
df['people'] = ["; ".join([e[1] for e in i]) for i in df['historians'].tolist()]
df['count'] = df.historians.apply(len)

# Prefer a token supplied through the environment so credentials do not have to
# live in the notebook. The literal is kept only as a fallback so the notebook
# still runs as before.
# NOTE(review): it looks like a public ("pk.") Mapbox token; consider rotating
# it and dropping the fallback.
tok = os.environ.get(
    "MAPBOX_ACCESS_TOKEN",
    'pk.eyJ1IjoibWFyaWxlbmFkYXF1aW5vIiwiYSI6ImNpeWx6MjZ3dTAwMjkzMmpyZGR2ejVqZHoifQ.AjYdPJPgptZiqrLSrrHSxQ',
)
px.set_mapbox_access_token(tok)
fig = px.scatter_mapbox(df, lat="lat", lon="long", size_max=40, color="type",
    size="count", hover_data=["people"], hover_name='label', zoom=2, title='Distribution of historians')
# `update_layout` returns the figure, so as the last expression of the cell it
# also renders the map; an extra fig.show() would display it twice.
fig.update_layout(mapbox_accesstoken=tok)

In [None]:
# Bar chart of the number of historians per place, coloured by place type.
# A bar chart has no map layer, so the Mapbox token that was copied over from
# the scatter-map cell is not needed; this cell no longer depends on `tok`.
fig = px.bar(df, x="label", y="count",  hover_data=["count"], color="type", hover_name='label', title='Distribution of historians')
fig.show()

The most represented places are USA (New York and Los Angeles) and Europe: Rome, Munich and Freiburg, Florence and London.

The most represented institutions include:

 * USA
   * East Coast: NY Universities and museums. Harvard University
   * West Coast: Los Angeles, The Getty Museum
 * Europe
   * Italy: Rome area and Sapienza University. Florence and Villa I Tatti. Bologna. Turin and the University of Turin.
   * Germany: Munich and the Max Planck Society. Berlin and the Humboldt University. Freiburg
   * UK: London, the British Museum and the National Gallery. Oxford and its colleges. Birmingham 

### Relations between art historians 

Based on the network of places and institutions

In [None]:
# One transaction per place: the distinct historians attached to it.
# `str.split` already yields a one-element list when the separator is absent,
# so no special case is needed for places with a single historian.
people = [list(set(names.split('; '))) for names in df['people'].tolist()]
te = TransactionEncoder()
te_ary = te.fit(people).transform(people)
dfrel = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets_historians = fpgrowth(dfrel, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets_historians, metric="confidence", min_threshold=0.1)
rules['lhs items'] = rules['antecedents'].apply(len)
# Replace frozen sets with strings (sorted, so axis labels are stable across runs)
rules['antecedents_'] = rules['antecedents'].apply(lambda a: ','.join(sorted(a)))
rules['consequents_'] = rules['consequents'].apply(lambda a: ','.join(sorted(a)))
# Keep only rules with more than one antecedent and turn them into an
# antecedents x consequents matrix of lift values
multi_lhs_rules = rules[rules['lhs items'] > 1]
pivot = multi_lhs_rules.pivot(index='antecedents_', columns='consequents_', values='lift')
# Heatmap of the lift matrix with per-cell annotations
sns.heatmap(pivot, annot=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

Significant cooccurrence of historians:
 
 * W. Cohn, R. Krautheimer, L. Steinberg
 * E. Steinmann, R. Krautheimer, W. Lotz
 * O. L-Brockhaus, R. Krautheimer, W. Lotz
 * F. Zeri, L. Steinberg, L. Salerno
 * W. Cohn, K. Badt
 
 

Based on only institutions they have in common

In [None]:
# Independent copy of the place table without the rows typed 'geoloc',
# i.e. only the remaining (institution) rows, followed by a quick preview.
dfinst = df[df["type"] != 'geoloc'].copy()
dfinst.head()

In [None]:
# One transaction per institution: the distinct historians attached to it.
# `str.split` already yields a one-element list when the separator is absent,
# so no special case is needed for institutions with a single historian.
people_inst = [list(set(names.split('; '))) for names in dfinst['people'].tolist()]
te = TransactionEncoder()
te_ary = te.fit(people_inst).transform(people_inst)
dfinstt = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets_historians_inst = fpgrowth(dfinstt, min_support=0.02, use_colnames=True)
rules = association_rules(frequent_itemsets_historians_inst, metric="confidence", min_threshold=0.1)
rules['lhs items'] = rules['antecedents'].apply(len)
# Replace frozen sets with strings (sorted, so axis labels are stable across runs)
rules['antecedents_'] = rules['antecedents'].apply(lambda a: ','.join(sorted(a)))
rules['consequents_'] = rules['consequents'].apply(lambda a: ','.join(sorted(a)))
# Keep only rules with more than one antecedent and turn them into an
# antecedents x consequents matrix of lift values
multi_lhs_rules = rules[rules['lhs items'] > 1]
pivot = multi_lhs_rules.pivot(index='antecedents_', columns='consequents_', values='lift')
# Heatmap of the lift matrix with per-cell annotations
sns.heatmap(pivot, annot=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

Significant cooccurrences:
 
 * E. Steinmann , O. L-Brockhaus
 * E. Fahy + L. Steinberg, F. Zeri
 * W. Cohn, W. Lotz
 

### Network of historians 

Based on places and institutions in common

In [None]:
# Weighted co-occurrence network: two historians are linked when they share at
# least one place, and the edge weight is the number of places they share.
unique_names = set(itertools.chain.from_iterable(people))

# Build each place's member set once instead of once per candidate pair.
place_members = [set(group) for group in people]

# Get all combinations of pairs. Sorting first makes the node order, and
# therefore the circular layout, reproducible across runs.
all_pairs = list(itertools.combinations(sorted(unique_names), 2))

# (name_a, name_b, number of shared places), keeping only pairs that co-occur
hp_relations = [
    (name_a, name_b, sum(1 for members in place_members if name_a in members and name_b in members))
    for name_a, name_b in all_pairs
]
hp_relations = [relation for relation in hp_relations if relation[2] > 0]

G = nx.Graph()
G.add_weighted_edges_from(hp_relations)

plt.figure(1, figsize=(100, 100))

pos = nx.circular_layout(G)  # pos = nx.nx_agraph.graphviz_layout(G)
nx.draw_networkx_nodes(G, pos, node_color='green', node_size=7500)

# Label only the nodes that are actually in the graph. Historians who never
# share a place with anyone have no edge, hence no node and no entry in `pos`;
# labelling them would raise a KeyError.
labels = {node: str(node) for node in G.nodes}
nx.draw_networkx_labels(G, pos, labels, font_size=120)

# Draw every edge with a thickness proportional to its weight.
edge_list = list(G.edges())
edge_widths = [G[node1][node2]['weight'] * 7.0 for node1, node2 in edge_list]
nx.draw_networkx_edges(G, pos, edgelist=edge_list, edge_color='g', width=edge_widths, alpha=0.4)
plt.show()

The most connected historians seem to be:

 * R. Krautheimer > W. Lotz (confirmed in the biography of Lotz and collection of R.K), L. Steinberg and W. Cohn (missing E. Steinmann)
 * W. Lotz > R. Krautheimer, Aby Warburg and W. Cohn
 * W. Cohn > R. Krautheimer, L. Steinberg, U. Middeldorf, W. Lotz and E. Steinmann
 * F. Zeri > L. Steinberg, L. Salerno, E. Waterhouse (missing J. Pope-Hennessy and R. Longhi)
 * L. Salerno > F. Zeri and L. Steinberg

### Historians' degree of centrality in the network of academic relations

**<font color='red'>Influential historians</font>** What are the most influential historians? Based on the network of places they are connected to.

In [None]:
# Degree centrality of every historian in G, plotted in ascending order.
cent = nx.degree_centrality(G)
sorted_centrality = dict(sorted(cent.items(), key=lambda entry: entry[1]))

bar_positions = range(len(sorted_centrality))
bar_heights = list(sorted_centrality.values())
bar_names = list(sorted_centrality.keys())

plt.bar(bar_positions, bar_heights, align='center')
plt.xticks(bar_positions, bar_names, rotation=90)
plt.show()

Central historians (with a degree higher than 0.8) are:

 * F. Zeri
 * L. Steinberg
 * W. Lotz
 * L. Salerno
 * L. Vertova

### Relations between places and institutions 

Based on historians' paths.


In [None]:
# One transaction per historian: the list of places stored in the `places`
# column of RQ1_hist.json.
dfh = pd.read_json("RQ1_hist.json")
places = dfh['places'].tolist()

te = TransactionEncoder()
te_ary = te.fit(places).transform(places)
dfp = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = fpgrowth(dfp, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules['lhs items'] = rules['antecedents'].apply(len)
# Replace frozen sets with strings (sorted, so axis labels are stable across runs)
rules['antecedents_'] = rules['antecedents'].apply(lambda a: ','.join(sorted(a)))
rules['consequents_'] = rules['consequents'].apply(lambda a: ','.join(sorted(a)))
# Transform the rules into an antecedents x consequents matrix of lift values.
# The >= 1 filter is kept from the original; presumably every rule passes it.
selected_rules = rules[rules['lhs items'] >= 1]
pivot = selected_rules.pivot(index='antecedents_',
                             columns='consequents_', values='lift')
# Heatmap of the lift matrix with per-cell annotations
sns.heatmap(pivot, annot=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

Most relevant rules:
 * Rule 0-3: Hertziana, Rome and Germany / Munich
 * Rule 7-8: New York University and Berlin. 
 * Rule 5-6: London or USA and Rome. 

### Relations between institutions 

Based on historians' paths.


In [None]:
# Keep only the entries not tagged "(loc)" in each historian's path, and drop
# historians left with nothing after that filter.
institutions = [[place for place in place_list if "(loc)" not in place] for place_list in places]
institutions = [x for x in institutions if x]

# Fit the encoder on the same transactions it transforms. Fitting on `places`
# added a column for every "(loc)" entry that could only ever be False.
te = TransactionEncoder()
te_ary = te.fit(institutions).transform(institutions)
dfi = pd.DataFrame(te_ary, columns=te.columns_)

frequent_itemsets = fpgrowth(dfi, min_support=0.05, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
rules['lhs items'] = rules['antecedents'].apply(len)
# Replace frozen sets with strings (sorted, so axis labels are stable across runs)
rules['antecedents_'] = rules['antecedents'].apply(lambda a: ','.join(sorted(a)))
rules['consequents_'] = rules['consequents'].apply(lambda a: ','.join(sorted(a)))
# Transform the rules into an antecedents x consequents matrix of lift values.
# The >= 1 filter is kept from the original; presumably every rule passes it.
selected_rules = rules[rules['lhs items'] >= 1]
pivot = selected_rules.pivot(index='antecedents_',
                             columns='consequents_', values='lift')
# Heatmap of the lift matrix with per-cell annotations
sns.heatmap(pivot, annot=True)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

Significant cooccurrence of institutions:

 * Hertziana, NY University, and Vassar College
 * Columbia University, NY University, and MET museum

In [None]:
# The imports cell binds `apriori` twice: first from efficient_apriori, then
# from mlxtend.frequent_patterns, so the bare name refers to the mlxtend
# function here. This call passes `min_confidence` and `output_transaction_ids`,
# which the original call was evidently written for efficient_apriori to handle,
# so import that function under an unambiguous alias.
from efficient_apriori import apriori as efficient_apriori

# The transactions are passed as tuples of institution names.
institutions = [tuple(l) for l in institutions]
itemsets, rules = efficient_apriori(institutions, min_support=0.05, min_confidence=0.7, output_transaction_ids=True)
print(itemsets)