In [None]:
!pip install pyoai
!pip install tabulate

# Prosecution Project OAI-PMH Demo

In [None]:
# pyoai provides a nice python API for OAI-PMH (http://infrae.com/download/OAI/pyoai)
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader
# other libraries we'll use
from datetime import datetime
from collections import defaultdict
from IPython.display import HTML, display
import tabulate

Simple OAI-PMH harvester in Python:

In [None]:
# Harvest every oai_dc record from the OAI-PMH endpoint into `records`
# (a list of metadata field maps) and report how long the harvest took.
URL = 'https://oai.prosecutionproject-test.griffith.edu.au/oai'
records = []

# setup harvester: the registry tells the client how to parse 'oai_dc' metadata
registry = MetadataRegistry()
registry.registerReader('oai_dc', oai_dc_reader)
client = Client(URL, registry, force_http_get=True)

# track the start time of the harvest
_start = datetime.now()

# fetch all records from oai
for record in client.listRecords(metadataPrefix='oai_dc'):
    # record[1] is the parsed metadata; pyoai yields None here for records
    # without a <metadata> element (e.g. deleted records), so skip those
    # instead of crashing on None.getMap().
    _metadata = record[1]
    if _metadata is None:
        continue
    records.append(_metadata.getMap())

# record finish time
_finish = datetime.now()

# work out how long the harvester took
t_d = (_finish - _start).total_seconds()
print("Harvested {} records in {} seconds".format(len(records), t_d))

Create a naive top 10 list of the most common offences in the data

In [None]:
# Do a simple occurrence count of offences (without any data cleaning) and
# render the ten most frequent ones as an HTML table, with one extra column
# per jurisdiction.
# NOTE(review): the parsing below assumes every record has a title containing
# at least one comma (offence = second comma-separated field) and an identifier
# whose last ':'-separated part starts with "<jurisdiction>/". Records that do
# not match will raise IndexError -- confirm against the feed.
offences = defaultdict(int)                      # offence -> count over all records
offpjur = defaultdict(lambda: defaultdict(int))  # jurisdiction -> offence -> count
jurisdiction = set()
for _rec in records:
    _offence = _rec['title'][0].split(',')[1].strip().title()
    _jurisdiction = _rec['identifier'][0].split(':')[-1].split('/')[0]
    jurisdiction.add(_jurisdiction)
    offences[_offence] += 1
    offpjur[_jurisdiction][_offence] += 1

# remove unknown offences; pop() with a default because `del` on a defaultdict
# raises KeyError when the key was never counted
offences.pop('[Unknown Offence]', None)

# fixed column order shared by the table rows and the headers
_jurisdictions = sorted(jurisdiction)

# find the top 10 offences across all records and jurisdictions
top10 = [[_of, _count] for _of, _count in sorted(offences.items(), reverse=True, key=lambda x: x[1])[:10]]
# add the counts for each jurisdiction
top10 = [row + [offpjur[_j][row[0]] for _j in _jurisdictions] for row in top10]

# make it look pretty ('SC' is stripped from the jurisdiction codes for the column headers)
_headers = ['Offence', 'Total'] + [x.replace('SC', '') for x in _jurisdictions]
display(HTML(tabulate.tabulate(top10, tablefmt='html', headers=_headers)))