## Building a Database of Performances

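This notebook is likely to be a bit more complex than the others: it combines the title, date and genre annotations with task and volume metadata to build a single table of performances.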

In [192]:
import pandas

In [193]:
# Load the gzip-compressed JSON file of transcription annotations into a DataFrame.
anno_df = pandas.read_json('../data/transcriptions.gz', compression='gzip')
# Preview the first rows to check the columns that the later cells rely on.
anno_df.head()

Unnamed: 0,body,created,creator,generated,generator,id,motivation,partOf,tag,target,transcription,type
0,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-05-30T18:29:57Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Melo-Drama,Annotation
10,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-05-30T18:36:33Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Asiatic Melo-Dramatic Romance,Annotation
1009,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-03T21:42:23Z,,2018-06-04T09:43:57Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Comedy,Annotation
101,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:39:55Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,title,{u'source': u'https://api.bl.uk/metadata/iiif/...,Othello Travestie,Annotation
1019,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-03T21:42:23Z,,2018-06-04T09:43:57Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Comedy,Annotation


In [194]:
# Load the gzip-compressed JSON task export. Later cells look rows up with
# task_df.loc[int(task_id)], so the index is presumably the task ID - TODO confirm.
task_df = pandas.read_json('../data/pybossa_tasks.gz', compression='gzip')

In [195]:
def get_task_id(generator):
    """Return the task ID embedded in an annotation's generator value.

    Scans the generator entries for the first one whose ``id`` contains
    ``'api/task'`` and returns the last ``/``-separated segment of that ``id``.

    Args:
        generator: A list (or tuple) of dicts, or a single dict. Entries that
            are not dicts, or that have no ``id``, are skipped. Any other kind
            of value (for example None or NaN) is treated as having no task.

    Returns:
        The final path segment of the matching ``id``, or None when no entry
        matches.
    """
    # Accept a lone generator object as well as a list of them.
    if isinstance(generator, dict):
        generator = [generator]
    elif not isinstance(generator, (list, tuple)):
        # Missing values reach here when the column has gaps.
        return None

    for entry in generator:
        if not isinstance(entry, dict):
            continue
        # .get() avoids a KeyError on entries that carry no 'id'.
        entry_id = entry.get('id')
        if entry_id and 'api/task' in entry_id:
            return entry_id.split('/')[-1]

    return None

In [196]:
# Derive a task_id column by applying get_task_id to each row's generator value.
anno_df['task_id'] = anno_df['generator'].apply(get_task_id)
anno_df.head()

Unnamed: 0,body,created,creator,generated,generator,id,motivation,partOf,tag,target,transcription,type,task_id
0,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-05-30T18:29:57Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Melo-Drama,Annotation,78482
10,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-05-30T18:36:33Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Asiatic Melo-Dramatic Romance,Annotation,78801
1009,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-03T21:42:23Z,,2018-06-04T09:43:57Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Comedy,Annotation,76071
101,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:39:55Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,title,{u'source': u'https://api.bl.uk/metadata/iiif/...,Othello Travestie,Annotation,77426
1019,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-03T21:42:23Z,,2018-06-04T09:43:57Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,genre,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Comedy,Annotation,76075


## Add source column

This is needed later to find the links between different types of annotation.

In [197]:
def get_source(target):
    """Return the source of an annotation target.

    A dict target yields its ``'source'`` entry; any other value is returned
    unchanged.
    """
    return target['source'] if isinstance(target, dict) else target

In [198]:
# Add a source column: the 'source' entry of dict targets, or the target itself otherwise.
anno_df['source'] = anno_df['target'].apply(get_source)

## Map titles

Start with the titles, as these will be our root element.

In [199]:
def get_tag_df(tag):
    """Return the rows of ``anno_df`` that carry the given tag.

    The ``transcription`` column of the result is renamed to the tag itself
    and the ``tag`` column is removed.

    Args:
        tag: Value to match against the ``tag`` column of ``anno_df``.

    Returns:
        A new DataFrame holding only the matching rows.
    """
    has_tag = anno_df['tag'] == tag
    return (
        anno_df[has_tag]
        .rename(columns={'transcription': tag})
        .drop('tag', axis=1)
    )

This becomes our main dataframe

In [200]:
# Title annotations only, with the transcription text held in a 'title' column.
df = get_tag_df('title')
df.head()

Unnamed: 0,body,created,creator,generated,generator,id,motivation,partOf,target,title,type,task_id,source
101,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:39:55Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,Othello Travestie,Annotation,77426,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...
102,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:40:38Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Revenge; Or, the Captive Moor",Annotation,77430,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...
103,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:42:59Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Jew of Lubeck Or, the Heart of a Father",Annotation,74944,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...
104,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:44:28Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Night Hag Or, St. Swithin's Chair",Annotation,75378,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...
105,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:44:49Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,Woodman's Hut,Annotation,75396,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...


## Map dates

In [201]:
# Date annotations only, with the transcription text held in a 'date' column.
dates_df = get_tag_df('date')

In [202]:
# Only the date value, the join key and the date's own task ID are carried over.
cols = ['date', 'source', 'task_id']
# Left join keeps every title row. task_id exists on both sides, so the suffixes
# split it into task_id_title and task_id_date.
# NOTE(review): a source with more than one date annotation would repeat its
# title row - confirm sources are unique in dates_df.
df = df.merge(dates_df[cols], on='source', how='left', suffixes=('_title', '_date'))
df.head()

Unnamed: 0,body,created,creator,generated,generator,id,motivation,partOf,target,title,type,task_id_title,source,date,task_id_date
0,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:39:55Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,Othello Travestie,Annotation,77426,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,1836-12-08,83095
1,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:40:38Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Revenge; Or, the Captive Moor",Annotation,77430,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,1850-10-11,83096
2,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:42:59Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Jew of Lubeck Or, the Heart of a Father",Annotation,74944,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,1828-04-11,82838
3,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:44:28Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,"Night Hag Or, St. Swithin's Chair",Annotation,75378,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,1828-01-29,82997
4,"[{u'type': u'TextualBody', u'purpose': u'descr...",2018-06-01T16:44:49Z,,2018-06-04T09:43:56Z,"[{u'homepage': u'https://www.libcrowds.com', u...",https://annotations.libcrowds.com/annotations/...,describing,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,{u'source': u'https://api.bl.uk/metadata/iiif/...,Woodman's Hut,Annotation,75396,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,1830-11-09,83004


We now have titles mapped against dates

In [203]:
# Show just the two columns mapped so far.
df[['title', 'date']].head()

Unnamed: 0,title,date
0,Othello Travestie,1836-12-08
1,"Revenge; Or, the Captive Moor",1850-10-11
2,"Jew of Lubeck Or, the Heart of a Father",1828-04-11
3,"Night Hag Or, St. Swithin's Chair",1828-01-29
4,Woodman's Hut,1830-11-09


## Mapping the genres

Mapping the genres requires us to go back to the PYBOSSA task that was used to create each transcription. We have a match if the target of a genre task is the same as the target of a title annotation.

In [204]:
# Genre annotations only, with the transcription text held in a 'genre' column.
genres_df = get_tag_df('genre')

In [205]:
def map_fragment_from_task(task_id):
    """Return the target selector value recorded on a task.

    Args:
        task_id: Task identifier; it is converted with ``int()`` and used as a
            label lookup on ``task_df``.

    Returns:
        ``task['info']['target']['selector']['value']`` for the matching task,
        or None when ``task_id`` cannot be converted to an integer (for
        example None or NaN) or no such task exists in ``task_df``.
    """
    # A row with no task ID is treated the same way as a task that is not found.
    try:
        task_key = int(task_id)
    except (TypeError, ValueError):
        return None

    try:
        task = task_df.loc[task_key]
    except KeyError:
        return None

    return task['info']['target']['selector']['value']

In [206]:
# Look up the fragment for every genre row and every title row from the task
# that produced it, so both frames share a 'fragment' column to join on.
genres_df['fragment'] = genres_df['task_id'].apply(map_fragment_from_task)
df['fragment'] = df['task_id_title'].apply(map_fragment_from_task)

We can't easily merge on the target itself because dictionary columns are not hashable, so we merge on the source together with the fragment extracted by the lookup function above.

In [207]:
cols = ['genre', 'source', 'fragment']
# pandas matches null merge keys against each other, so a title and a genre whose
# task lookups both returned None would be joined purely on their shared source.
# Genre rows without a fragment are therefore left out of the join; titles with
# no matching genre still come through with a missing genre because of how='left'.
df = df.merge(
    genres_df.dropna(subset=['fragment'])[cols],
    on=['source', 'fragment'],
    how='left',
)

## Add a link back to the original image

Again, using the task ID.

In [208]:
def map_link_from_task(task_id):
    """Return the link recorded on a task.

    Args:
        task_id: Task identifier; it is converted with ``int()`` and used as a
            label lookup on ``task_df``.

    Returns:
        ``task['info']['link']`` for the matching task, or None when
        ``task_id`` cannot be converted to an integer (for example None or
        NaN) or no such task exists in ``task_df``.
    """
    # A row with no task ID is treated the same way as a task that is not found.
    try:
        task_key = int(task_id)
    except (TypeError, ValueError):
        return None

    try:
        task = task_df.loc[task_key]
    except KeyError:
        return None

    return task['info']['link']

In [209]:
# Look up each row's link from the task that produced its title annotation.
df['link'] = df['task_id_title'].apply(map_link_from_task)

In [210]:
# Keep only the columns wanted in the final table, in this order; the raw
# annotation fields and helper columns (task IDs, fragment) are dropped here.
df = df[['partOf', 'source', 'title', 'date', 'genre', 'link']]
df.head()

Unnamed: 0,partOf,source,title,date,genre,link
0,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Othello Travestie,1836-12-08,Burletta,http://access.bl.uk/item/viewer/ark:/81055/vdc...
1,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,"Revenge; Or, the Captive Moor",1850-10-11,Classical Tragedy,http://access.bl.uk/item/viewer/ark:/81055/vdc...
2,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,"Jew of Lubeck Or, the Heart of a Father",1828-04-11,After-Piece,http://access.bl.uk/item/viewer/ark:/81055/vdc...
3,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,"Night Hag Or, St. Swithin's Chair",1828-01-29,,http://access.bl.uk/item/viewer/ark:/81055/vdc...
4,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,https://api.bl.uk/metadata/iiif/ark:/81055/vdc...,Woodman's Hut,1830-11-09,,http://access.bl.uk/item/viewer/ark:/81055/vdc...


## Map volume metadata


In [211]:
# Read the volume metadata and index it by manifest_uri in one expression.
# verify_integrity makes set_index raise if any manifest_uri value is duplicated.
volume_md_df = pandas.read_csv('../metadata/volume.csv').set_index(
    'manifest_uri', verify_integrity=True
)

In [213]:
# Volume-level fields to attach to every performance row.
cols = ['theatre', 'city', 'country']
# manifest_uri was made the index of volume_md_df above, so right_on names an
# index level here rather than a column. The left join keeps every row of df.
df = df.merge(volume_md_df[cols], left_on='partOf', right_on='manifest_uri', how='left')

In [214]:
# Write the combined table as UTF-8 CSV, leaving out the DataFrame index.
df.to_csv('../data/performances.csv', encoding='utf-8', index=False)

# NOTE(review): pybossa_tasks_df is not referenced anywhere else in the visible
# notebook, and the lookups above use task_df (loaded from pybossa_tasks.gz).
# This looks like a leftover cell - confirm before removing it.
# read_pickle unpickles the file, which can execute arbitrary code; only run
# this on a file from a trusted source.
pybossa_tasks_df = pandas.read_pickle('../data/pybossa_tasks.pkl')