# Processing Zooniverse annotations

In [1]:
import json
from collections import ChainMap, OrderedDict
from pathlib import Path

import pandas as pd
from toolz import dicttoolz

First we grab all of the `csv` files containing exported data from zooniverse. 

In [2]:
csvs = list(Path("data/bl_books_annotations_analysis/31_08_2021").rglob("**/*.csv"))

We remove one rouge csv in which the annotations are all in different columns to the other `csv` files. We could correct this if we wanted but to keep things simple here we drop it. 

In [3]:
csvs.remove(
    Path(
        "data/bl_books_annotations_analysis/31_08_2021/msb1_undetermined_language/ms-digitised-books-undetermined-language-classifications.csv"
    )
)

In [4]:
csvs

[PosixPath('data/bl_books_annotations_analysis/31_08_2021/msbeng3_english3/ms-digitised-books-english-classifications.csv'),
 PosixPath('data/bl_books_annotations_analysis/31_08_2021/msbeng2_english2/ms-digitised-books-english-classifications.csv'),
 PosixPath('data/bl_books_annotations_analysis/31_08_2021/msb2_french_german/ms-digitised-books-french-and-german-classifications.csv'),
 PosixPath('data/bl_books_annotations_analysis/31_08_2021/msbeng1_english1/ms-digitised-books-english-classifications.csv'),
 PosixPath('data/bl_books_annotations_analysis/31_08_2021/vmorris_other_language/ms-digitised-books-multiple-languages-classifications.csv')]

We load each CSV into a dataframe

In [5]:
dfs = []
for csv in csvs:
    df = pd.read_csv(csv)
    if "annotations" in df.columns:
        dfs.append(df)

and concatenate them all into one big DataFrame. 

In [6]:
df = pd.concat(dfs)

In [7]:
df.columns

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids'],
      dtype='object')

In [8]:
df["subject_data"] = df["subject_data"].apply(lambda x: json.loads(x))

In [9]:
df["subject_data"]

0       {'44331392': {'retired': {'id': 62638916, 'wor...
1       {'44378089': {'retired': {'id': 62692902, 'wor...
2       {'44378423': {'retired': {'id': 62689341, 'wor...
3       {'44373317': {'retired': {'id': 62686164, 'wor...
4       {'44365516': {'retired': {'id': 62682180, 'wor...
                              ...                        
1747    {'44305891': {'retired': {'id': 62611481, 'wor...
1748    {'44305865': {'retired': {'id': 62611685, 'wor...
1749    {'44305738': {'retired': {'id': 62611613, 'wor...
1750    {'44305899': {'retired': {'id': 62611460, 'wor...
1751    {'44305720': {'retired': {'id': 62611824, 'wor...
Name: subject_data, Length: 4398, dtype: object

In [10]:
df_subject_data = df["subject_data"].apply(lambda x: x[next(iter(x))]).apply(pd.Series)

In [11]:
df_subject_data

Unnamed: 0,retired,Image,Title,BL record ID,BL shelfmark,Link to Explore,Link to digitised book,Topics from MARC record,Languages from MARC record,Genre terms from MARC record,Literary form terms from MARC record,Language of original from MARC record,Literary form/genre suggestions from Christin Hoene
0,"{'id': 62638916, 'workflow_id': 14449, 'classi...",vdc_00000002F970_title_page.jpg,Sonnets,014616869,Digital Store 11652.i.20,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010146...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,English,,,,
1,"{'id': 62692902, 'workflow_id': 14449, 'classi...",vdc_000000033150_title_page.jpg,Cousin Simon [A novel.],014818614,Digital Store 12641.c.24,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,English,,,,Novels
2,"{'id': 62689341, 'workflow_id': 14449, 'classi...",vdc_000000035304_title_page.jpg,Austria and the Austrians [by Wenzel Blumenbach.],014804360,Digital Store 791.e.22,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,English,,,,
3,"{'id': 62686164, 'workflow_id': 14449, 'classi...",vdc_00000002150A_title_page.jpg,The Banquet: in three cantos [By Hans Busk.],014939397,Digital Store 11647.e.48. (1.),http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010149...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,English,,,,
4,"{'id': 62682180, 'workflow_id': 14449, 'classi...",vdc_0000000592E6_title_page.jpg,The Exeter Road. The story of the West of Engl...,014812769,Digital Store 10347.f.45,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,English,,,,Fiction
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1747,"{'id': 62611481, 'workflow_id': 14412, 'classi...",vdc_00000001C782_title_page.jpg,"Danskhedens Skjæbne i Slesvig, udarbeidet efte...",014895984,Digital Store 9425.c.27,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,Danish,,,,
1748,"{'id': 62611685, 'workflow_id': 14412, 'classi...",vdc_00000001CCA4_title_page.jpg,Om Robert Molesworth's Skrift 'An Account of D...,014896128,Digital Store 9424.i.5,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,Danish,,,,
1749,"{'id': 62611613, 'workflow_id': 14412, 'classi...",vdc_00000002043C_title_page.jpg,Oldsagen om Danske Konger ... Aftrykt af N. M....,014919359,Digital Store 9425.bb.28,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010149...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,Danish,,,,
1750,"{'id': 62611460, 'workflow_id': 14412, 'classi...",vdc_0000000169D4_title_page.jpg,Fremstilling af forskjellige kjøbenhavnske kom...,014879625,Digital Store 10280.f.8,http://explore.bl.uk/BLVU1:LSCOP-ALL:BLL010148...,http://access.bl.uk/item/viewer/ark:/81055/vdc...,,Danish,,,,


In [12]:
df["annotations"] = df["annotations"].apply(lambda x: json.loads(x))

In [13]:
task_list = [
    "T0",
    "T1",
    "T3",
    "T5",
    "T7",
    "T8",
    "T9",
    "T10",
    "T11",
    "T12",
    "T13",
    "T14",
    "T15",
    "T17",
    "T16",
    "T6",
    "T2",
    "T4",
]

In [14]:
df["annotations"].iloc[0]

[{'task': 'T0',
  'task_label': '**The title page of a book should appear on the left. You will be asked to provide brief metadata relating to this book.**\n\nClick on the small "i" icon below the title page; a pop-up window will appear, containing links to allow you to access:\n- the record for the book in [Explore](http://explore.bl.uk)\n- the complete digitised book\n\nThis pop-up window also contains brief additional information about the book, which may help you answer some of the questions.\n\n*Please do not click on the Done button until you have completed cataloguing this book.*\n\n**Click on the grey button below to confirm that you are ready to start cataloguing, then click *Next*.**',
  'value': '> I understand the instructions and am ready to begin ...'},
 {'task': 'T1',
  'task_label': '**Is English the correct main language for this book?**\n\nIf there are multiple main languages (for example if the book is a bi-lingual dictionary) please select *No*.',
  'value': 'Yes'},

In [15]:
def update(x):
    subset = [{d["task"]: d["value"]} for d in x]
    subset = dict(ChainMap(*subset))
    for key in task_list:
        subset[key] = subset.get(key, "")
    return dict(OrderedDict(sorted(subset.items())))

In [16]:
df["annotations"].apply(update).iloc[0]

{'T0': '> I understand the instructions and am ready to begin ...',
 'T1': 'Yes',
 'T10': 'NONE',
 'T11': 'NONE',
 'T12': 'NONE',
 'T13': 'Fiction',
 'T14': '655 7 $aPoetry$2fast$0(OCoLC)fst01423828',
 'T15': 'NONE',
 'T16': '**Finished**',
 'T17': '',
 'T2': '',
 'T3': 'No',
 'T4': '',
 'T5': 'No',
 'T6': '',
 'T7': 'published by the author',
 'T8': 'Bury St. Edmunds',
 'T9': 'enk'}

In [17]:
df["annotations"] = df["annotations"].apply(update)

In [18]:
df_annotations = pd.DataFrame(list(df["annotations"]))

In [19]:
df_annotations["T12"].value_counts()

NONE                                 3893
none                                  102
Second edition                         27
New edition                            17
NONE\n                                 13
                                     ... 
Zweite unveränderte Auflage             1
1884                                    1
Zweite, vermehrte Auflage               1
Dritte neu durchgesehene Auflage        1
Danske folkeskrifter XXVIII, XXXI       1
Name: T12, Length: 276, dtype: int64

In [20]:
df_annotations["T1"].value_counts()

Yes    4188
No      210
Name: T1, dtype: int64

In [21]:
df_annotations.head(1)

Unnamed: 0,T0,T1,T10,T11,T12,T13,T14,T15,T16,T17,T2,T3,T4,T5,T6,T7,T8,T9
0,> I understand the instructions and am ready t...,Yes,NONE,NONE,NONE,Fiction,655 7 $aPoetry$2fast$0(OCoLC)fst01423828,NONE,**Finished**,,,No,,No,,published by the author,Bury St. Edmunds,enk


In [22]:
df_annotations["T13"].value_counts()

Non-fiction                                       3586
Fiction                                            787
The book contains both Fiction and Non-Fiction      15
Can't tell                                          10
Name: T13, dtype: int64

In [23]:
df_annotations["T14"].value_counts()

                                                                                                        3596
655 7 ǂaFictionǂ2fastǂ0(OCoLC)fst01423787                                                                206
655 7 $aPoetry$2fast$0(OCoLC)fst01423828                                                                  97
655 7 $aFiction$2fast$0(OCoLC)fst01423787                                                                 92
655 7 ǂaNovelsǂ2fastǂ0(OCoLC)fst01921742                                                                  71
                                                                                                        ... 
655 7 ǂaDetective and mystery stories, Englishǂ2fastǂ0(OCoLC)fst01750159                                   1
655 7 ǂaChristian fictionǂ2fastǂ0(OCoLC)fst01726556                                                        1
655 7 ǂaUtopian fictionǂ2fastǂ0(OCoLC)fst01922578\n655 7 ǂaScience fictionǂ2fastǂ0(OCoLC)fst01726489       1
655 7 ǂaAdventure s

In [24]:
assert len(df) == len(df_annotations)

In [25]:
df2 = df.join(df_annotations)

In [26]:
len(df2) == len(df_subject_data)

True

In [27]:
df3 = df2.join(df_subject_data)

Drop duplicate rows for `classification_id` these should be unique to each annotator + dataset

In [28]:
df3 = df3.drop_duplicates(subset="classification_id")

In [29]:
df3.reset_index(inplace=True, drop=True)

In [30]:
mapping = {
    "Title": "title",
    "T2": "main_language",
    "T3": "other_languages_summaries",
    "T4": "summaries_language",
    "T5": "translation",
    "T6": "original_language",
    "T7": "publisher",
    "T8": "place_pub",
    "T9": "country",
    "T10": "date_pub",
    "T11": "normalised_date_pub",
    "T12": "edition_statement",
    "T13": "genre",
    "T14": "FAST_genre_terms",
    "T15": "FAST_subject_terms",
    "T16": "status",
    "T17": "comments",
}

flag source as annotator 

In [32]:
mapping = dicttoolz.valmap(lambda x: "annotator_" + x, mapping)
mapping

{'Title': 'annotator_title',
 'T2': 'annotator_main_language',
 'T3': 'annotator_other_languages_summaries',
 'T4': 'annotator_summaries_language',
 'T5': 'annotator_translation',
 'T6': 'annotator_original_language',
 'T7': 'annotator_publisher',
 'T8': 'annotator_place_pub',
 'T9': 'annotator_country',
 'T10': 'annotator_date_pub',
 'T11': 'annotator_normalised_date_pub',
 'T12': 'annotator_edition_statement',
 'T13': 'annotator_genre',
 'T14': 'annotator_FAST_genre_terms',
 'T15': 'annotator_FAST_subject_terms',
 'T16': 'annotator_status',
 'T17': 'annotator_comments'}

In [33]:
df3 = df3.rename(columns=mapping)

In [34]:
df3.columns

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids',
       'T0', 'T1', 'annotator_date_pub', 'annotator_normalised_date_pub',
       'annotator_edition_statement', 'annotator_genre',
       'annotator_FAST_genre_terms', 'annotator_FAST_subject_terms',
       'annotator_status', 'annotator_comments', 'annotator_main_language',
       'annotator_other_languages_summaries', 'annotator_summaries_language',
       'annotator_translation', 'annotator_original_language',
       'annotator_publisher', 'annotator_place_pub', 'annotator_country',
       'retired', 'Image', 'annotator_title', 'BL record ID', 'BL shelfmark',
       'Link to Explore', 'Link to digitised book', 'Topics from MARC record',
       'Languages from MARC record', 'Genre terms from MARC record',
       'Literary form terms from MARC record',
       '

In [35]:
df3.annotator_genre.value_counts()

Non-fiction                                       2847
Fiction                                           1527
The book contains both Fiction and Non-Fiction      18
Can't tell                                           6
Name: annotator_genre, dtype: int64

In [36]:
df3.columns

Index(['classification_id', 'user_name', 'user_id', 'user_ip', 'workflow_id',
       'workflow_name', 'workflow_version', 'created_at', 'gold_standard',
       'expert', 'metadata', 'annotations', 'subject_data', 'subject_ids',
       'T0', 'T1', 'annotator_date_pub', 'annotator_normalised_date_pub',
       'annotator_edition_statement', 'annotator_genre',
       'annotator_FAST_genre_terms', 'annotator_FAST_subject_terms',
       'annotator_status', 'annotator_comments', 'annotator_main_language',
       'annotator_other_languages_summaries', 'annotator_summaries_language',
       'annotator_translation', 'annotator_original_language',
       'annotator_publisher', 'annotator_place_pub', 'annotator_country',
       'retired', 'Image', 'annotator_title', 'BL record ID', 'BL shelfmark',
       'Link to Explore', 'Link to digitised book', 'Topics from MARC record',
       'Languages from MARC record', 'Genre terms from MARC record',
       'Literary form terms from MARC record',
       '

Replace `user_name` with code

In [37]:
df3.user_name = df3.user_name.astype("category").cat.codes

In [39]:
columns = [
    column
    for column in df3.columns
    if not column
    in {
        "user_id",
        "user_ip",
        "workflow_id",
        "workflow_name",
        "workflow_version",
        "gold_standard",
        "expert",
        "metadata",
        "annotations",
        "subject_data",
        "annotator_status",
        "T0",
        "T1",
        "retired",
        "Image",
        "Link to Explore",
        "Topics from MARC record",
        "Languages from MARC record",
        "Genre terms from MARC record",
        "Literary form terms from MARC record",
        "Language of original from MARC record",
        "Literary form/genre suggestions from Christin Hoene",
    }
]

In [40]:
annotation_df = df3[columns]

In [41]:
annotation_df.columns

Index(['classification_id', 'user_name', 'created_at', 'subject_ids',
       'annotator_date_pub', 'annotator_normalised_date_pub',
       'annotator_edition_statement', 'annotator_genre',
       'annotator_FAST_genre_terms', 'annotator_FAST_subject_terms',
       'annotator_comments', 'annotator_main_language',
       'annotator_other_languages_summaries', 'annotator_summaries_language',
       'annotator_translation', 'annotator_original_language',
       'annotator_publisher', 'annotator_place_pub', 'annotator_country',
       'annotator_title', 'BL record ID', 'BL shelfmark',
       'Link to digitised book'],
      dtype='object')

In [42]:
annotation_df = annotation_df[annotation_df.annotator_genre.notna()]

In [43]:
annotation_df.head(1)

Unnamed: 0,classification_id,user_name,created_at,subject_ids,annotator_date_pub,annotator_normalised_date_pub,annotator_edition_statement,annotator_genre,annotator_FAST_genre_terms,annotator_FAST_subject_terms,...,annotator_summaries_language,annotator_translation,annotator_original_language,annotator_publisher,annotator_place_pub,annotator_country,annotator_title,BL record ID,BL shelfmark,Link to digitised book
0,258849534,3,2020-07-08 08:19:14 UTC,44331392,NONE,NONE,NONE,Fiction,655 7 $aPoetry$2fast$0(OCoLC)fst01423828,NONE,...,,No,,published by the author,Bury St. Edmunds,enk,Sonnets,14616869,Digital Store 11652.i.20,http://access.bl.uk/item/viewer/ark:/81055/vdc...


In [44]:
df_current_metadata = pd.read_csv(
    "https://bl.iro.bl.uk/downloads/e1be1324-8b1a-4712-96a7-783ac209ddef?locale=en",
    dtype={
        "BL record ID": "string",
        "BL record ID for physical resource": "string",
        "Date of publication": "string",
    },
)

In [45]:
df_current_metadata.columns

Index(['BL record ID', 'Type of resource', 'Name',
       'Dates associated with name', 'Type of name', 'Role', 'All names',
       'Title', 'Variant titles', 'Series title', 'Number within series',
       'Country of publication', 'Place of publication', 'Publisher',
       'Date of publication', 'Edition', 'Physical description',
       'Dewey classification', 'BL shelfmark', 'Topics', 'Genre', 'Languages',
       'Notes', 'BL record ID for physical resource'],
      dtype='object')

In [46]:
annotation_df = annotation_df.set_index("BL record ID")

In [47]:
annotation_df = annotation_df.drop(columns="BL shelfmark")

In [48]:
df_current_metadata = df_current_metadata.set_index("BL record ID")

In [50]:
joined_df = df_current_metadata.join(
    annotation_df, on="BL record ID", lsuffix="_left", rsuffix="_right"
)

In [51]:
joined_df.columns

Index(['Type of resource', 'Name', 'Dates associated with name',
       'Type of name', 'Role', 'All names', 'Title', 'Variant titles',
       'Series title', 'Number within series', 'Country of publication',
       'Place of publication', 'Publisher', 'Date of publication', 'Edition',
       'Physical description', 'Dewey classification', 'BL shelfmark',
       'Topics', 'Genre', 'Languages', 'Notes',
       'BL record ID for physical resource', 'classification_id', 'user_name',
       'created_at', 'subject_ids', 'annotator_date_pub',
       'annotator_normalised_date_pub', 'annotator_edition_statement',
       'annotator_genre', 'annotator_FAST_genre_terms',
       'annotator_FAST_subject_terms', 'annotator_comments',
       'annotator_main_language', 'annotator_other_languages_summaries',
       'annotator_summaries_language', 'annotator_translation',
       'annotator_original_language', 'annotator_publisher',
       'annotator_place_pub', 'annotator_country', 'annotator_title',
 

In [52]:
joined_df["annotated"] = joined_df["classification_id"].notna()

In [53]:
joined_df["annotated"]

BL record ID
014602826    False
014602830    False
014602831    False
014602832    False
014602833    False
             ...  
016289058    False
016289059    False
016289060    False
016289061    False
016289062    False
Name: annotated, Length: 55343, dtype: bool

In [54]:
joined_df.iloc[0]

Type of resource                                                               Monograph
Name                                                                       Yearsley, Ann
Dates associated with name                                                     1753-1806
Type of name                                                                      person
Role                                                                                 NaN
All names                              More, Hannah, 1745-1833 [person] ; Yearsley, A...
Title                                  Poems on several occasions [With a prefatory l...
Variant titles                                                                       NaN
Series title                                                                         NaN
Number within series                                                                 NaN
Country of publication                                                           England
Place of publication 

In [55]:
joined_df[joined_df["annotated"] == True].iloc[0]

Type of resource                                                               Monograph
Name                                                            Yates, William Joseph H.
Dates associated with name                                                           NaN
Type of name                                                                      person
Role                                                                                 NaN
All names                              Yates, William Joseph H. [person] ; Y, W. J. H...
Title                                  The Canadian farmer. A missionary incident [Si...
Variant titles                                                                       NaN
Series title                                                                         NaN
Number within series                                                                 NaN
Country of publication                                                           England
Place of publication 

In [56]:
from datetime import datetime

In [57]:
date_now = datetime.now()
date_now = date_now.strftime("%Y_%m_%d")
date_now

'2021_10_06'

In [59]:
joined_df.head(1)

Unnamed: 0_level_0,Type of resource,Name,Dates associated with name,Type of name,Role,All names,Title,Variant titles,Series title,Number within series,...,annotator_other_languages_summaries,annotator_summaries_language,annotator_translation,annotator_original_language,annotator_publisher,annotator_place_pub,annotator_country,annotator_title,Link to digitised book,annotated
BL record ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
14602826,Monograph,"Yearsley, Ann",1753-1806,person,,"More, Hannah, 1745-1833 [person] ; Yearsley, A...",Poems on several occasions [With a prefatory l...,,,,...,,,,,,,,,,False


In [58]:
joined_df.to_csv(
    f"genre_classification_of_bl_books/data/annotations_{date_now}.csv", index=True
)