# Import records from Google Sheets

The curation of metadata records is done in a [Google Sheets document](https://docs.google.com/spreadsheets/d/1acNnSzn8XrCDFrxf4joqSAUsAQoOgHEvBU-FIt9p9kU/edit?usp=sharing) with five sheets (tabs), one per document type.

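This notebook reads the current contents of each sheet into its own Pandas dataframe, keeping only records whose language is Finnish, Swedish or English. The records are then converted into Python dictionaries and saved as JSONL files, with separate train and test files for every combination of document type and language. Finally, summary statistics about the records are written to `statistics.md`.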

In [1]:
# Read the metadata from the Google Sheets document into Pandas dataframes

import pandas as pd
import urllib.parse

DOC_ID = "1acNnSzn8XrCDFrxf4joqSAUsAQoOgHEvBU-FIt9p9kU"  # Google Sheets id
LANGUAGES = ('fin', 'swe', 'eng')

SHEET_NAMES = {
    "thes": "Theses",
    "docthes": "Doctoral theses",
    "report": "Reports",
    "book": "Books",
    "article": "Articles"
}

def read_sheet(doc_id, sheet_name, sheet_id):
    """Download one sheet (tab) of a Google Sheets document as a dataframe.

    Parameters:
        doc_id: id of the Google Sheets document
        sheet_name: name of the sheet (tab) to fetch; it is URL-quoted before use
        sheet_id: short prefix used when building the "rowid" values (e.g. "thes")

    Returns:
        A dataframe in which every cell is read as a string (empty cells stay
        empty strings instead of NaN), with a "rowid" column placed at
        position 3 and only the rows whose "language/iso" is in LANGUAGES.
    """
    quoted_name = urllib.parse.quote(sheet_name)
    csv_url = f"https://docs.google.com/spreadsheets/d/{doc_id}/gviz/tq?tqx=out:csv&sheet={quoted_name}"
    table = pd.read_csv(csv_url, dtype=str, na_filter=False)

    def make_rowid(number):
        return f"{sheet_id}{number}"

    # index + 2 gives values like "thes37"; presumably this is the row number
    # in the spreadsheet (header row plus 1-based counting) -- TODO confirm
    table["rowid"] = table.index + 2
    table["rowid"] = table["rowid"].apply(make_rowid)
    # move the new column in between the other housekeeping columns
    rowid_column = table.pop("rowid")
    table.insert(3, "rowid", rowid_column)

    # keep only the monolingual records in the languages we are interested in
    in_wanted_language = table["language/iso"].isin(LANGUAGES)
    return table.loc[in_wanted_language]

# Fetch every sheet; the dict maps the sheet id (e.g. "thes") to its dataframe
sheets = {
    sheet_id: read_sheet(DOC_ID, SHEET_NAMES[sheet_id], sheet_id)
    for sheet_id in SHEET_NAMES
}

In [2]:
# Count the records per language in every sheet (document type), then combine
# the per-sheet counts into one overview table with row and column totals

language_counts = {
    sheet_id: sheet["language/iso"].value_counts().rename(sheet_id).astype(int)
    for sheet_id, sheet in sheets.items()
}

# one column per sheet; a language missing from a sheet counts as 0
langstat = (
    pd.concat(language_counts.values(), axis=1)
    .fillna(0)
    .astype(int)
)
langstat = langstat.assign(TOTAL=langstat.sum(axis=1))
langstat.loc['TOTAL'] = langstat.sum()
langstat

Unnamed: 0_level_0,thes,docthes,report,book,article,TOTAL
language/iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
fin,89,56,56,60,70,331
eng,58,99,19,46,79,301
swe,53,25,37,19,34,168
TOTAL,200,180,112,125,183,800


In [3]:
# Count the records per repository in every sheet (document type), then combine
# the per-sheet counts into one overview table with row and column totals

repo_counts = {
    sheet_id: sheet["Repository"].value_counts().rename(sheet_id).astype(int)
    for sheet_id, sheet in sheets.items()
}

# one column per sheet; a repository missing from a sheet counts as 0
repostat = (
    pd.concat(repo_counts.values(), axis=1)
    .fillna(0)
    .astype(int)
)
repostat = repostat.assign(TOTAL=repostat.sum(axis=1))
repostat.loc['TOTAL'] = repostat.sum()
repostat

Unnamed: 0_level_0,thes,docthes,report,book,article,TOTAL
Repository,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Theseus,91,1,19,45,114,270
Doria,29,49,17,16,16,127
Osuva,25,23,2,7,16,73
Taju,24,4,1,37,20,86
Trepo,16,35,0,8,0,59
UtuPub,15,40,0,0,0,55
LutPub,0,27,2,0,0,29
Julkari,0,1,35,8,4,48
Kaisu,0,0,36,4,13,53
TOTAL,200,180,112,125,183,800


In [4]:
# Count the records per COAR resource type in every sheet (document type), then
# combine the per-sheet counts into one overview table with row and column totals

coar_counts = {
    sheet_id: sheet["type/coar"].value_counts().rename(sheet_id).astype(int)
    for sheet_id, sheet in sheets.items()
}

# one column per sheet; a resource type missing from a sheet counts as 0
coarstat = (
    pd.concat(coar_counts.values(), axis=1)
    .fillna(0)
    .astype(int)
)
coarstat = coarstat.assign(TOTAL=coarstat.sum(axis=1))
coarstat.loc['TOTAL'] = coarstat.sum()
coarstat

Unnamed: 0_level_0,thes,docthes,report,book,article,TOTAL
type/coar,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
master thesis,115,0,0,0,0,115
bachelor thesis,79,0,0,0,0,79
thesis,6,0,0,0,0,6
doctoral thesis,0,180,0,0,0,180
research report,0,0,111,0,0,111
report,0,0,1,0,0,1
book,0,0,0,90,0,90
book part,0,0,0,35,0,35
journal article,0,0,0,0,81,81
research article,0,0,0,0,43,43


In [5]:
# Column names that is_valid_col() rejects outright
DROP_COLS = ('Collection', 'Status')
# Columns that is_multivalue_col() treats as multi-valued; a column also counts
# when the part of its name before the first '/' is listed here
# (e.g. 'contributor/author' matches 'contributor')
MULTI_VALUE_COLS = {
    'contributor',
    'identifier/isbn',
    'publisher',
    'relation/isbn',
    'subject',
    'title/alternative'
}

# every 1 out of TRAIN_TEST_SPLIT_FACTOR records (on average) will be placed in test set;
# is_test_record() uses it as the modulus applied to a checksum of the record id
TRAIN_TEST_SPLIT_FACTOR = 4

import json
import re
import zlib

def is_valid_col(col):
    """Tell whether a sheet column should be kept.

    A column is rejected when it is listed in DROP_COLS, when its name ends
    with 'X', or when its name starts with 'Unnamed'; any other column is valid.
    """
    rejected = (
        col in DROP_COLS
        or col.endswith('X')
        or col.startswith('Unnamed')
    )
    return not rejected

def url_to_id(url):
    """Turn a file URL into a record id.

    A URL of the form <prefix>/bitstream/handle/<n>/<m>/<anything> is rewritten
    to <prefix>/handle/<n>/<m>; a URL that does not match is returned unchanged.
    """
    bitstream_url = re.compile(r"(.+)/bitstream/handle/(\d+)/(\d+)/.*")
    return bitstream_url.sub(r"\1/handle/\2/\3", url)

def is_multivalue_col(col):
    """Tell whether a column is multi-valued according to MULTI_VALUE_COLS.

    The column matches either by its full name or by the part of the name
    that precedes the first '/'.
    """
    if col in MULTI_VALUE_COLS:
        return True
    first_segment = col.split('/')[0]
    return first_segment in MULTI_VALUE_COLS

def convert_colname(col):
    """Map a sheet column name to the corresponding output field name.

    'rowid' and 'url' are kept as they are, 'Repository' becomes 'repository',
    and every other name gets a 'dc.' prefix with '/' replaced by '.'
    (e.g. 'language/iso' -> 'dc.language.iso').
    """
    unchanged_names = ('rowid', 'url')
    if col in unchanged_names:
        return col
    if col == 'Repository':
        return 'repository'
    return 'dc.' + col.replace('/', '.')

def filter_vals(vals):
    """Drop values wrapped in [], i.e. values that can't be directly inferred from the document.

    Given a list, returns a new list without the empty and the bracketed
    entries. Given a single string, returns None when the string is bracketed
    and the string itself otherwise (an empty string is returned as is).
    """
    def is_bracketed(text):
        return text.startswith('[') and text.endswith(']')

    if isinstance(vals, list):
        return [item for item in vals if item and not is_bracketed(item)]

    return None if is_bracketed(vals) else vals

def row_to_dict(row):
    """Convert one sheet row into a record dictionary.

    The record holds the repository, the URL, an id derived from the URL with
    url_to_id(), the row id, and a "ground_truth" dictionary. "language" is
    always present in the ground truth; every other field is included only
    when filter_vals() leaves a non-empty value for it. Multi-valued cells are
    split on newlines.
    """
    def cell(column):
        # whitespace-stripped text of one cell
        return row[column].strip()

    def cell_lines(column):
        # the newline-separated values of one cell
        return cell(column).split("\n")

    def without_dashes(isbns):
        # strip dashes in ISBNs
        return [isbn.replace('-', '') for isbn in isbns]

    ground_truth = {"language": cell("language/iso")}

    title = filter_vals(cell("title"))
    if title:
        ground_truth["title"] = title

    creators = filter_vals(cell_lines("contributor/author"))
    if creators:
        ground_truth["creator"] = creators

    issued = filter_vals(cell("date/issued"))
    if issued:
        # only include year part
        ground_truth["year"] = issued[:4]

    publishers = filter_vals(cell_lines("publisher"))
    if publishers:
        ground_truth["publisher"] = publishers

    eisbns = filter_vals(cell_lines("identifier/isbn"))
    if eisbns:
        ground_truth["e-isbn"] = without_dashes(eisbns)

    pisbns = filter_vals(cell_lines("relation/isbn"))
    if pisbns:
        ground_truth["p-isbn"] = without_dashes(pisbns)

    eissn = filter_vals(cell("relation/eissn"))
    if eissn:
        ground_truth["e-issn"] = eissn

    pissn = filter_vals(cell("relation/pissn"))
    if pissn:
        ground_truth["p-issn"] = pissn

    repository = cell("Repository")
    url = cell("url")
    return {
        "repository": repository,
        "url": url,
        "id": url_to_id(url),
        "rowid": cell("rowid"),
        "ground_truth": ground_truth,
    }

def is_test_record(rec):
    """Deterministically decide, from the record id alone, whether a record belongs to the test set.

    A record is a test record when three times the CRC-32 checksum of its
    UTF-8 encoded id leaves a remainder of 1 modulo TRAIN_TEST_SPLIT_FACTOR.
    """
    # NOTE(review): if TRAIN_TEST_SPLIT_FACTOR were a multiple of 3, the
    # remainder could never be 1 and no record would be selected for testing
    id_checksum = zlib.crc32(rec['id'].encode('utf-8'))
    remainder = (3 * id_checksum) % TRAIN_TEST_SPLIT_FACTOR
    return remainder == 1

# Write the records of every document type and language into a pair of JSONL
# files (train and test), one JSON object per line, and report the counts
for sheet_id, sheet in sheets.items():
    for lang in LANGUAGES:
        df = sheet[sheet['language/iso'] == lang]
        records = [row_to_dict(row) for _, row in df.iterrows()]
        with (open(f"../metadata/{sheet_id}-{lang}-train.jsonl", "w") as trainfile,
              open(f"../metadata/{sheet_id}-{lang}-test.jsonl", "w") as testfile):
            ntrain = ntest = 0
            for rec in records:
                if is_test_record(rec):
                    outfile = testfile
                    subset = 'test'
                    ntest += 1
                else:
                    outfile = trainfile
                    subset = 'train'
                    ntrain += 1
                # the document type and subset come first in every output line
                header = {'doctype': sheet_id, 'subset': subset}
                json.dump(header | rec, outfile)
                outfile.write("\n")
        # a sheet may have no records in some language; report 0.0 % in that
        # case instead of failing with ZeroDivisionError
        test_percentage = 100 * ntest / len(records) if records else 0.0
        print(f"{sheet_id}-{lang}:\twrote {len(records)} records ({ntrain} train, {ntest} test ({test_percentage:.1f} %))")

thes-fin:	wrote 89 records (68 train, 21 test (23.6 %))
thes-swe:	wrote 53 records (37 train, 16 test (30.2 %))
thes-eng:	wrote 58 records (44 train, 14 test (24.1 %))
docthes-fin:	wrote 56 records (47 train, 9 test (16.1 %))
docthes-swe:	wrote 25 records (20 train, 5 test (20.0 %))
docthes-eng:	wrote 99 records (82 train, 17 test (17.2 %))
report-fin:	wrote 56 records (39 train, 17 test (30.4 %))
report-swe:	wrote 37 records (24 train, 13 test (35.1 %))
report-eng:	wrote 19 records (14 train, 5 test (26.3 %))
book-fin:	wrote 60 records (47 train, 13 test (21.7 %))
book-swe:	wrote 19 records (13 train, 6 test (31.6 %))
book-eng:	wrote 46 records (36 train, 10 test (21.7 %))
article-fin:	wrote 70 records (57 train, 13 test (18.6 %))
article-swe:	wrote 34 records (28 train, 6 test (17.6 %))
article-eng:	wrote 79 records (64 train, 15 test (19.0 %))


In [6]:
# Generate statistics about the use of metadata fields

from collections import defaultdict, Counter
import datetime
import glob
from statistics import mean
from tabulate import tabulate


# Render the overview tables from the earlier cells as GitHub-style markdown
langstat_table, repostat_table, coarstat_table = (
    tabulate(overview, headers='keys', tablefmt='github')
    for overview in (langstat, repostat, coarstat)
)

metadata_files = glob.glob("../metadata/*.jsonl")

# number of records in which each field occurs (key1: doctype, key2: field)
field_counts = defaultdict(Counter)
# number of records per doctype
doc_counts = Counter()
# key: field, val: S (single) or M (multiple)
field_types = {}
# key: field, val: list with the number of values in each record (multi-value fields only)
field_nvals = defaultdict(list)

for mdfile in sorted(metadata_files):
    with open(mdfile) as infile:
        for line in infile:
            rec = json.loads(line)
            # flatten the record: lift the ground_truth fields to the top level
            combined_rec = {**rec, **rec["ground_truth"]}
            combined_rec.pop("ground_truth")

            for fld, value in combined_rec.items():
                field_counts[rec['doctype']][fld] += 1
                if isinstance(value, list):
                    field_types[fld] = 'M'
                    field_nvals[fld].append(len(value))
                else:
                    field_types[fld] = 'S'
            doc_counts[rec['doctype']] += 1

data = []

def format_value(val):
    """Format a coverage ratio as a whole percentage, or '-' for a falsy value (0, None)."""
    return f"{val:.0%}" if val else '-'

# Build one table row per metadata field: its type and its coverage in each document type
for fld in list(field_types.keys()):

    if field_types[fld] == 'M':
        # multi-value field: report the mean and maximum number of values
        mean_val = mean(field_nvals[fld] or [0])
        max_val = max(field_nvals[fld] or [0])
        ftype = f"M ({mean_val:.1f}/{max_val})"
    else:
        ftype = 'S'

    row = {
        'Field': fld,
        'Type': ftype
    }
    for doctype in SHEET_NAMES.keys():
        ndocs = doc_counts[doctype]
        # a document type without any records has no coverage to report;
        # use 0 (shown as '-') instead of failing with ZeroDivisionError
        coverage = field_counts[doctype][fld] / ndocs if ndocs else 0
        row[doctype] = format_value(coverage)
    data.append(row)

# Collect the per-field rows into a dataframe and render it as a markdown table
df = pd.DataFrame(data)
field_table = tabulate(df, headers='keys', tablefmt='github', showindex=False)

# generation time, shown in the header of the statistics file
timestamp = f"{datetime.datetime.now():%Y-%m-%d %H:%M:%S}"

statfile = f"""# Statistics about metadata

Automatically generated {timestamp}

Type is either:
 * S: single-value
 * M: multi-value, number of values given as (mean/max)

Percentages represent the coverage of a field in a subset. 100% coverage means the field is always present.

## Document counts by language and document type

{langstat_table}

## Document counts by repository and document type

{repostat_table}

## Document counts by COAR resource type and document type

{coarstat_table}

## Metadata coverage by document type

{field_table}
"""

# show the report in the notebook and store the same text (plus a final newline) on disk
print(statfile)

with open('../statistics.md', 'w') as outf:
    outf.write(statfile + "\n")

# Statistics about metadata

Automatically generated 2024-07-26 10:27:20

Type is either:
 * S: single-value
 * M: multi-value, number of values given as (mean/max)

Percentages represent the coverage of a field in a subset. 100% coverage means the field is always present.

## Document counts by language and document type

| language/iso   |   thes |   docthes |   report |   book |   article |   TOTAL |
|----------------|--------|-----------|----------|--------|-----------|---------|
| fin            |     89 |        56 |       56 |     60 |        70 |     331 |
| eng            |     58 |        99 |       19 |     46 |        79 |     301 |
| swe            |     53 |        25 |       37 |     19 |        34 |     168 |
| TOTAL          |    200 |       180 |      112 |    125 |       183 |     800 |

## Document counts by repository and document type

| Repository   |   thes |   docthes |   report |   book |   article |   TOTAL |
|--------------|--------|-----------|----------|--