# Import records from Google Sheets

The curation of metadata records is done in a [Google Sheets document](https://docs.google.com/spreadsheets/d/1qIBpnP93ywwY0Y0Ujz17XYgpB4T7etbSBlQtz43kaSQ/edit?usp=sharing) with four sheets (tabs), one per document type.

This notebook reads the current contents of the document into Pandas dataframes, one per sheet. Only "completed" rows (those whose status column "Tila" starts with "V") in Finnish, Swedish or English are used.
The records are converted into Python dictionaries and then saved into JSONL files, split into separate train and test subsets.

In [1]:
# Read the metadata from the Google Sheets document into Pandas dataframes

import pandas as pd
import urllib.parse

# id of the Google Sheets document that holds the curated metadata
DOC_ID = "1qIBpnP93ywwY0Y0Ujz17XYgpB4T7etbSBlQtz43kaSQ"

# values of the "language/iso" column that are kept; all other rows are dropped
LANGUAGES = ("fin", "swe", "eng")

# short sheet id used in this notebook -> title of the tab in the document
SHEET_NAMES = dict(
    thes="Opinnäytteet (ei VK)",
    docthes="Väitöskirjat",
    serial="Sarjajulkaisut",
    mono="Yksittäisjulkaisut",
)

def read_sheet(doc_id, sheet_name):
    """Download one tab of a Google Sheets document and return its usable rows.

    The tab is fetched as CSV; every cell is read as a string and empty cells
    are kept as "" instead of being converted to NaN.

    Args:
        doc_id: Identifier of the Google Sheets document.
        sheet_name: Title of the tab to read (percent-encoded for the URL).

    Returns:
        A DataFrame restricted to the rows whose "Tila" value starts with "V"
        and whose "language/iso" value is one of LANGUAGES.
    """
    encoded_name = urllib.parse.quote(sheet_name)
    csv_url = (
        f"https://docs.google.com/spreadsheets/d/{doc_id}"
        f"/gviz/tq?tqx=out:csv&sheet={encoded_name}"
    )
    table = pd.read_csv(csv_url, dtype=str, na_filter=False)

    # keep only the "completed" rows
    is_completed = table["Tila"].str.startswith("V", na=False)
    table = table.loc[is_completed]

    # keep only rows whose language cell is exactly one of the wanted codes,
    # which also leaves out records tagged with several languages
    in_wanted_language = table["language/iso"].isin(LANGUAGES)
    return table.loc[in_wanted_language]

# download every tab once; the result is keyed by the short sheet id
sheets = {
    short_id: read_sheet(DOC_ID, tab_title)
    for short_id, tab_title in SHEET_NAMES.items()
}

In [2]:
# Calculate language statistics for each sheet (document type) and combine into an overview

# per sheet: a Series of record counts per language, named after the sheet id
language_counts = {
    doc_type: records["language/iso"].value_counts().rename(doc_type).astype(int)
    for doc_type, records in sheets.items()
}

# put the Series side by side as columns; a language that does not occur in a
# sheet has no entry there, so the gap is filled with a count of 0
langstat = pd.concat(language_counts.values(), axis=1)
langstat = langstat.fillna(0).astype(int)

# add the TOTAL column first so that the TOTAL row also sums that column
langstat['TOTAL'] = langstat.sum(axis=1)
langstat.loc['TOTAL'] = langstat.sum(axis=0)
langstat

Unnamed: 0_level_0,thes,docthes,serial,mono,TOTAL
language/iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
fin,85,31,55,34,205
eng,42,92,42,28,204
swe,29,25,21,2,77
TOTAL,156,148,118,64,486


In [3]:
# spreadsheet columns that are never copied into the exported records
DROP_COLS = ("Julkaisuarkisto", "Kokoelma", "Tila")

# field names (the part of a column name before the first "/") whose cells
# are split on newlines and exported as lists of values
MULTI_VALUE_COLS = {
    "contributor",
    "format",
    "identifier",
    "publisher",
    "relation",
    "subject",
    "title",
    "type",
}

# on average, one out of every TRAIN_TEST_SPLIT_FACTOR records is placed in the test set
TRAIN_TEST_SPLIT_FACTOR = 4

import json
import re
import zlib

def is_valid_col(col):
    """Tell whether a spreadsheet column may be copied into an output record.

    A column is rejected when it is listed in DROP_COLS, when its name ends
    with "X", or when its name starts with "Unnamed".
    """
    rejected = (
        col in DROP_COLS
        or col.endswith('X')
        or col.startswith('Unnamed')
    )
    return not rejected

def url_to_id(url):
    """Derive a record id from a file URL.

    A URL of the form "<prefix>/bitstream/handle/<digits>/<digits>/<rest>" is
    shortened to "<prefix>/handle/<digits>/<digits>". A string that does not
    contain that pattern is returned unchanged.
    """
    bitstream_pattern = r"(.+)/bitstream/handle/(\d+)/(\d+)/.*"
    handle_template = r"\1/handle/\2/\3"
    return re.sub(bitstream_pattern, handle_template, url)

def is_multivalue_col(col):
    """Tell whether the cells of a column hold several values.

    Only the part of the column name before the first "/" is looked up in
    MULTI_VALUE_COLS, so every qualified variant of a field is covered.
    """
    field_name, _, _ = col.partition('/')
    return field_name in MULTI_VALUE_COLS

def convert_colname(col):
    """Turn a spreadsheet column name into the key used in the output records.

    "url" is kept as is. Any other name is prefixed with "dc." and has its
    "/" separators replaced by ".", e.g. "language/iso" -> "dc.language.iso".
    """
    if col == 'url':
        return col
    # hack for handling spelling variations
    # FIXME: should be done in Sheets instead
    normalized = "subject/degreeprogram" if col == "subject/degreeprogramme" else col
    return 'dc.' + normalized.replace('/', '.')

def to_dict(row):
    """Convert one spreadsheet row into a record dictionary.

    Args:
        row: Object whose items() yields (column name, cell string) pairs.

    Returns:
        A dict with an "id" entry derived from the "url" cell, plus one entry
        per valid, non-empty column. Multi-value columns become lists (the
        cell split on newlines); all other columns keep the cell string.
    """
    record = {}
    for col, val in row.items():
        if col == 'url':
            # the id is set even when the url cell itself is empty
            record["id"] = url_to_id(val)
        if not is_valid_col(col) or val == '':
            continue
        key = convert_colname(col)
        if is_multivalue_col(col):
            record[key] = val.split("\n")
        else:
            record[key] = val
    return record

def is_test_record(rec):
    """Decide deterministically, from the record id alone, whether a record belongs to the test set.

    The CRC-32 checksum of the UTF-8 encoded rec['id'] is multiplied by 3 and
    reduced modulo TRAIN_TEST_SPLIT_FACTOR; the record is a test record when
    the remainder is 1. The same id therefore always lands in the same subset.
    """
    # NOTE(review): because of the factor 3, a TRAIN_TEST_SPLIT_FACTOR that is
    # a multiple of 3 would never select any record for the test set.
    checksum = zlib.crc32(rec['id'].encode('utf-8'))
    remainder = (3 * checksum) % TRAIN_TEST_SPLIT_FACTOR
    return remainder == 1

# Export the records of every sheet/language combination as JSONL (one JSON
# object per line), split into a train file and a test file, and print a
# summary line for each combination.
for sheet_id, sheet in sheets.items():
    for lang in LANGUAGES:
        df = sheet[sheet['language/iso'] == lang]
        records = [to_dict(row) for _, row in df.iterrows()]
        # both files are (re)created on every run, even if one stays empty
        with (open(f"../metadata/train/{sheet_id}-{lang}.jsonl", "w") as trainfile,
              open(f"../metadata/test/{sheet_id}-{lang}.jsonl", "w") as testfile):
            ntrain = ntest = 0
            for rec in records:
                if is_test_record(rec):
                    outfile = testfile
                    ntest += 1
                else:
                    outfile = trainfile
                    ntrain += 1
                json.dump(rec, outfile)
                outfile.write("\n")
        # a sheet may have no completed records in some language; report 0 %
        # in that case instead of failing with a ZeroDivisionError
        test_pct = 100 * ntest / len(records) if records else 0.0
        print(f"{sheet_id}-{lang}:\twrote {len(records)} records ({ntrain} train, {ntest} test ({test_pct:.1f} %))")

thes-fin:	wrote 85 records (65 train, 20 test (23.5 %))
thes-swe:	wrote 29 records (19 train, 10 test (34.5 %))
thes-eng:	wrote 42 records (31 train, 11 test (26.2 %))
docthes-fin:	wrote 31 records (24 train, 7 test (22.6 %))
docthes-swe:	wrote 25 records (20 train, 5 test (20.0 %))
docthes-eng:	wrote 92 records (76 train, 16 test (17.4 %))
serial-fin:	wrote 55 records (44 train, 11 test (20.0 %))
serial-swe:	wrote 21 records (14 train, 7 test (33.3 %))
serial-eng:	wrote 42 records (33 train, 9 test (21.4 %))
mono-fin:	wrote 34 records (26 train, 8 test (23.5 %))
mono-swe:	wrote 2 records (2 train, 0 test (0.0 %))
mono-eng:	wrote 28 records (19 train, 9 test (32.1 %))
