In [1]:
pip install pydictionaria

Collecting pydictionaria
  Downloading pydictionaria-2.2-py3-none-any.whl.metadata (1.3 kB)
Collecting cdstarcat>=1.0.0 (from pydictionaria)
  Downloading cdstarcat-1.4.0-py3-none-any.whl.metadata (3.3 kB)
Collecting cldfbench (from pydictionaria)
  Downloading cldfbench-1.14.0-py3-none-any.whl.metadata (15 kB)
Collecting clldutils>=3.5.1 (from pydictionaria)
  Downloading clldutils-3.22.2-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting csvw>=1.5.4 (from pydictionaria)
  Downloading csvw-3.3.1-py2.py3-none-any.whl.metadata (10 kB)
Collecting purl (from pydictionaria)
  Downloading purl-1.6-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting pybtex (from pydictionaria)
  Downloading pybtex-0.24.0-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting pycdstar>=1.0.1 (from pydictionaria)
  Downloading pycdstar-1.1.0-py3-none-any.whl.metadata (3.5 kB)
Collecting pycldf>=1.20 (from pydictionaria)
  Downloading pycldf-1.39.0-py2.py3-none-any.whl.metadata (13 kB)
Collecting pyconcepticon>=1.1.1 (

In [2]:
! git clone https://github.com/dictionaria/kalamang.git

Cloning into 'kalamang'...
remote: Enumerating objects: 176, done.[K
remote: Counting objects: 100% (176/176), done.[K
remote: Compressing objects: 100% (96/96), done.[K
remote: Total 176 (delta 95), reused 152 (delta 74), pack-reused 0 (from 0)[K
Receiving objects: 100% (176/176), 949.08 KiB | 4.31 MiB/s, done.
Resolving deltas: 100% (95/95), done.


In [3]:
%cd kalamang

/content/kalamang


In [4]:
from collections import ChainMap
from itertools import chain
import pathlib
import re
import sys

from pydictionaria.sfm_lib import Database as SFM
from pydictionaria import sfm2cldf

from pydictionaria.preprocess_lib import (
    marker_fallback_sense, marker_fallback_entry, merge_markers
)

from cldfbench import CLDFSpec, Dataset as BaseDataset


SEMANTIC_DOMAINS = (
    'aquatic life',
    'arrange, hold, transfer',
    'birds',
    'bodily states, colours, dimensions, quantity',
    'body',
    'canoe parts',
    'culture and communication',
    'earth',
    'food, cooking, fire',
    'relational nouns',
    'house parts',
    'human artifacts',
    'impact, cut, break',
    'insects and small animals',
    'kin',
    'location, direction, time',
    'medicines',
    'motion',
    'other animals',
    'placenames',
    'plants',
    'sounds, smells, sensations, vision',
    'state',
    'values and emotions',
    'work',
)

PENDING_SIGNIFIERS = {'tentative', 'pending'}

VAR_MARKERS = {
    'lx',
    'lx_Kar',
    'hm',
    'ph_Kar',
    'va',
    'vt',
    'lf',
    'vet',
    'mn',
    'cet',
    'se',
    'co_Dut',
    'dt',
}


class DropTracker:

    def __init__(self, crossref_markers):
        self._dropped_ids = set()
        self._crossref_markers = crossref_markers

    def dropper_func(self, fun):
        def catch_dropped(entry):
            new_entry = fun(entry)
            if new_entry is False:
                self._dropped_ids.add(
                    '{}{}'.format(entry.get('lx', ''), entry.get('hm', '')))
            return new_entry
        return catch_dropped

    def _drop_crossrefs(self, mvpair):
        m, v = mvpair
        if m in self._crossref_markers:
            new_v = ' ; '.join(
                id_.strip()
                for id_ in v.split(';')
                if id_.strip() not in self._dropped_ids)
            return m, new_v
        else:
            return m, v

    def drop_crossrefs(self, entry):
        return entry.__class__(map(self._drop_crossrefs, entry))


def drop_mly(entry):
    if 'MLY' in entry.get('ps', ''):
        return False
    else:
        return entry


def drop_variant(entry):
    if {m for m, _ in entry} <= VAR_MARKERS:
        return False
    else:
        return entry


def is_pending(sense):
    return sense.get('z6', '').lower() in PENDING_SIGNIFIERS


def drop_pending(entry):
    prefix = entry.__class__()
    senses = []
    for marker, value in entry:
        if marker == 'sn':
            senses.append(entry.__class__())
            senses[-1].append((marker, value))
        elif senses:
            senses[-1].append((marker, value))
        else:
            prefix.append((marker, value))

    if senses:
        senses_left = [s for s in senses if not is_pending(s)]
        if senses_left:
            return entry.__class__(chain(prefix, *senses_left))
        else:
            return False
    elif is_pending(prefix):
        return False
    else:
        return entry


def parse_semantic_domains(value):
    rest = value.strip().lower()
    domains = []
    while rest:
        for dom in SEMANTIC_DOMAINS:
            if rest.startswith(dom):
                domains.append(dom)
                rest = rest[len(dom):].strip()
                break
        else:
            print('unkown semantic domain:', rest, file=sys.stderr)
            domains.append(rest)
            break
    return ' ; '.join(domains)


def merged_va(marker_dict):
    va = marker_dict.get('va') or ''
    vet = marker_dict.get('vet') or ''
    if va and vet:
        return '{}: {}'.format(vet, va)
    else:
        return va


def merge_mn(entry):
    mns = []
    for marker, value in entry:
        if marker == 'mn':
            mns.append(value)
        elif mns:
            yield 'mn', ' ; '.join(mns)
            yield marker, value
            mns = []
        else:
            yield marker, value
    if mns:
        yield 'mn', ' ; '.join(mns)


def mn_to_lv(entry):
    prev = None
    for marker, value in entry:
        if marker == 'mn' and prev == 'lf':
            yield 'lv', value
        else:
            yield marker, value
        prev = marker if value else None


def filter_sp_var(entry):
    if not entry.get('vet'):
        return entry
    new_entry = entry.__class__()
    prev_va = None

    for marker, value in entry:
        if marker == 'vet' and value == 'sp. var. of':
            prev_va = None
            continue

        if prev_va:
            new_entry.append(('va', prev_va))
            prev_va = None

        if marker == 'va':
            prev_va = value
        else:
            new_entry.append((marker, value))

    return new_entry


def merged_pc(marker_dict):
    eng = marker_dict.get('pc_Eng')
    kar = marker_dict.get('pc_Kar')
    if eng and kar:
        return '{} – {}'.format(eng, kar)
    else:
        return eng


def reorganize(sfm):
    dt = DropTracker({'lv', 'mn'})

    sfm.visit(dt.dropper_func(drop_mly))
    sfm.visit(dt.dropper_func(drop_variant))
    sfm.visit(dt.dropper_func(drop_pending))

    sfm.visit(dt.drop_crossrefs)

    return sfm


def preprocess(entry):
    entry = entry.__class__(
        (m, v)
        for m, v in entry
        if m != 'pc_Mal')
    entry = entry.__class__(
        (m, re.sub(r'\s*\&lt;(\s*)', r'\1', v) if m == 'esl' else v)
        for m, v in entry)

    entry = merge_markers(
        entry, ['pc_Eng', 'pc_Kar'], 'pc_Eng', format_fn=merged_pc)
    entry = marker_fallback_sense(entry, 'de', 'ge')
    entry = marker_fallback_sense(entry, 'd_Mal', 'g_Mal')

    entry = filter_sp_var(entry)

    if entry.get('mn'):
        entry = entry.__class__(merge_mn(entry))
        entry = entry.__class__(mn_to_lv(entry))

    if entry.get('sd'):
        entry = entry.__class__(
            (m, parse_semantic_domains(v) if m == 'sd' else v)
            for m, v in entry)

    return entry

def authors_string(authors):
    def is_primary(a):
        return not isinstance(a, dict) or a.get('primary', True)

    primary = ' and '.join(
        a['name'] if isinstance(a, dict) else a
        for a in authors
        if is_primary(a))
    secondary = ' and '.join(
        a['name']
        for a in authors
        if not is_primary(a))
    if primary and secondary:
        return '{} with {}'.format(primary, secondary)
    else:
        return primary or secondary


class Dataset(BaseDataset):
    dir = pathlib.Path.cwd()  # Set current working directory
    id = "kalamang"

    def cldf_specs(self):  # A dataset must declare all CLDF sets it creates.
        return CLDFSpec(
            dir=self.cldf_dir,
            module='Dictionary',
            metadata_fname='cldf-metadata.json')

    def cmd_download(self, args):
        """
        Download files to the raw/ directory. You can use helpers methods of `self.raw_dir`, e.g.

        >>> self.raw_dir.download(url, fname)
        """
        pass

    def cmd_makecldf(self, args):
        """
        Convert the raw data to a CLDF dataset.

        >>> args.writer.objects['LanguageTable'].append(...)
        """

        # read data

        md = self.etc_dir.read_json('md.json')
        properties = md.get('properties') or {}
        language_name = md['language']['name']
        isocode = md['language']['isocode']
        language_id = md['language']['isocode']
        glottocode = md['language']['glottocode']

        marker_map = ChainMap(
            properties.get('marker_map') or {},
            sfm2cldf.DEFAULT_MARKER_MAP)
        entry_sep = properties.get('entry_sep') or sfm2cldf.DEFAULT_ENTRY_SEP
        sfm = SFM(
            self.raw_dir / 'db.sfm',
            marker_map=marker_map,
            entry_sep=entry_sep)

        examples = sfm2cldf.load_examples(self.raw_dir / 'examples.sfm')

        if (self.etc_dir / 'cdstar.json').exists():
            media_catalog = self.etc_dir.read_json('cdstar.json')
        else:
            media_catalog = {}

        # preprocessing

        sfm = reorganize(sfm)
        sfm.visit(preprocess)

        # processing

        with open(self.dir / 'cldf.log', 'w', encoding='utf-8') as log_file:
            log_name = '%s.cldf' % language_id
            cldf_log = sfm2cldf.make_log(log_name, log_file)

            entries, senses, examples, media = sfm2cldf.process_dataset(
                self.id, language_id, properties,
                sfm, examples, media_catalog=media_catalog,
                glosses_path=self.raw_dir / 'glosses.flextext',
                examples_log_path=self.dir / 'examples.log',
                glosses_log_path=self.dir / 'glosses.log',
                cldf_log=cldf_log)

            # good place for some post-processing

            # cldf schema

            sfm2cldf.make_cldf_schema(
                args.writer.cldf, properties,
                entries, senses, examples, media)

            sfm2cldf.attach_column_titles(args.writer.cldf, properties)

            print(file=log_file)

            entries = sfm2cldf.ensure_required_columns(
                args.writer.cldf, 'EntryTable', entries, cldf_log)
            senses = sfm2cldf.ensure_required_columns(
                args.writer.cldf, 'SenseTable', senses, cldf_log)
            examples = sfm2cldf.ensure_required_columns(
                args.writer.cldf, 'ExampleTable', examples, cldf_log)
            media = sfm2cldf.ensure_required_columns(
                args.writer.cldf, 'media.csv', media, cldf_log)

            entries = sfm2cldf.remove_senseless_entries(
                senses, entries, cldf_log)

        # output

        args.writer.cldf.properties['dc:creator'] = authors_string(
            md.get('authors') or ())

        language = {
            'ID': language_id,
            'Name': language_name,
            'ISO639P3code': isocode,
            'Glottocode': glottocode,
        }
        args.writer.objects['LanguageTable'] = [language]

        args.writer.objects['EntryTable'] = entries
        args.writer.objects['SenseTable'] = senses
        args.writer.objects['ExampleTable'] = examples
        args.writer.objects['media.csv'] = media

In [6]:
# Install cldfbench
! pip install cldfbench

# Install pydictionaria and its dependencies
! pip install git+https://github.com/cldf/pydictionaria.git

# If 'sfm2cldf' is a separate package (it may be part of pydictionaria), install it like this:
! pip install git+https://github.com/cldf/sfm2cldf.git


Collecting git+https://github.com/cldf/pydictionaria.git
  Cloning https://github.com/cldf/pydictionaria.git to /tmp/pip-req-build-yidbacp8
  Running command git clone --filter=blob:none --quiet https://github.com/cldf/pydictionaria.git /tmp/pip-req-build-yidbacp8
  fatal: could not read Username for 'https://github.com': No such device or address
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/cldf/pydictionaria.git[0m[32m [0m[32m/tmp/[0m[32mpip-req-build-yidbacp8[0m did not run successfully.
  [31m│[0m exit code: [1;36m128[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
[1;31merror[0m: [1msubprocess-exited-with-error[0m

[31m×[0m [32mgit clone --[0m[32mfilter[0m[32m=[0m[32mblob[0m[32m:none --quiet [0m[4;32mhttps://github.com/cldf/pydi

In [8]:
! python test.py

python3: can't open file '/content/test.py': [Errno 2] No such file or directory


In [9]:
! python -m pip install --upgrade pip

Collecting pip
  Using cached pip-24.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-24.2


In [10]:
pip install pytest-cldf

Collecting pytest-cldf
  Downloading pytest_cldf-0.3.0-py3-none-any.whl.metadata (1.4 kB)
Downloading pytest_cldf-0.3.0-py3-none-any.whl (6.7 kB)
Installing collected packages: pytest-cldf
Successfully installed pytest-cldf-0.3.0


In [6]:
! python test.py --cldf-metadata=cldf/cldf-metadata.json