# Clean Notker

This project is supposed to remove disjunct annotations from all annotation layers. Good luck 🍀

## Imports

In [8]:
# python std libs
import os
import pickle
import sys
# own libs
from ddd import ad_api
from ddd.ad_api import corpus
# 3rd party libs
import numpy as np

## Constants and behaviour

In [2]:
# Tier names handed to Corpus.from_directory(..., ignore_tiers=...) when the
# corpora are read.
IGNORE_TIERS = (
    'edition',
    'character',
    'page',
    'default',
    'translation',
    'posLemma',
    'verse',
    'comp',
    'chapter',
    'line',
    'pos',
    'inflectionClassLemma',
    'rhyme',
    'markup',
    'inflectionClass',
    'inflection',
)

# Base directory that the corpus names below are joined onto; read from the
# ad_api configuration.
CORPUS_HOME = ad_api.CONFIG[ad_api.NAMES.DATA_HOME]

# Directory names (relative to CORPUS_HOME) of the corpora that are read.
TRAIN_NAMES = [
    'DDD-AD-Murbacher_Hymnen',
    'DDD-AD-Isidor_Latein',
    'DDD-AD-Tatian',
    'DDD-AD-Benediktiner_Regel',
    'DDD-AD-Kleinere_Althochdeutsche_Denkmäler',
    'DDD-AD-Kleinere_Altsächsische_Denkmäler',
    'DDD-AD-Otfrid',
    'DDD-AD-Genesis',
    'DDD-AD-Benediktiner_Regel_Latein',
    'DDD-AD-Physiologus',
    'DDD-AD-Monsee',
    'DDD-AD-Tatian_Latein',
    'DDD-AD-Heliand',
    'DDD-AD-Murbacher_Hymnen_Latein',
    'DDD-AD-Isidor',
]

# Notker corpus directory names.
# NOTE(review): presumably the corpora to be cleaned (see notebook title) — confirm.
NOTKER = [
    'DDD-AD-Z-Notker_Boethius-Categoriae',
    'DDD-AD-Z-Notker_Boethius-De_Interpretatione',
]

## Data

In [3]:
# One vocabulary per collected tier: annotation value -> integer index.
# Indices are assigned in order of first occurrence across all corpora.
vocabs = {
    'text': {},
    'lemma': {}
}
# language -> tier -> list of documents, each document being a list of
# vocabulary indices (one entry per collected annotation).
data = {}
for cname in TRAIN_NAMES:
    cdir = os.path.join(CORPUS_HOME, cname)
    print('Starting', cdir, '...')
    subcorpus = ad_api.corpus.Corpus.from_directory(cdir, ignore_tiers=IGNORE_TIERS)

    for doc in subcorpus:
        # File the document under the language key with the largest value in
        # doc.languages (presumably a per-language count — TODO confirm).
        lang = max(doc.languages, key=lambda code: doc.languages[code])
        if lang not in data:
            data[lang] = {'text': [], 'lemma': []}
        doc_ix_lists = {
            'text': [],
            'lemma': []
        }
        for annotationset in doc:
            collected = {}
            # NOTE(review): predecessor is re-initialised for every annotation
            # set, so this check never compares against the previous set.
            # Confirm whether duplicate detection across consecutive sets was
            # intended before changing it; the logic is kept as it was.
            predecessor = {'text': None, 'lemma': None}
            for anno_key in annotationset:
                if anno_key == 'lemma' or anno_key == 'text':
                    collected[anno_key] = annotationset[anno_key]
                    if predecessor[anno_key] == collected[anno_key]:
                        raise ValueError('Duplicate!')
                    elif len(collected) == 2:
                        predecessor = collected
            for key, anno in collected.items():
                vocab = vocabs[key]
                # setdefault returns the existing index, or stores and returns
                # the next free one (len(vocab) is evaluated before insertion).
                doc_ix_lists[key].append(vocab.setdefault(anno.value, len(vocab)))
            # NOTE(review): a set lacking 'text' or 'lemma' adds to only one of
            # the two lists, so they may differ in length — verify if alignment
            # between the tiers is required downstream.

        # Store the finished document exactly once. This loop used to sit inside
        # the annotation-set loop, which appended the same list object once per
        # annotation set and thereby duplicated every document many times.
        for k, v in doc_ix_lists.items():
            data[lang][k].append(v)

    print('Done with', cname)

Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Murbacher_Hymnen ...
Done with DDD-AD-Murbacher_Hymnen
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Isidor_Latein ...
Done with DDD-AD-Isidor_Latein
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Tatian ...
Done with DDD-AD-Tatian
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Benediktiner_Regel ...
Done with DDD-AD-Benediktiner_Regel
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Kleinere_Althochdeutsche_Denkmäler ...
Done with DDD-AD-Kleinere_Althochdeutsche_Denkmäler
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Kleinere_Altsächsische_Denkmäler ...
Done with DDD-AD-Kleinere_Altsächsische_Denkmäler
Starting /home/klotzmaz/Documents/referenzkorpus_altdeutsch/elan_repo/DDD-AD-Otfrid ...
Done with DDD-AD-Otfrid
Starting /home/klotzmaz/Documents/referenzkorpus_a

In [7]:
# Write one .npy file per (language, tier) pair into the current directory.
for lang, tiers in data.items():
    for tier, documents in tiers.items():
        # Documents usually differ in length. NumPy >= 1.24 refuses to build an
        # array from such ragged nested lists unless dtype=object is given, so
        # request it explicitly in that case. Uniform-length input is converted
        # exactly as before.
        is_ragged = len({len(doc_ixs) for doc_ixs in documents}) > 1
        if is_ragged:
            doc_array = np.array(documents, dtype=object)
        else:
            doc_array = np.array(documents)
        with open('{}_{}.npy'.format(lang, tier), 'wb') as f:
            # Object arrays are pickled by np.save; loading them back needs
            # np.load(..., allow_pickle=True).
            np.save(f, doc_array)
    print('Done with', lang)

Done with gmh
Done with goh
Done with lat
Done with ohg
Done with osx


In [9]:
# Persist the vocabularies so the stored indices can be mapped back to their
# annotation values later.
with open('vocabs.pkl', 'wb') as vocab_file:
    pickle.dump(vocabs, vocab_file)

## Model