In [13]:
import collections
import re

https://annotation.github.io/text-fabric/tf/convert/walker.html

https://nbviewer.org/github/annotation/banks/blob/master/programs/convert.ipynb

Nodetypes:
1 sign
2 word
3 verse
4 chapter
5 book
6 document

dicts needed:
1. otype
2. oslots
3. book
4. chapter
5. verse
6. trans [transcription]
7. 


    

In [14]:
import os
import re

from tf.fabric import Fabric
from tf.convert.walker import CV

### Basic exploration of characters in document

In [15]:
char_set = set()

with open('transcriptions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip('\n')
        for char in line:
            char_set.add(char)
            
#char_set

### Conversion to TF

In [75]:
#BASE = os.path.expanduser('~/github')
#ORG = 'KU'
#REPO = 'NT'
#RELATIVE = 'tf'

TF_DIR = os.path.expanduser(f'{BASE}/{ORG}/{REPO}/{RELATIVE}')

VERSION = '0.1'

#TF_PATH = f'{TF_DIR}/{VERSION}'
TF_PATH = './tf'
TF = Fabric(locations=TF_PATH, silent=True)

In [76]:
slotType = 'sign'

In [77]:
generic = {
    'name': 'TRANSCRIPTION OF THE MANUSCRIPTS CONTAINING THE NEW TESTAMENT LETTER OF JUDE',
    'author': 'Tommy Wasserman',
    'contributors': 'Jan Krans, Dirk Roorda',
    'converter': 'Martijn Naaijer',
    'source': 'Data collected and edited by Tommy Wasserman',
    'url': 'https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:52008',
    'version': '0.1'
}

In [78]:
otext = {
    'fmt:text-orig-full': '{greek} ',
    'fmt:line-default': '{greek:XXX} ',
    'sectionTypes': 'book,chapter,verse',
    'sectionFeatures': 'book,chapter,verse',

}

# 'fmt:line-term': 'line#{terminator} ',
#    'structureTypes': 'book,chapter',
#    'structureFeatures': 'book,chapter',

In [79]:
intFeatures = {
  'chapter',
  'verse',
  'ns',
}

In [80]:
featureMeta = {
    'name': {
        'description': 'name of the codex or papyrus',
    },
    'book': {
        'description': 'name of a biblical book',
    },
    'chapter': {
        'description': 'chapter number in a biblical book',
    },
    'verse': {
        'description': 'verse number',
    },
    'greek': {
        'description': 'representation of a word/character in the text',
    },
    'ns': {
        'description': 'nomina sacra, divine name, often the name is abbreviated'
    },
    'vtype': {
        'description': 'verse type, has value "" if it is a verse with text, "0" if verse is not in manuscript (often verses 0 and 26)'
    },
    'wtype': {
        'description': 'word type, has value "" if it is an ordinary greek word, else indicates a lacuna in the text'
    },
    'stype': {
        'description': 'sign type, see word type'
    },
}

In [81]:
class Verse:
    def __init__(self, verse_string):
        """
        verse_string: string representation of a Greek verse.
        """
        self.verse_string = verse_string 
        
    def process_lacunae(self):
        """
        Checks if a verse is lacking,
        indicated with '&lac;' or '&lacfilm;' in the text.
        Returns:
        '': this is a normal verse
        'lac' or '&lacfilm;': there is a lacuna
        '0': empty verse, it is not in the manuscript
        """
        if self.verse_string == '&lac;':
            self.verse_string = ''
            return 'lac'
        
        elif self.verse_string == '&lacfilm;':
            self.verse_string = ''
            return 'lacfilm'
        
        elif self.verse_string == '0':
            self.verse_string = ''
            return 'empty'
        
        else:
            return ''
        

class Word:
    def __init__(self, word_string):
        """
        word_string: string representation of a Greek word in the text
        """
        self.word_string = word_string
        
    def process_lacunae(self):
        """
        Checks if a word is a lucuna,
        indicated with '&lac;' or '&lacfilm;' in the text.
        Returns:
        'lac' or '&lacfilm;': there is a lacuna
        '0': empty verse
        '0': Normal word
        
        """
        if self.word_string == '&lac;':
            self.word_string = ''
            return 'lac'
        
        elif self.word_string == '&lacfilm;':
            self.word_string = ''
            return 'lacfilm'
        
        elif self.word_string == '0':
            self.word_string = ''
            return 'empty'
        
        else:
            return ''
    
    def process_nomina_sacra(self):
        """
        If word has Nomina Sacra tags ('<NS>XXX</NS>'), these are removed.
        The cleaned string is returned, together with an integer:
        0: the word IS NOT a NS
        1: the word IS a NS
        """
        if self.word_string.startswith('<NS>'):
            self.word_string = self.word_string.replace('<NS>', '').replace('</NS>', '')
            return self.word_string, 1
        else:
            return self.word_string, 0
        
    

In [82]:
all_lacs = set()


In [83]:
def director(cv):
    
    no_line = 0
    
    #counter = dict(
    #  chapter=0,
    #  verse=0,
    #)
    cur = dict(
      document=None,
      book=None,
      chapter=None,
      verse=None,
      word=None
    )
    
    with open('transcriptions.txt', 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip('\n')
            
            if not line:
                continue
                
            if line.startswith('<S>'):
                
            
                for ntp in ('verse', 'chapter', 'book', 'document'):
                    cv.terminate(cur[ntp])
                    cur[ntp] = None         
                document_name = line.strip('<S>').strip('</S>')
                cur['document'] = cv.node('document')
                cv.feature(cur['document'], name=document_name)
                
                cur['book'] = cv.node('book')  
                cv.feature(cur['book'], book='letter_of_jude') 
                
                cur['chapter'] = cv.node('chapter')  
                cv.feature(cur['chapter'], chapter=1)
                
                
            elif line.startswith('<V '):
                
                # One line starts with: '<V [20]>'
                if line.startswith('<V ['):
                    line = line.replace('<V [', '<V ')
                if ']>' in line:
                    line = line.replace(']>', '>')
                
                verse_nr_re = re.compile(r'<V [0-9]+>')
                verse = verse_nr_re.search(line)
                verse = verse.group() 
                verse_number = verse.strip('<V ').strip('>')
                
                
                line = line.replace(verse, '').strip()
                if not line:
                    line = '0'

                verse = Verse(line)
                vtype = verse.process_lacunae()
                
                cur['verse'] = cv.node('verse')
                cv.feature(cur['verse'], verse=int(verse_number))
                cv.feature(cur['verse'], vtype=vtype)
                
                for w in line.split():
                    if 'lac' in w:
                        all_lacs.add(w)
                        
                    word = Word(w)
                    cur['word'] = cv.node('word')
                    
                    wtype = word.process_lacunae()
                    cv.feature(cur['word'], wtype=wtype)
                    
                    cleaned_w, ns = word.process_nomina_sacra()
                    cv.feature(cur['word'], ns=ns)
                    cv.feature(cur['word'], greek=cleaned_w)
                    
                    
                    for char in w:
                        
                        s = cv.slot()
                        cv.feature(s, greek=char)
                        cv.feature(s, stype=wtype)
                        
                    cv.terminate(cur['word'])

                cv.terminate(cur['verse'])
            
        for ntp in ('verse', 'chapter', 'book', 'document'):
            cv.terminate(cur[ntp])
            

In [84]:
cv = CV(TF)

good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
)

good

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.00s No structure nodes will be set up
   |   SECTION   TYPES:    book, chapter, verse
   |   SECTION   FEATURES: book, chapter, verse
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   line-default         greek
   |      |   text-orig-full       greek
   |     0.01s OK
   |     0.00s Following director... 
   |     5.58s "edge" actions: 0
   |     5.58s "feature" actions: 3709205
   |     5.58s "node" actions: 267497
   |     5.58s "resume" actions: 0
   |     5.58s "slot" actions: 1462661
   |     5.58s "terminate" actions: 268065
   |        564 x "book" node 
   |        564 x "chapter" node 
   |        564 x "document" node 
   |    1462661 x "sign" node  = slot type
   |      15224 x "verse" node 
   |     250581 x "word" node 
   |    1730158 nodes of all types
   |     5.71s OK
   |     0.00s checking for nodes and edges ... 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



   |     1.19s T stype                to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
   |     0.01s T verse                to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
   |     0.01s T vtype                to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
   |     0.21s T wtype                to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
   |     0.86s T oslots               to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
   |     0.00s M otext                to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf
  4.70s Exported 10 node features and 1 edge features and 1 config features to ~/Kopenhagen/KopenhagenResearch/NewTestament/LetterOfJude/tf


True

In [46]:
all_lacs

{'&lac;',
 '&lacfilm;',
 '&lacfilm;</T>',
 '&lacfilm;</U>',
 '<H>&lacfilm;</H>',
 '<U>&lacfilm;',
 'απ<H>&lacfilm;</H>',
 'γρα<U>&lacfilm;',
 'εξεχυ<H>&lacfilm;</H><C2>θησαν'}