In [1]:
import collections
import re

https://annotation.github.io/text-fabric/tf/convert/walker.html

https://nbviewer.org/github/annotation/banks/blob/master/programs/convert.ipynb

Nodetypes:
1 sign
2 word
3 verse
4 chapter
5 book
6 document

dicts needed:
1. otype
2. oslots
3. book
4. chapter
5. verse
6. trans [transcription]
7. 


    

In [2]:
import os
import re

from tf.fabric import Fabric
from tf.convert.walker import CV

### Basic exploration of characters in document

In [12]:
char_set = set()

with open('transcriptions.txt', 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip('\n')
        for char in line:
            char_set.add(char)
            
#char_set

### Conversion to TF

In [3]:
BASE = os.path.expanduser('~/github')
ORG = 'KU'
REPO = 'NT'
RELATIVE = 'tf'

TF_DIR = os.path.expanduser(f'{BASE}/{ORG}/{REPO}/{RELATIVE}')

VERSION = '0.1'

TF_PATH = f'{TF_DIR}/{VERSION}'
TF = Fabric(locations=TF_PATH, silent=True)

In [4]:
slotType = 'sign'

In [5]:
generic = {
    'name': 'TRANSCRIPTION OF THE MANUSCRIPTS CONTAINING THE NEW TESTAMENT LETTER OF JUDE',
    'author': 'Tommy Wasserman',
    'contributors': 'Jan Krans, Dirk Roorda',
    'converter': 'Martijn Naaijer',
    'source': 'Data collected and edited by Tommy Wasserman',
    'url': 'https://easy.dans.knaw.nl/ui/datasets/id/easy-dataset:52008',
    'version': '0.1'
}

In [6]:
otext = {
    'fmt:text-orig-full': '{g_cons} ',
    'fmt:line-default': '{g_cons:XXX} ',
    'sectionTypes': 'book,chapter,verse',
    'sectionFeatures': 'book,chapter,verse',

}

# 'fmt:line-term': 'line#{terminator} ',
#    'structureTypes': 'book,chapter',
#    'structureFeatures': 'book,chapter',

In [7]:
intFeatures = {
  'chapter',
  'verse',
  'ns',
}

In [8]:
featureMeta = {
    'name': {
        'description': 'name of the codex or papyrus',
    },
    'book': {
        'description': 'name of a biblical book',
    },
    'chapter': {
        'description': 'chapter number in a biblical book',
    },
    'verse': {
        'description': 'verse number',
    },
    'g_cons': {
        'description': 'representation of a word in the text',
    },
    'ns': {
        'description': 'nomina sacra, divine name, often the name is abbreviated'
    },
    
    'character': {
        'description': 'greek letter',
    },
}

In [9]:
class Word:
    def __init__(self, word_string):
        """
        word_string: string representation of a Greek word in the text
        """
        self.word_string = word_string
        
        
    def process_nomina_sacra(self):
        """
        If word has Nomina Sacra tags ('<NS>XXX</NS>'), these are removed.
        The cleaned string is returned, together with an integer:
        0: the word IS NOT a NS
        1: the word IS a NS
        """
        if self.word_string.startswith('<NS>'):
            self.word_string = self.word_string.replace('<NS>', '').replace('</NS>', '')
            return self.word_string, 1
        else:
            return self.word_string, 0

In [12]:
def director(cv):
    #counter = dict(
    #  chapter=0,
    #  verse=0,
    #)
    cur = dict(
      document=None,
      book=None,
      chapter=None,
      verse=None,
      word=None
    )
    
    with open('transcriptions.txt', 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip('\n')
            
            if not line:
                continue
                
            if line.startswith('<S>'):
                
            
                for ntp in ('verse', 'chapter', 'book', 'document'):
                     cv.terminate(cur[ntp])
                     cur[ntp] = None         
                document_name = line.strip('<S>').strip('</S>')
                cur['document'] = cv.node('document')
                cv.feature(cur['document'], name=document_name)
                
                cur['book'] = cv.node('book')  
                cv.feature(cur['book'], book='letter_of_jude') 
                
                cur['chapter'] = cv.node('chapter')  
                cv.feature(cur['chapter'], chapter=1)
                
                
            elif line.startswith('<V '):
                
                # One line starts with: '<V [20]>'
                if line.startswith('<V ['):
                    line = line.replace('<V [', '<V ')
                if ']>' in line:
                    line = line.replace(']>', '>')
                
                verse_nr_re = re.compile(r'<V [0-9]+>')
                verse = verse_nr_re.search(line)
                verse = verse.group() 
                verse_clean = verse.strip('<V ').strip('>')
                
                line = line.replace(verse, '').strip()
                cur['verse'] = cv.node('verse')
                cv.feature(
                    cur['verse'],
                    verse=int(verse_clean),
                    )
                for w in line.split():
                    word = Word(w)
                    cur['word'] = cv.node('word')
                    
                    cleaned_w, ns = word.process_nomina_sacra()
                    cv.feature(cur['word'], ns=ns)
                    cv.feature(cur['word'], g_cons=cleaned_w)
                    
                    for char in w:
                        s = cv.slot()
                        cv.feature(s, character=char)
                    cv.terminate(cur['word'])

                cv.terminate(cur['verse'])
            
                
        for ntp in ('verse', 'chapter', 'book', 'document'):
            cv.terminate(cur[ntp])
            

In [13]:
cv = CV(TF)

good = cv.walk(
    director,
    slotType,
    otext=otext,
    generic=generic,
    intFeatures=intFeatures,
    featureMeta=featureMeta,
)

good

  0.00s Importing data from walking through the source ...
   |     0.00s Preparing metadata... 
   |     0.00s No structure nodes will be set up
   |   SECTION   TYPES:    book, chapter, verse
   |   SECTION   FEATURES: book, chapter, verse
   |   STRUCTURE TYPES:    
   |   STRUCTURE FEATURES: 
   |   TEXT      FEATURES:
   |      |   line-default         g_cons
   |      |   text-orig-full       g_cons
   |     0.00s OK
   |     0.00s Following director... 
   |     4.27s "edge" actions: 0
   |     4.27s "feature" actions: 1979890
   |     4.27s "node" actions: 267214
   |     4.27s "resume" actions: 0
   |     4.27s "slot" actions: 1462378
   |     4.27s "terminate" actions: 267782
   |        564 x "book" node 
   |        564 x "chapter" node 
   |        564 x "document" node 
   |    1462378 x "sign" node  = slot type
   |      15224 x "verse" node 
   |     250298 x "word" node 
   |    1729592 nodes of all types
   |     4.40s OK
   |     0.05s Removing unlinked nodes ... 
  

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



   |     0.02s T verse                to ~/github/KU/NT/tf/0.1
   |     0.85s T oslots               to ~/github/KU/NT/tf/0.1
   |     0.00s M otext                to ~/github/KU/NT/tf/0.1
  3.33s Exported 8 node features and 1 edge features and 1 config features to ~/github/KU/NT/tf/0.1


True