In [35]:
import os
from pprint import pprint
from bs4 import BeautifulSoup
from string import punctuation
punctuation += "—"
from collections import Counter
from tqdm.notebook import tqdm

In [16]:
import cltk

# POS-tagger

In [5]:
from cltk.corpus.utils.importer import CorpusImporter

corpus_importer = CorpusImporter('latin')
pprint(corpus_importer.list_corpora)
len(corpus_importer.list_corpora)

['latin_text_perseus',
 'latin_treebank_perseus',
 'latin_text_latin_library',
 'phi5',
 'phi7',
 'latin_proper_names_cltk',
 'latin_models_cltk',
 'latin_pos_lemmata_cltk',
 'latin_treebank_index_thomisticus',
 'latin_lexica_perseus',
 'latin_training_set_sentence_cltk',
 'latin_word2vec_cltk',
 'latin_text_antique_digiliblt',
 'latin_text_corpus_grammaticorum_latinorum',
 'latin_text_poeti_ditalia',
 'latin_text_tesserae']


16

In [14]:
corpus_importer.import_corpus('latin_models_cltk')

In [5]:
from cltk.tag.pos import POSTag

tagger = POSTag('latin')

tagger.tag_ngram_123_backoff('prohibeo !\nCaesar')

[('prohibeo', None), ('!', None), ('Caesar', None)]

In [6]:
def text_and_POStag_text(file):  
    with open(f'perseus_tagged_corpus/{file}', encoding='utf-8') as f:
        text = f.read()
        soup = BeautifulSoup(text, 'xml')
        sentences = []
        POStag_sentences = []
        for sent in soup.find_all('sentence'):
            wordforms = []
            POStags = []
            for w in sent.find_all('word'):
                try:
                    if w.attrs['postag'] and w.attrs['form'] not in punctuation:
                        form = w.attrs['form']
                        for punct in punctuation:
                            form = form.replace(punct, '')
                        wordforms.append(form)
                        POStags.append(w.attrs['postag'])
                except:
                    pass
            sentences.append(wordforms)
            POStag_sentences.append(POStags)
    return (sentences, POStag_sentences)

In [20]:
tagged_perseus_files = os.listdir('perseus_tagged_corpus')

In [21]:
tagged_perseus_files

['phi0690.phi003.perseus-lat1.tb.xml',
 'perseus-lattb.1248.1.xml',
 'perseus-lattb.2219.1.xml',
 'phi0631.phi001.perseus-lat1.tb.xml',
 'phi0972.phi001.perseus-lat1.tb.xml',
 'phi0474.phi013.perseus-lat1.tb.xml',
 'phi0620.phi001.perseus-lat1.tb.xml',
 'phi0448.phi001.perseus-lat1.tb.xml',
 'phi0959.phi006.perseus-lat1.tb.xml',
 'tlg0031.tlg027.perseus-lat1.tb.xml']

In [9]:
texts = {}
POStag_texts = {}
for file in tagged_perseus_files:
    (sentences, POStag_sentences) = text_and_POStag_text(file)
    texts[file] = sentences
    POStag_texts[file] = POStag_sentences

In [10]:
for file in texts.keys():
    with open(f'perseus_text/{file[:-4]}.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join([' '.join(sent) for sent in texts[file]]))
for file in POStag_texts.keys():
    with open(f'perseus_tags/{file[:-4]}.txt', 'w', encoding='utf-8') as f:
        f.write('\n'.join([' '.join(sent) for sent in POStag_texts[file]]))

In [22]:
def sentence_tagger_stats(POS_tagger, files):
    percent_sum = 0
    for file in files:
        with open(f'perseus_text/{file[:-4]}.txt', encoding='utf-8') as f:
            text = f.read()
            spl_text = text.split()
        with open(f'perseus_tags/{file[:-4]}.txt', encoding='utf-8') as f:
            spl_tags = f.read().split()
        cltk_tagged = []
        for sent in text.split('\n'):
            tagged_sent = POS_tagger(sent)
            cltk_tagged.extend(tagged_sent)
        correct_tag_count = 0
        total_tag_count = 0
        wrong_tags = []
        for i, (word, tag) in enumerate(cltk_tagged):
                correct = False
                if not tag or tag == 'Unk' and spl_tags[i] == 'u--------':
                    correct = True
                    correct_tag_count += 1
                elif tag and tag.lower() == spl_tags[i]:
                    correct = True
                    correct_tag_count += 1
                else:
                    wrong_tags.append((word, tag))
                if word != spl_text[i]:
                    print(word + ' ' + spl_text[i])
                #if spl_text[i] == 'que'
                total_tag_count += 1
        print(f'{file}:\nТОЧНОСТЬ: {correct_tag_count/total_tag_count}\n')
        percent_sum += correct_tag_count/total_tag_count
        pprint(Counter(wrong_tags).most_common(10))
        print()
    print(f'СРЕДНЯЯ ТОЧНОСТЬ: {percent_sum/len(files)}')

In [23]:
sentence_tagger_stats(tagger.tag_ngram_123_backoff, tagged_perseus_files)

phi0690.phi003.perseus-lat1.tb.xml:
ТОЧНОСТЬ: 0.8931198615318044

[(('ea', 'P-S---FB-'), 3),
 (('quam', 'C--------'), 3),
 (('ardens', 'T-SPPAFN-'), 2),
 (('miseratus', 'T-SRPPMN-'), 2),
 (('hoc', 'P-S---NA-'), 2),
 (('ingens', 'A-S---MN-'), 2),
 (('cum', 'R--------'), 2),
 (('dictis', 'T-PRPPNB-'), 2),
 (('horrendas', 'T-PPGPFA-'), 2),
 (('involvens', 'T-SPPAMN-'), 2)]

perseus-lattb.1248.1.xml:
ТОЧНОСТЬ: 0.8428519461622408

[(('inquit', 'V3SPIA---'), 53),
 (('quod', 'C--------'), 18),
 (('ne', 'D--------'), 15),
 (('qui', 'P-S---MN-'), 7),
 (('quae', 'P-S---FN-'), 6),
 (('suos', 'A-P---MA-'), 6),
 (('mea', 'A-S---FN-'), 6),
 (('dedit', 'V3SPIA---'), 5),
 (('genus', 'N-S---NN-'), 5),
 (('tua', 'A-S---FN-'), 5)]

perseus-lattb.2219.1.xml:
ТОЧНОСТЬ: 0.8571036752605595

[(('etiam', 'D--------'), 43),
 (('ne', 'D--------'), 38),
 (('cum', 'C--------'), 17),
 (('se', 'P-S---MA-'), 13),
 (('p', '---------'), 11),
 (('quo', 'P-S---NB-'), 11),
 (('quam', 'C--------'), 9),
 (('quam', 'D-------

In [24]:
sentence_tagger_stats(tagger.tag_tnt, tagged_perseus_files)

phi0690.phi003.perseus-lat1.tb.xml:
ТОЧНОСТЬ: 0.7858070099524016

[(('Aeneas', 'Unk'), 10),
 (('O', 'Unk'), 7),
 (('Sibyllae', 'Unk'), 5),
 (('Tum', 'Unk'), 5),
 (('Tu', 'Unk'), 4),
 (('At', 'Unk'), 3),
 (('Triviae', 'Unk'), 3),
 (('In', 'Unk'), 3),
 (('Phoebi', 'Unk'), 3),
 (('Non', 'Unk'), 3)]

perseus-lattb.1248.1.xml:
ТОЧНОСТЬ: 0.40887595489268824

[(('inquit', 'V3SPIA---'), 53),
 (('Et', 'Unk'), 43),
 (('Qui', 'Unk'), 20),
 (('quod', 'C--------'), 17),
 (('Sed', 'Unk'), 16),
 (('Hoc', 'Unk'), 14),
 (('Quod', 'Unk'), 13),
 (('ne', 'D--------'), 13),
 (('Nec', 'Unk'), 13),
 (('Tunc', 'Unk'), 12)]

perseus-lattb.2219.1.xml:
ТОЧНОСТЬ: 0.4506308283049918

[(('etiam', 'D--------'), 45),
 (('ne', 'D--------'), 30),
 (('uel', 'Unk'), 23),
 (('M', 'Unk'), 20),
 (('neque', 'Unk'), 15),
 (('se', 'P-S---MA-'), 12),
 (('quam', 'C--------'), 12),
 (('Antonio', 'Unk'), 11),
 (('gessit', 'Unk'), 11),
 (('C', 'Unk'), 10)]

phi0631.phi001.perseus-lat1.tb.xml:
ТОЧНОСТЬ: 0.8592721287490855

[(('Catil

In [12]:
def tagger_stats(POS_tagger, files):
    percent_sum = 0
    for file in files:
        with open(f'perseus_text/{file[:-4]}.txt', encoding='utf-8') as f:
            spl_text = f.read().split()
        cltk_tagged = POS_tagger(' '.join(spl_text))
        with open(f'perseus_tags/{file[:-4]}.txt', encoding='utf-8') as f:
            spl_tags = f.read().split()
        correct_tag_count = 0
        total_tag_count = 0
        wrong_tags = []
        for i, (word, tag) in tqdm(enumerate(cltk_tagged)):
                correct = False
                if not tag or tag == 'Unk' and spl_tags[i] == 'u--------':
                    correct = True
                    correct_tag_count += 1
                elif tag and tag.lower() == spl_tags[i]:
                    correct = True
                    correct_tag_count += 1
                else:
                    wrong_tags.append((word, tag))
                if word != spl_text[i]:
                    print(word + ' ' + spl_text[i])
                #if spl_text[i] == 'que'
                total_tag_count += 1
        print(f'{file}:\nТОЧНОСТЬ: {correct_tag_count/total_tag_count}\n')
        percent_sum += correct_tag_count/total_tag_count
        pprint(Counter(wrong_tags).most_common(10))
        print()
    print(f'СРЕДНЯЯ ТОЧНОСТЬ: {percent_sum/len(files)}')

# Макронайзер

In [2]:
from cltk.prosody.latin.macronizer import Macronizer

In [19]:
macrons = {"ā": "a",
           "ē": "e",
           "ī": "i",
           "ō": "o",
           "ū": "u",
           "Ā": "a",
           "Ē": "e",
           "Ī": "i",
           "Ō": "o",
           "Ū": "u"}

In [3]:
macronizer = Macronizer('tag_ngram_123_backoff')

In [30]:
Aeneid_text_premacronized = """Arma virumque canō, Trōiae quī prīmus ab ōrīs
Ītaliam, fātō profugus, Lāvīniaque vēnit
lītora, multum ille et terrīs iactātus et altō
vī superum saevae memorem Iūnōnis ob īram;
multa quoque et bellō passūs, dum conderet urbem,
inferretque deōs Latiō, genus unde Latīnum,
Albānīque patrēs, atque altae moenia Rōmae.
Mūsa, mihī causās memorā, quō nūmine laesō,
quidve dolēns, rēgīna deum tot volvere cāsūs
īnsīgnem pietāte virum, tot adīre labōrēs
impulerit. Tantaene animīs caelestibus īrae?
Urbs antīqua fuit, Tyriī tenuēre colōnī,
Karthāgō, Ītaliam contrā Tiberīnaque longē
ōstia, dīves opum studiīsque asperrima bellī,
quam Iūnō fertur terrīs magis omnibus ūnam
posthabitā coluisse Samō; hīc illius arma,
hīc currus fuit; hōc rēgnum dea gentibus esse,
sī quā Fāta sinant, iam tum tenditque fovetque.
Prōgeniem sed enim Trōiānō ā sanguine dūcī
audierat, Tyriās olim quae verteret arcēs;
hinc populum lātē regem bellōque superbum
ventūrum excidiō Libyae: sīc volvere Parcās.
Id metuēns, veterisque memor Sāturnia bellī,
prīma quod ad Trōiam prō cārīs gesserat Argīs—
necdum etiam causae īrārum saevīque dolōrēs
exciderant animō: manet altā mente repostum
iūdicium Paridis sprētaeque iniūria fōrmae,
et genus invīsum, et raptī Ganymēdis honōrēs.
Hīs accēnsa super, iactātōs aequore tōtō
Trōas, rēliquiās Danaum atque immītis Achillī,
arcēbat longē Latiō, multōsque per annōs
errābant, āctī Fātīs, maria omnia circum.
Tantae mōlis erat Rōmānam condere gentem!"""

In [31]:
Aeneid_text = Aeneid_text_premacronized.replace("ō", "o").replace("ē", "e").replace("ā", "a").replace("ī", "i").replace("ū", "u").replace("Ō", "O").replace("Ē", "E").replace("Ā", "A").replace("Ī", "I").replace("Ū", "U")



In [16]:
print(macronizer.macronize_text(Aeneid_text))

arma virumque cano , trojae quī prīmus ab ōrīs i_taliam , fato profugus , laviniaque venit lītora , multum ille et terrīs iactatus et altō vī superum saevae memorem jūnōnis ob īram ; multa quoque et bellō passūs , dum conderet urbem , inferretque deōs lātiō , genus unde latinum , albanique patrēs , atque altae moenia Rōmae . musa , mihi causās memora , quō nūmine laeso , quidve dolēns , rēgīna deum tot volvere cāsūs insignem pietāte virum , tot adīre labōrēs impulerit . tantaene animīs caelestibus īrae ? urbs antīqua fuit , tyrii tenuēre colōnī , karthago , i_taliam contrā tiberinaque longē ōstia , dīves opum studiisque asperrima bellī , quam jūnō fertur terrīs magis omnibus ūnam posthabita coluisse samo ; hic illīus arma , hic currūs fuit ; hoc rēgnum dea gentibus esse , sī qua fāta sinant , jam tum tenditque fovetque . progeniem sed enim troiano ā sanguine dūcī audierat , tyrias ōlim quae verteret arcēs ; hinc populum lātē rēgem belloque superbum ventūrum excidio libyae : sīc volvere

In [36]:
Aeneid_text_premacronized_modified = ""
for symbol in Aeneid_text_premacronized:
    if symbol.upper() == symbol and symbol in macrons.keys():
        Aeneid_text_premacronized_modified += macrons[symbol]+"_"
    elif symbol in punctuation:
        Aeneid_text_premacronized_modified += " "+symbol
    elif symbol == "\n":
        Aeneid_text_premacronized_modified += " "
    else:
        Aeneid_text_premacronized_modified += symbol
Aeneid_text_premacronized_modified = Aeneid_text_premacronized_modified.lower()

In [25]:
Aeneid_text_premacronized_modified

'arma virumque canō , trōiae quī prīmus ab ōrīs i_taliam , fātō profugus , lāvīniaque vēnit lītora , multum ille et terrīs iactātus et altō vī superum saevae memorem iūnōnis ob īram ; multa quoque et bellō passūs , dum conderet urbem , inferretque deōs latiō , genus unde latīnum , albānīque patrēs , atque altae moenia rōmae .  mūsa , mihī causās memorā , quō nūmine laesō , quidve dolēns , rēgīna deum tot volvere cāsūs īnsīgnem pietāte virum , tot adīre labōrēs impulerit . tantaene animīs caelestibus īrae ?  urbs antīqua fuit , tyriī tenuēre colōnī , karthāgō , i_taliam contrā tiberīnaque longē ōstia , dīves opum studiīsque asperrima bellī , quam iūnō fertur terrīs magis omnibus ūnam posthabitā coluisse samō ; hīc illius arma , hīc currus fuit ; hōc rēgnum dea gentibus esse , sī quā fāta sinant , iam tum tenditque fovetque . prōgeniem sed enim trōiānō ā sanguine dūcī audierat , tyriās olim quae verteret arcēs ; hinc populum lātē regem bellōque superbum ventūrum excidiō libyae : sīc volv

In [46]:
def word_from_letter_index(i, text):
    word = ""
    left = ""
    right = ""
    for symbol in text[i-1::-1]:
        if symbol not in " \n":
            left += symbol
        else:
            break
    for symbol in text[i+1:]:
        if symbol not in " \n":
            right += symbol
        else:
            break
    word = left[::-1]+text[i]+right
    return word

In [50]:
def vowel_count_latin(text):
    vowel_count = 0
    for symbol in text:
        if symbol.lower() in "aeiouāēīōū":
            vowel_count += 1
    return vowel_count

In [53]:
wrong_vowel_count = 0
for i, symbol in enumerate(macronizer.macronize_text(Aeneid_text)):
    if symbol.lower() in "aeiouāēīōū" and symbol != Aeneid_text_premacronized_modified[i]:
        print(word_from_letter_index(i, macronizer.macronize_text(Aeneid_text))+" "+word_from_letter_index(i, Aeneid_text_premacronized_modified))
        wrong_vowel_count += 1

cano canō
trojae trōiae
fato fātō
fato fātō
laviniaque lāvīniaque
laviniaque lāvīniaque
venit vēnit
iactatus iactātus
lātiō latiō
latinum latīnum
albanique albānīque
albanique albānīque
musa mūsa
mihi mihī
memora memorā
laeso laesō
insignem īnsīgnem
insignem īnsīgnem
tyrii tyriī
karthago karthāgō
karthago karthāgō
tiberinaque tiberīnaque
studiisque studiīsque
posthabita posthabitā
samo samō
hic hīc
illīus illius
hic hīc
currūs currus
hoc hōc
qua quā
progeniem prōgeniem
troiano trōiānō
troiano trōiānō
troiano trōiānō
tyrias tyriās
ōlim olim
rēgem regem
belloque bellōque
excidio excidiō
parcas parcās
troiam trōiam
caris cārīs
caris cārīs
argis argīs
irarum īrārum
irarum īrārum
saevique saevīque
alta altā
spretaeque sprētaeque
rapti raptī
ganymedis ganymēdis
iactatos iactātōs
iactatos iactātōs
totō tōtō
troas trōas
reliquiās rēliquiās
immitis immītis
arcebat arcēbat
lātiō latiō
multosque multōsque
errabant errābant
acti āctī
acti āctī
molis mōlis


In [54]:
# такой процень гласных угадывает макронайзер
wrong_vowel_count / vowel_count_latin(Aeneid_text_premacronized_modified)

0.11565836298932385