In [1]:
from os import listdir
from os.path import isfile, join
import collections
import random

import gensim

from cltk.corpus.readers import get_corpus_reader
from cltk.tokenize.word import WordTokenizer
from cltk.stem.latin.j_v import JVReplacer

from pprint import pprint

In [2]:
latin_corpus = get_corpus_reader(corpus_name = 'latin_text_latin_library', language = 'latin')

In [3]:
print(latin_corpus)

<FilteredPlaintextCorpusReader in '/Users/jameswinestock 1/cltk_data/latin/text/latin_text_latin_library'>


In [4]:
files = latin_corpus.fileids()

In [5]:
livy = [file for file in files if 'livy' in file and 'per' not in file]

In [21]:
livy_words = latin_corpus.words(livy)

In [22]:
livy_words_list = list(livy_words)

In [23]:
print(livy_words_list[:10])

['Livy', ':', 'Book', 'I', 'TITI', 'LIVI', 'AB', 'VRBE', 'CONDITA', 'LIBER']


In [24]:
from cltk.stop.latin import STOPS_LIST
from cltk.stem.lemma import LemmaReplacer

low_livy_list = [word.lower() for word in livy_words_list]
lemmatizer = LemmaReplacer('latin')
lematize_livy = [lemmatizer.lemmatize(word) for word in low_livy_list]
S = STOPS_LIST
flat_list = [item for sublist in lematize_livy for item in sublist]
livy_stops_removed = [w for w in flat_list if w not in STOPS_LIST]
junk = ['cn.', 't.', 'q.', "'", 'm.', 'p.', '[', ']', '.', ',', ' ', ':', ';', 'qui1', '-', 'que', '$', '%', '&','*','+', '-', '/', '<', '=', '>', '@', '^', '_',  '`', '{', '|', '}', '~', '?', '!', '«', '»']
livy_junk_removed = [w for w in livy_stops_removed if w not in junk]
clean_livy = livy_junk_removed

In [25]:
print(clean_livy[:10])

['livy', 'book', 'eo1', 'titi', 'livi', 'vrbe', 'condio', 'libo1', 'eo1', '1']


In [6]:
word_tokenizer = WordTokenizer('latin')
replacer = JVReplacer()

In [7]:
# Preprocess texts

import html
import re
from cltk.stem.lemma import LemmaReplacer
lemmatizer = LemmaReplacer('latin')

def preprocess(text, remove_list=[]):

    if remove_list:
        for pattern in remove_list:
            text = re.sub(pattern, '', text)
    
    text = html.unescape(text) # Handle html entities
    text = re.sub(r'&nbsp;?', ' ',text) #&nbsp; stripped incorrectly in corpus?
    text = re.sub(r'\x00',' ',text) #Another space problem?
    
    text = text.lower()
    text = replacer.replace(text) #Normalize u/v & i/j

    
    
    punctuation ="\"#$%&\'()*+,-/:;<=>@[\]^_`{|}~.?!«»"
    translator = str.maketrans({key: " " for key in punctuation})
    text = text.translate(translator)
    
    
    translator = str.maketrans({key: " " for key in '0123456789'})
    text = text.translate(translator)
    
    text = re.sub('[ ]+',' ', text) # Remove double spaces
    text = re.sub('\s+\n+\s+','\n', text) # Remove double lines and trim spaces around new lines
    text = lemmatizer.lemmatize(text, return_string=True)
    
    return text

In [8]:
livy_raw = []

for file in livy:
    raw = latin_corpus.raw(file)
    start = raw.find('[ 1 ]')
    raw = raw[start:]
    raw = preprocess(raw, remove_list = [r'Livy', r'The Latin Library', r'The Classics Page'])
    livy_raw.append(raw.strip())

In [23]:
#print(livy_raw[1])

checkpoint text cleaned and lematized

In [13]:
import nltk
import numpy
import matplotlib
from nltk.probability import FreqDist

In [47]:
#livy_string = str(livy_raw)
livy_string = [str(para) for para in livy_raw]

In [50]:
print(livy_string[:10])

['1 jam primus omne sero1 consto troia capio in ceter saevio edo1 troianos duo aeneus antenorique et vetustus jus hospitium et quia pax reddo helenae semper auctoro sum1 omne jus bellus achiuos abstineo casus1 deinde varius1 antenorem cum multitudo ex-neo1 qui1 seditio ex paphlagonia pello et sedeo et dux rego pylaemene ad troiam amitto quaero venio in intimus mare hadriatici sinum euganeisque qui1 inter mare alpesque incolo1 pello ex-neo1 troianosque eo1 teneo terra et in qui1 primus egredior sum1 locus troia voco pagus indo troiano nomen edo1 gens universus ueneti appello aeneus ab similis clades domo profugus sed ad magnus res initio duco for primus in macedoniam venio indo in sicilio quaero sedeo defero ab sicilia classis ad laurentem ager teneo troia et hic loco nomen edo1 ibi egredior troiani ut qui1 ab immensus prope error nihil praeter armo et navo supersum cum praeda ex ager ago latinus rex aboriginesque qui1 tum is teneo loco ad arceo vis advena armo ex urbs atque ager concur

In [52]:
#tokens = nltk.word_tokenize(livy_string)
tokens = [nltk.word_tokenize(para) for para in livy_string]

In [68]:
print(tokens[34])

['1', 'sequor', 'hic', 'annus', 'nobilis', 'clades', 'romana', 'caudina', 'pax', 't', 'ueturio', 'caluino', 'sp', 'postumio', 'consul', 'samnites', 'eo1', 'adnato', 'imperator', 'c', 'pons', 'herenni', 'filius', 'habeo', 'pater', 'longus', 'prudens', 'nascor', 'primus', 'ipse', 'bellator', 'dux', 'eo1', 'ubi', 'lego1', 'qui1', 'ad', 'dedo', 'reor', 'mitto', 'sum1', 'pax', 'inficio', 'redeo', 'neo1', 'nihil', 'ago', 'inquam', 'hic', 'legatio', 'censeo1', 'expio', 'edo1', 'quisquis', 'ex', 'foedo', 'rumpo', 'ira', 'in', 'nos', 'caelestis', 'sum1', 'sero1', 'scio', 'quicumque', 'dives', 'chordus1', 'sum1', 'subigo', 'nos', 'ad', 'necessitas', 'dedo', 'reor', 'qui1', 'ab', 'nos', 'ex', 'foedo', 'repeto', 'sum1', 'is', 'non', 'sum1', 'chordus1', 'tam', 'superbus', 'ab', 'romanis', 'foedo', 'expiatio', 'sperno', 'quis1', 'enim', 'ulter', 'fio', 'ad', 'placo', 'deus', 'mitigo', 'homo', 'possum', 'qui1', 'qui1', 'nos', 'facio', 'reor', 'hostis', 'in', 'praeda', 'capio', 'qui1', 'bellus', 'jus'

In [35]:
mytext = nltk.Text(livy_raw)

In [37]:
#print(mytext[:10])

In [32]:
from pprint import pprint
from gensim import corpora
from collections import defaultdict
from gensim.corpora import Dictionary

In [59]:
dictionary = corpora.Dictionary(tokens)

In [61]:
print(dictionary)

Dictionary(15082 unique tokens: ['1', '10', '11', '12', '13']...)


In [63]:
print(dictionary.token2id)

{'1': 0, '10': 1, '11': 2, '12': 3, '13': 4, '14': 5, '15': 6, '16': 7, '17': 8, '18': 9, '19': 10, '2': 11, '20': 12, '21': 13, '22': 14, '23': 15, '24': 16, '25': 17, '26': 18, '27': 19, '28': 20, '29': 21, '3': 22, '30': 23, '31': 24, '32': 25, '33': 26, '34': 27, '35': 28, '36': 29, '37': 30, '38': 31, '39': 32, '4': 33, '40': 34, '41': 35, '42': 36, '43': 37, '44': 38, '45': 39, '46': 40, '47': 41, '48': 42, '49': 43, '5': 44, '50': 45, '51': 46, '52': 47, '53': 48, '54': 49, '55': 50, '56': 51, '57': 52, '58': 53, '59': 54, '6': 55, '60': 56, '7': 57, '8': 58, '9': 59, 'Hercules': 60, 'Penates': 61, 'a-migro': 62, 'a-sum1': 63, 'ab': 64, 'abdo': 65, 'abduco': 66, 'abeo': 67, 'aberat—consequi': 68, 'abhorreo': 69, 'abigo': 70, 'ablego': 71, 'abnuo': 72, 'aboleuit': 73, 'aborigines': 74, 'aboriginesque': 75, 'aboriginum': 76, 'abrogo': 77, 'absentiumque': 78, 'absolvo': 79, 'absonus': 80, 'abstergeo': 81, 'abstineo': 82, 'absum': 83, 'accedo': 84, 'accendo2': 85, 'accingo': 86, 'ac