In [1]:
### import dependencies 

import re

from cltk.corpus.readers import get_corpus_reader

In [2]:
# Get corpus

latin_corpus = get_corpus_reader(corpus_name = 'latin_text_latin_library', language = 'latin')

In [3]:
### check file path
print(latin_corpus)

<FilteredPlaintextCorpusReader in '/Users/diyclassics/cltk_data/latin/text/latin_text_latin_library'>


In [4]:
### check how many items in corpus

# PJB: Use fileids to get this number faster

len(list(latin_corpus.fileids()))

2141

In [5]:
### show first 10 file ids that you can use in the text reader
print(latin_corpus.fileids()[:10])

['12tables.txt', '1644.txt', 'abbofloracensis.txt', 'abelard/dialogus.txt', 'abelard/epistola.txt', 'abelard/historia.txt', 'addison/barometri.txt', 'addison/burnett.txt', 'addison/hannes.txt', 'addison/machinae.txt']


In [6]:
### set a variable that is the list of all file names so we can iterate over it
files = latin_corpus.fileids()

In [7]:
### iterate over the files list and return only files which contain Livy and don't contain per

# PJB: livy_files would be more accurate

livy_files = [file for file in files if 'livy' in file and 'per' not in file]

In [8]:
# livy_path_sorted = sorted(livy_path)
# print(livy_path_sorted)

# PJB: See below for a book sort; with a workaround here for the Preface

livy_files.remove('livy/liv.pr.txt')
livy_files_order = [int(" ".join(re.findall(r'\d+', item))) for item in livy_files]
livy_files_sorted = ['livy/liv.pr.txt']
livy_files_sorted += [x for _, x in sorted(zip(livy_files_order, livy_files))]
print(livy_files_sorted)

['livy/liv.pr.txt', 'livy/liv.1.txt', 'livy/liv.2.txt', 'livy/liv.3.txt', 'livy/liv.4.txt', 'livy/liv.5.txt', 'livy/liv.6.txt', 'livy/liv.7.txt', 'livy/liv.8.txt', 'livy/liv.9.txt', 'livy/liv.10.txt', 'livy/liv.21.txt', 'livy/liv.22.txt', 'livy/liv.23.txt', 'livy/liv.24.txt', 'livy/liv.25.txt', 'livy/liv.26.txt', 'livy/liv.27.txt', 'livy/liv.28.txt', 'livy/liv.29.txt', 'livy/liv.30.txt', 'livy/liv.31.txt', 'livy/liv.32.txt', 'livy/liv.33.txt', 'livy/liv.34.txt', 'livy/liv.35.txt', 'livy/liv.36.txt', 'livy/liv.37.txt', 'livy/liv.38.txt', 'livy/liv.39.txt', 'livy/liv.40.txt', 'livy/liv.41.txt', 'livy/liv.42.txt', 'livy/liv.43.txt', 'livy/liv.44.txt', 'livy/liv.45.txt']


In [9]:
### check that we're getting what we want; yep Livy books 1-45 n.b 11-20 don't exist so won't show up plus 1 is .
#print(livy_path_sorted)

In [10]:
# check that 36 texts show up i.e. books 1-10;21-45 and the preface
len(livy_files_sorted) == 36

True

In [11]:
#use the .words method from the cltk corpus reader function to read the texts in Livy
#and return each word in each text
livy_words = latin_corpus.words(livy_files)
livy_words

<generator object FilteredPlaintextCorpusReader.words at 0x1121f0d68>

In [12]:
### convert the generator object into a list of words
livy_words_list = list(livy_words)

In [13]:
#show the first 100 words so we can get an idea for the data we have
# print(livy_words_list[:100])

In [14]:
# convert the words into lower case
low_livy_list = [word.lower() for word in livy_words_list]

In [15]:
#check the result
#print(low_livy_list[:100])

In [16]:
#import lematizer

# PJB: Deprecated; different code below; make sure to reload the latin models corpus
# from cltk.corpus.utils.importer import CorpusImporter
# corpus_importer.import_corpus('latin_models_cltk')

# from cltk.stem.lemma import LemmaReplacer

In [17]:
# # tell the lematizer it will be the latin version as opposed to e.g. Ancient Greek
# lemmatizer = LemmaReplacer('latin')


In [18]:
#iterate over list of words and lemamatize each
# lematize_livy = [lemmatizer.lemmatize(word) for word in low_livy_list]

In [19]:
#print(lematize_livy[:100])

In [20]:
# New lemmatizer code

from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
lemmatizer = BackoffLatinLemmatizer()
lemmas = lemmatizer.lemmatize(low_livy_list)
print(lemmas[:10])

[('livy', 'livy'), (':', 'punc'), ('book', 'book'), ('i', 'eo'), ('titi', 'titus'), ('livi', 'livi'), ('ab', 'ab'), ('vrbe', 'vrbe'), ('condita', 'condio'), ('liber', 'liber')]


still working, checkpoint

In [21]:
# import cltk's pre-made stopword list
from cltk.stop.latin import STOPS_LIST

In [22]:
S = STOPS_LIST

In [23]:
# check what they have in their list
#print(S)

In [24]:
# when we lematized the list it wrapped it in a second list so we have to get back to just one list
flat_list = [item for sublist in lemmas for item in sublist]

In [25]:
# check that flat list
#print(flat_list[:100])

In [26]:
livy_stops_removed = [w for w in flat_list if w not in STOPS_LIST]

In [27]:
# declare my own list of junk words or symbols not caught by cltk
junk = ['cn.', 't.', 'q.', "'", 'm.', 'p.', '[', ']', '.', ',', ' ', ':', ';', 'qui1', '-', 'que', '$', '%', '&','*','+', '-', '/', '<', '=', '>', '@', '^', '_',  '`', '{', '|', '}', '~', '?', '!', '«', '»']
livy_junk_removed = [w for w in livy_stops_removed if w not in junk]

In [28]:
#compare the number of words before and after the junk is removed
len(livy_stops_removed)

947872

In [29]:
len(livy_junk_removed)

866395

In [30]:
# re-declare the variable so it makes more sense
clean_livy = livy_junk_removed

In [31]:
# this base python method will help us create a dictionary of word frequencies
from collections import Counter

In [32]:
# apply the counter to our cleaned text
livy_word_counts_counter = Counter(clean_livy)

In [33]:
# it works, just put whatever word you want in and it will return its word count
print(livy_word_counts_counter['manubiae'])

9


^^^ checkpoint: lematized, cleaned, counter working. ^^^

In [34]:
import nltk
import numpy
import matplotlib
from nltk.probability import FreqDist

ModuleNotFoundError: No module named 'numpy'

In [None]:
# now to interact with the more interesting natural language processing libraries we need to convert our processed text
# into nltk tokens.
mytext = nltk.Text(clean_livy)

In [None]:
print(mytext[:10])

In [None]:
# use the nltk dispersion plot function to show how words we are interested in are spread across a text
# n.b must run TWICE before it works
mytext.dispersion_plot(["praeda", "manubiae", "aurum", "argentum", "stipendium", "tribuo", "bos", "ager", "consul"])

In [None]:
fdist1 = FreqDist(mytext)

In [None]:
print(fdist1)

In [None]:
# let's get a list of word counts throughout Livy
# praeda is in the top 250 nouns or verbs list
# we could further refine this to work with only nouns
fdist1.most_common(250)

In [None]:
# let's look at a graph of word occurences
fdist1.plot(50, cumulative=False)

In [None]:
my_bigrams = list(nltk.bigrams(mytext))

In [None]:
print(my_bigrams[100:110])

In [None]:
def find_bigrams(keyword):
    for tuple in my_bigrams:
        if keyword in tuple:
            print(tuple)

In [None]:
find_bigrams("praeda")

In [None]:
# we are now going to build a netword of words in Livy using the co-occurence method
import networkx as nx

In [None]:
# generate a network whereby each word is connected by an edge to the words either side of it
G=nx.Graph()
for i in range(1, len(mytext)):
    G.add_edge(mytext[i-1],mytext[i])
print(nx.info(G))

In [None]:
# how many connections does each word have?
degree = nx.degree(G)

In [None]:
#degree

In [None]:
sorted_degree = sorted(dict(nx.degree(G)).items(),key=lambda x:x[1], reverse=True)

In [None]:
#show the 250 words with most connections
#these ranks will be similar to our word frequency score
for word, degree in sorted_degree[:250]:
        print(word, degree)

In [None]:
# now we look for degree centrality. That, similarly, is a score of how close to the middle of a network a node is.
sorted_degree_centrality = sorted(nx.degree_centrality(G).items(),key=lambda x:x[1], reverse=True)

In [None]:
sorted_degree_centrality

In [None]:
# betweeness is more interesting though. It finds the shortest path between each node on a network and then tells us
# how frequently a node is on that shortest path
# N.b. this is a VERY long computation so don't run it unless you really want to know the betweeness centrality!
#sorted_betweeness = sorted(nx.betweenness_centrality(G).items(),key=lambda x:x[1], reverse=True)

In [None]:
sorted_betweeness

In [None]:
# save our network to a file that other applications can undertsand.
nx.write_gexf(G, "livy_network.gexf")