In [None]:
# This notebook creates a word2vec model using the Bamman 2012 corpus lemmatized with TreeTagger

In [1]:
# Imports

import os
import re
import time
import multiprocessing

import gensim
from gensim.models import Word2Vec

#from cltk.stem.latin.j_v import JVReplacer
#from cltk.tokenize.sentence import TokenizeSentence
from cltk.sentence.lat import LatinPunktSentenceTokenizer
#from cltk.lemmatize.latin.backoff import BackoffLatinLemmatizer
from cltk.lemmatize.lat import LatinBackoffLemmatizer as BackoffLatinLemmatizer
from tqdm.notebook import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class JVReplacer:  # pylint: disable=too-few-public-methods
    """Normalize Latin orthography by mapping J/j to I/i and V/v to U/u.

    Classical Latin does not distinguish these letter pairs; the J and V
    spellings found in many editions come from later editorial habits.
    Applying the substitution during preprocessing collapses spelling
    variants of the same word into a single form.

    >>> replacer = JVReplacer()
    >>> replacer.replace("Julius Caesar")
    'Iulius Caesar'

    >>> replacer.replace("In vino veritas.")
    'In uino ueritas.'

    """

    def __init__(self):
        """Compile the four (regex, replacement) pairs once, in application order."""
        self.patterns = [
            (re.compile(source), target)
            for source, target in ((r"j", "i"), (r"v", "u"), (r"J", "I"), (r"V", "U"))
        ]

    def replace(self, text):
        """Return ``text`` with every j/v/J/V swapped for i/u/I/U."""
        result = text
        for compiled, target in self.patterns:
            result = compiled.sub(target, result)
        return result

In [3]:
# Set up NLP tools

# J/V -> I/U normalizer; preprocess() calls replacer.replace() on every line.
replacer = JVReplacer()
# CLTK backoff lemmatizer. NOTE(review): not referenced by the cells shown below,
# where lemmatization goes through TreeTagger instead — confirm whether it is still needed.
lemmatizer = BackoffLatinLemmatizer()
# CLTK Punkt sentence tokenizer; stands in for the commented-out TokenizeSentence('latin').
tokenizer = LatinPunktSentenceTokenizer() #TokenizeSentence('latin')

In [4]:
%%capture

# Specific imports

import treetaggerwrapper

# Create Latin tagger
# NB: TreeTagger must be installed for this to work properly;
# cf. https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/ and https://treetaggerwrapper.readthedocs.io/en/latest/
# Using the Latin parameter file at https://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/data/latin.par.gz
# If the TreeTagger install cannot be found, construction fails with
# "TreeTaggerError: Can't locate TreeTagger directory (and no TAGDIR specified)."
# NOTE(review): the message suggests passing TAGDIR=<install dir> here — confirm against the wrapper docs.

TT = treetaggerwrapper.TreeTagger(TAGLANG='la', TAGOPT='-token -lemma -sgml -quiet')

def lemmatize(text):
    """Return one lemma per line of TreeTagger output for ``text``.

    Each line returned by ``TT.tag_text`` that contains a tab is split on
    tabs; its third field is taken as the lemma, with every occurrence of
    the substring ``-a`` removed. Lines without a tab contribute the
    placeholder ``'<unknown>'``.

    Args:
        text: The string handed to the module-level TreeTagger instance ``TT``.

    Returns:
        A list of lemma strings, in the order the tagger produced its lines.
    """
    return [
        line.split('\t')[2].replace('-a', '') if '\t' in line else '<unknown>'
        for line in TT.tag_text(text)
    ]

TreeTaggerError: Can't locate TreeTagger directory (and no TAGDIR specified).

In [5]:
# Preprocess texts

# Built once instead of on every call: maps each punctuation mark (including the
# backslash and the guillemets) and each ASCII digit to a single space.
_PREPROCESS_TABLE = str.maketrans(
    {char: " " for char in "\"#$%&'()*+,-/:;<=>@[\\]^_`{|}~.?!«»" + "0123456789"}
)


def preprocess(text):
    """Normalize a raw line of text before lemmatization.

    Steps, in order:
      1. lowercase the text;
      2. apply the module-level ``replacer`` to normalize u/v and i/j;
      3. replace every punctuation mark and ASCII digit with a space
         (characters are blanked rather than deleted, so the string keeps
         its length and neighbouring words are not fused together).

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = replacer.replace(text)  # Normalize u/v & i/j
    # A single pass is equivalent to the former punctuation pass followed by
    # a digit pass, because the replacement character (a space) is in neither set.
    return text.translate(_PREPROCESS_TABLE)

In [None]:
# This step requires the Bamman corpus (latin_txt.tar.gz) to be downloaded, placed in the folder ../models/data/,
# and uncompressed; i.e., there should be a folder of files named ../models/data/latin_txt. The Bamman corpus can be
# downloaded from https://docs.google.com/uc?id=0B5pGKi0iCsnbZEdHZ3N6d216am8&export=download; for more information, see
# http://www.cs.cmu.edu/~dbamman/latin.html

In [None]:
# Helper iterable class that reads and processes the raw text one file at a time, to avoid memory issues.

class MySentences(object):
    """Re-iterable stream of lemmatized lines read from a corpus on disk.

    Every pass over the object re-reads the corpus from disk, so it can be
    iterated more than once. Each input line is run through ``preprocess``
    and then ``lemmatize``, and the resulting list of lemmas is yielded.

    Args:
        dirname: Path to a directory of UTF-8 text files, or to a single
            UTF-8 text file.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def _paths(self):
        """Return the corpus file paths to read, in sorted order."""
        # Accept a single file as well as a directory, so a path that points
        # straight at a .txt file does not fail inside os.listdir().
        if os.path.isfile(self.dirname):
            return [self.dirname]
        # Sorted for a reproducible order; sub-directories are skipped
        # because they cannot be opened as text files.
        candidates = (
            os.path.join(self.dirname, fname)
            for fname in sorted(os.listdir(self.dirname))
        )
        return [path for path in candidates if os.path.isfile(path)]

    def __iter__(self):
        for path in tqdm(self._paths()):
            with open(path, encoding='utf-8') as file:
                # Iterate the file handle lazily rather than calling
                # readlines(), which would load the whole file into memory.
                for sent in file:
                    yield lemmatize(preprocess(sent))

In [None]:
%%time

# Build Latin word2vec on Bamman data

# Leave one core free, but never request fewer than one worker:
# cores - 1 evaluates to 0 on a single-core machine.
cores = multiprocessing.cpu_count()

# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 renamed them
# to `vector_size` and `epochs` — confirm which gensim version this environment pins.
latin_w2v_model = Word2Vec(MySentences("../models/data/cc100-latin/la.nolorem.tok.latalphabetonly.v2.txt"), size = 50, min_count=100, workers=max(1, cores-1), iter =1)

In [None]:
# Write the trained model to disk.
# NOTE(review): the file name says "bamman", but the model above is built from a
# cc100-latin path — confirm which corpus this model is meant to represent.
latin_w2v_model.save("../models/latin_w2v_bamman_lemma_tt")