# Data Augmentation

Methods:
* Replace only Etruscan proper names -> for LM
* Replace both Etruscan and English proper names -> for MT
* Use category / POS tags instead of actual words

Abbreviations are used only for proper names.

In [1]:
import pandas as pd
import numpy as np
import nltk
import sys
sys.path.append("../..")
import utils
import random
import re
from collections import defaultdict, deque
from typing import Tuple, List, Dict, Optional, Union
from tqdm import tqdm
tqdm.pandas()

## Load data

In [2]:
docs = pd.read_csv("../Etruscan.csv", index_col=0)
docs["Etruscan"] = docs["Etruscan"].apply(lambda x: utils.replace(x, utils.to_latin))
docs["Translation"] = docs["Translation"].apply(lambda x: np.nan if x is np.nan else x.lower().strip())
docs

Unnamed: 0,ID,City,Year - From,Year - To,Etruscan,Translation,key
0,ETP 192,Ager Tarquiniensis,275.0,250.0,cleusinas laris larisal clan,"laris cleusinas, son of laris.",
1,Cr 2.20,Caere,675.0,650.0,mi karkanas thahvna,i (am) the container of karkana,
2,Cm 2.46,Campania,500.0,450.0,mi e.i. mi.n.pi capi mi numar thevru.c.l.na...,'don't take me. i (am) nunar. (i am the proper...,
3,ETP 269,,625.0,600.0,mini muluvanice tetana ve.l.ka.s.na.s. veleli...,tetana velkasnas gave me to velellia.,
4,Ta 3.2,Tarquinia,580.0,580.0,itun turuce vene.l a.telinas. tinas dlniiaras,venel atelinas dedicated this (vase) to the so...,
...,...,...,...,...,...,...,...
5815,20926,,,,reithu,,1
5819,20982,,,,ecnatnial,,2
5822,21003,,,,ivnii,,1C
5830,21065,,,,marces,mr-marces,1A


In [3]:
vocab = utils.load_pos("../ETP_POS.csv")
vocab["Etruscan"] = vocab["Etruscan"].apply(lambda x: utils.replace(x, utils.to_latin))
vocab

Unnamed: 0,Etruscan,Translations,POS,Is inferred,Is suffix,Abbreviation of,Suffix indexes,city name,place name,name,...,1st pert,2nd pert,1st pers,2nd pers,3rd pers,pl,gen,abl,pert,TAG
0,isa,"((True, the),)",def art nom,False,True,,,,,,...,0.0,0.0,,,,,False,False,False,ADP
1,isha,"((True, the),)",def art nom,False,True,,,,,,...,0.0,0.0,,,,,False,False,False,PRON
2,x,"((True, and),)",enclitic conj,False,True,,,,,,...,,,,,,,False,False,False,PRT
3,ishla,"((True, the),)",def art 2nd gen,False,True,,,,,,...,0.0,0.0,,,,,True,False,False,DET
4,cle,"((True, the),)",dem pro loc,False,True,,,,,,...,0.0,0.0,,,,,False,False,False,ADP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,fleres,"((True, divine spirit), (True, divinity))",1st gen,False,False,,,,,,...,0.0,0.0,,,,,True,False,False,NOUN
1118,flerthrce,(),past act,False,False,,,,,,...,,,,,,,False,False,False,VERB
1119,fratuce,"((False, incised),)",past act,False,False,,,,,,...,,,,,,,False,False,False,VERB
1120,frontac,"((True, of lightning),)",nom acc,False,False,,,,,,...,0.0,0.0,,,,,False,False,False,ADJ


In [4]:
name_columns = ["city name", "place name", "name", "epithet", "theo", "cogn", "prae", "nomen"]

def is_proper_name(df: pd.DataFrame, include_abbreviation: bool=False) -> pd.Series:
    """
    Check if the entry is a proper name.

    Args:
        df: POS dataframe
        include_abbreviation: whether to include abbreviated names in the name mask
    Returns:
        Bool mask that selects the proper names
    """
    mask = df[name_columns].apply(pd.Series.any, axis = 1)
    if not include_abbreviation:
        mask = mask & df["Abbreviation of"].isna()  # If NA -> not an abbreviation
    return mask


In [5]:
def compute_index(df: pd.DataFrame) -> Tuple[List[int], Dict[Tuple[bool], int]]:
    """
    Compute the indexes for the proper names. Names with the same characteristics 
    have the same index.

    Args:
        df: dataframe with the category columns of the proper names (i.e., exclude "Translation", "POS", etc..., keep "nom", "acc", etc...)
    Return:
        Tuple with list of indexes and dictionary with tuple describing the name and the index.
    """
    indexes = []
    map_ = {}
    current_index = -1
    for row in df.iloc:
        tmp = tuple((row >= 0.5).to_list())        
        candidate = map_.get(tmp)
        if candidate is None: # New item
            current_index += 1
            indexes.append(current_index)
            map_[tmp] = current_index
        else: # Not new
            indexes.append(candidate)
    
    return indexes, map_

def expand_index(df: pd.DataFrame) -> pd.DataFrame:
    """ 
    Expand the translations: a single translation for each row.

    Args:
        df: dataframe with Etruscan, Translations and Index
    Returns:
        Dataframe with expanded translations
    """
    tmp = []
    for row in df.iloc:
        if len(row["Translations"]) == 0:
            tmp.append((row["Etruscan"], np.nan, row["Index"]))
        else:
            for t in row["Translations"]:
                tmp.append((row["Etruscan"], t[1], row["Index"]))
    return pd.DataFrame.from_records(tmp, columns=["Etruscan", "Translations", "Index"])

def create_index(df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series, Dict[Tuple[bool], int]]:
    """
    Create the index dataframe.

    Args:
        df: POS dataframe
    Returns:
        Index dataframe, name mask, index map
    """
    name_mask = is_proper_name(df)
    indexes, map_ = compute_index(df[name_mask][utils.tags])

    index_df = pd.DataFrame({"Etruscan": df[name_mask]["Etruscan"], "Translations": df[name_mask]["Translations"], "Index": indexes}).reset_index(drop=True)
    index_df = expand_index(index_df)
    return index_df, name_mask, map_
        

In [6]:
index_df, name_mask, map_ = create_index(vocab)
print("Indexes:", index_df["Index"].max()+1)
index_df

Indexes: 45


Unnamed: 0,Etruscan,Translations,Index
0,capue,in capua,0
1,enash,ena,1
2,veldthi,velca,2
3,vipshl,vipsa,2
4,kamarte,kamarta,0
...,...,...,...
508,fulu.s.la,fulu,13
509,fufle,fufle,35
510,[---]aninal,,12
511,[----]aninalc,,12


In [7]:
def index_df_to_map(
        index_df: pd.DataFrame, 
        lang:str = "et", 
        name_to_index:bool=True
        ) -> Union[Dict[str, int], Dict[Tuple[str, str], int], Dict[int, List[str]], Dict[int, List[Tuple[str,str]]]]:
    """
    Convert the index dataframe to a dictionary for efficiency

    Args:
        index_df: index dataframe
        lang: language of the map (values in ["et", "eng", "bi"])
        name_to_index: if True the output maps a name to an index. Otherwise, it maps an index to a list of names
    Returns:
        Dictionary
    """
    if lang == "et":
        col = "Etruscan"
    elif lang == "eng":
        col = "Translations"
    elif lang == "bi":
        col = ["Etruscan", "Translations"]
    index_df = index_df.dropna()
    if name_to_index:
        if lang == "bi":
            # Dict: (et name, eng name): index            
            return dict(zip(list(index_df[col].itertuples(index=False, name=None)), index_df["Index"]))
        else:
            # Dict: name: index
            return dict(zip(index_df[col].to_list(), index_df["Index"]))
    else:
        d = defaultdict(list)
        if lang == "bi":
            for row in index_df.iloc:
                d[row["Index"]].append(tuple(row[col]))
        else:
            for row in index_df.iloc:
                d[row["Index"]].append(row[col])
        return d

## Replace Etruscan names

In [8]:
def mark_text_single(text: str, index_df: Union[pd.DataFrame, Dict[str, int]], fmt: str="§{}§") -> str:
    """
    Replace proper name with an index.

    Args:
        text: text to process
        index_df: index dataframe
        fmt: string to format the indexes (e.g., "§0§", "§10§")
    Return:
        Text with index instead of the proper names.
    """    
    if isinstance(index_df, pd.DataFrame):
        index_df = index_df_to_map(index_df, "et", True)

    for name, index in index_df.items():        
        r = re.compile(fr"\b{name}\b")
        text = r.sub(fmt.format(index), text)
    return text

In [9]:
def generate_single(
        text: str, 
        index_df: Union[pd.DataFrame, Dict[int, List[str]]],
        fmt: str="§{}§", 
        index_threshold:Tuple[int, int] = (8, 20),
        max_replacements:Tuple[int, int, int]=(3, 2, 1),
        rng: random.Random = None
        ) -> List[str]:        
    """
    Replace the index with all the compatible proper names.

    Args:
        text: text to process
        index_df: index dataframe
        etruscan: wheter it is an Etruscan text. English otherwise
        fmt: string to format the indexes (e.g., "§0§", "§10§")
        index_threshold: use a different number of replacements based on the number of indexes
        max_replacements: up to this number of replacements for each index
        rng: random number generator used to select the replacements
    Return:
        List of text with proper names instead of indexes.    

    Note: it might generate duplicated entries
    """    
    if rng is None:
        rng = random.Random(0)

    if isinstance(index_df, pd.DataFrame):
        index_df = index_df_to_map(index_df, "et", False)
    # RE to find the marks
    mark = re.compile(fmt.format(r"(?P<index>[0-9]+)"))
    
    # Results, list of strings
    out = []

    # Strore string that could have marks in it
    q = deque()
    q.append(text)

    n = len(mark.findall(text))
    # print(n)
    
    if n > index_threshold[1]:
        threshold = max_replacements[2]
    elif n > index_threshold[0]:
        threshold = max_replacements[1]
    else:
        threshold = max_replacements[0]

    while len(q) != 0:
        # Get next string
        t = q.popleft()        
        match_ = mark.search(t)
        
        if match_ is None: # All substitution are done                   
            out.append(t)
        else: # Still some mark to replace
            index = int(match_.group("index"))
            this_mark = re.compile(match_.group())  # e.g., §0§ instead of §.*§
            # candidates = index_df[index_df["Index"] == index][col].to_list()
            candidates = index_df[index]
            
            # Too many raplacements: select few
            if len(candidates) > threshold:                
                # candidates = rng.choice(candidates, threshold)
                candidates = rng.sample(candidates, k=threshold)

            for c in candidates:
                q.append(this_mark.sub(c, t, 1)) # Replace only the first match
    
    # Remove duplicates      
    return out

In [10]:
def generate_etruscan(
        docs: pd.DataFrame, 
        index_df: pd.DataFrame, 
        index_threshold:Tuple[int, int] = (8, 20),
        max_replacements:Tuple[int, int, int]=(3, 2, 1),
        seed:int=0
    ) -> Tuple[List[str], List[str]]:
    """
    Generate new Etruscan texts.

    Args:
        docs: dataframe with Etruscan texts. Column: "Etruscan"
        index_df: index dataframe
        index_threshold: use a different number of replacements based on the number of indexes
        max_replacements: up to this number of replacements for each index
    Return:
        Tuple with generated texts and marked texts
    """
    rng = random.Random(seed)
    out = [None] * len(docs) # List of lists

    name_to_index = index_df_to_map(index_df, "et", True)
    index_to_name = index_df_to_map(index_df, "et", False)

    # Add the indexes
    marked = docs["Etruscan"].progress_apply(lambda x: mark_text_single(x, name_to_index)).to_list()
    
    # mark = re.compile(r"§(?P<index>[0-9]+)§")
    
    # Replace the indexes
    for i, j in enumerate(tqdm(marked)): 
        out[i] = generate_single(j, index_to_name, index_threshold=index_threshold, max_replacements=max_replacements, rng=rng)

    # Flatten the output
    out = [j for i in out for j in i]
    out = list(set(out))
    return out, marked

In [11]:
gen, mark = generate_etruscan(docs, index_df)
print("Original texts:", len(docs))
print("Generated texts:", len(gen))

  r = re.compile(fr"\b{name}\b")
  r = re.compile(fr"\b{name}\b")
  r = re.compile(fr"\b{name}\b")
100%|██████████| 7139/7139 [00:03<00:00, 1932.75it/s]
100%|██████████| 7139/7139 [00:03<00:00, 2071.68it/s]

Original texts: 7139
Generated texts: 82013





In [12]:
# Use only CIEP
gen_ciep, _ = generate_etruscan(docs.dropna(subset="key"), index_df)
print("Original texts:", len(docs.dropna(subset="key")))
print("Generated texts:", len(gen_ciep))

# Use only ETP
gen_etp, masked = generate_etruscan(docs[docs["key"].isna()], index_df)
print("Original texts:", len(docs[docs["key"].isna()]))
print("Generated texts:", len(gen_etp))
print("Masked texts:", len(masked))

  r = re.compile(fr"\b{name}\b")
  r = re.compile(fr"\b{name}\b")
  r = re.compile(fr"\b{name}\b")
100%|██████████| 6578/6578 [00:03<00:00, 2047.70it/s]
100%|██████████| 6578/6578 [00:01<00:00, 6386.28it/s]


Original texts: 6578
Generated texts: 5884


100%|██████████| 561/561 [00:00<00:00, 1561.36it/s]
100%|██████████| 561/561 [00:02<00:00, 229.90it/s] 

Original texts: 561
Generated texts: 76144
Masked texts: 561





In [13]:
import gc
gc.collect()

11

## Replace Etruscan and English names

In [129]:
def mark_text_paired(
        pair: Tuple[str, str],
        pair_index: Optional[Dict[str,int]] = None,
        fmt: str="§{}§") -> str:
    """
    Replace proper name with an index.

    Args:
        text: text to process
        pair_index_df: index dataframe
        fmt: string to format the indexes (e.g., "§0§", "§10§")
    Return:
        Text with index instead of the proper names.
    """    
    if isinstance(pair_index, pd.DataFrame):
        pair_index = index_df_to_map(pair_index, "bi", name_to_index=True)

    for (name_et, name_eng), index in pair_index.items():
        if name_et is not np.nan and name_eng is not np.nan and len(name_et) != 0 and len(name_eng) != 0:            
            r_et = re.compile(fr"\b{name_et}\b")
            r_eng = re.compile(fr"\b{name_eng}\b")

            if len(r_et.findall(pair[0])) == len(r_eng.findall(pair[1])) != 0:                               
                pair = (
                    r_et.sub(fmt.format(index), pair[0]),
                    r_eng.sub(fmt.format(index), pair[1])
                )
    return pair

In [130]:
def generate_paired(
        pair: Tuple[str, str],         
        pair_index: Union[pd.DataFrame,Dict[str,int]],
        fmt: str="§{}§",
        index_threshold:Tuple[int, int] = (8, 20),
        max_replacements:Tuple[int, int, int]=(3, 2, 1),
        rng: random.Random = None
        ) -> List[str]:        
    """
    Replace the index with all the compatible proper names.

    Args:
        text: text to process
        index_df: index dataframe
        etruscan: wheter it is an Etruscan text. English otherwise
        fmt: string to format the indexes (e.g., "§0§", "§10§")
        index_threshold: use a different number of replacements based on the number of indexes
        max_replacements: up to this number of replacements for each index
        rng: random number generator used to select the replacements
    Return:
        List of text with proper names instead of indexes.    

    Note: it might generate duplicated entries
    """    
    if rng is None:
        rng = np.random.RandomState(0)

    if isinstance(pair_index, pd.DataFrame):
        pair_index = index_df_to_map(pair_index, "bi", False)
    # RE to find the marks
    mark = re.compile(fmt.format(r"(?P<index>[0-9]+)"))
    
    # Results, list of strings
    out = []

    # Strore string that could have marks in it
    q = deque()
    q.append(pair)

    n = len(mark.findall(pair[0]))
    # print(n)
    
    if n > index_threshold[1]:
        threshold = max_replacements[2]
    elif n > index_threshold[0]:
        threshold = max_replacements[1]
    else:
        threshold = max_replacements[0]

    while len(q) != 0:
        # Get next string
        t = q.popleft()        
        match_et = mark.search(t[0])
        # match_eng = mark.search(t[1])
        
        if match_et is None: # All substitution are done                   
            out.append(t)
        else: # Still some mark to replace
            index = int(match_et.group("index"))
            this_mark = re.compile(match_et.group())  # e.g., §0§ instead of §.*§
            # candidates = index_df[index_df["Index"] == index][col].to_list()
            candidates = pair_index[index]
            
            # Too many raplacements: select few
            if len(candidates) > threshold:                
                # candidates = rng.choice(candidates, threshold, replace=False)
                candidates = rng.sample(candidates, k=threshold)

            for c in candidates:
                q.append((
                    this_mark.sub(c[0], t[0], 1),
                    this_mark.sub(c[1], t[1], 1)
                )) # Replace only the first match
    
    # Remove duplicates      
    return out

In [131]:
def generate_translations(
        docs: pd.DataFrame, 
        index_df: pd.DataFrame, 
        index_threshold:Tuple[int, int] = (8, 20),
        max_replacements:Tuple[int, int, int]=(3, 2, 1),
        seed:int=0
    ) -> Tuple[List[str], List[str]]:
    """
    Generate new Etruscan texts.

    Args:
        docs: dataframe with Etruscan texts. Column: "Etruscan", "Translations"
        index_df: index dataframe
        index_threshold: use a different number of replacements based on the number of indexes
        max_replacements: up to this number of replacements for each index
    Return:
        Tuple with generated texts and marked texts
    """
    docs = docs.dropna(subset=["Etruscan", "Translation"])
    rng = random.Random(seed)
    out = [None] * len(docs) # List of lists

    name_to_index = index_df_to_map(index_df, "bi", True)
    index_to_name = index_df_to_map(index_df, "bi", False)

    # Add the indexes
    marked = docs.progress_apply(lambda x: mark_text_paired((x["Etruscan"], x["Translation"]), name_to_index),axis=1).to_list()
    
    # mark = re.compile(r"§(?P<index>[0-9]+)§")
    
    # Replace the indexes
    for i, j in enumerate(tqdm(marked)):         
        out[i] = generate_paired(j, index_to_name, index_threshold=index_threshold, max_replacements=max_replacements, rng=rng)

    # Flatten the output
    out = [j for i in out for j in i]
    out = list(set(out))
    return out, marked

In [135]:
gen, mark = generate_translations(docs, index_df)
print("Original texts:", len(docs))
print("Generated texts:", len(gen))

  r_et = re.compile(fr"\b{name_et}\b")
100%|██████████| 2891/2891 [00:56<00:00, 51.16it/s]
100%|██████████| 2891/2891 [00:00<00:00, 38068.75it/s]

Original texts: 7139
Generated texts: 14022





In [136]:
# Use only CIEP
gen_ciep, _ = generate_translations(docs.dropna(subset="key"), index_df)
print("Original texts:", len(docs.dropna(subset="key")))
print("Generated texts:", len(gen_ciep))

# Use only ETP
gen_etp, masked = generate_translations(docs[docs["key"].isna()], index_df)
print("Original texts:", len(docs[docs["key"].isna()]))
print("Generated texts:", len(gen_etp))
print("Masked texts:", len(masked))

  r_et = re.compile(fr"\b{name_et}\b")
100%|██████████| 2652/2652 [00:50<00:00, 52.32it/s]
100%|██████████| 2652/2652 [00:00<00:00, 190646.91it/s]


Original texts: 6578
Generated texts: 2382


  r_et = re.compile(fr"\b{name_et}\b")
100%|██████████| 239/239 [00:04<00:00, 50.95it/s]
100%|██████████| 239/239 [00:00<00:00, 3298.95it/s]

Original texts: 561
Generated texts: 11641
Masked texts: 239





In [137]:
import gc
gc.collect()

0

## To POS

In [8]:
def only_alpha(t):
    return re.sub(r"[^a-zA-Z ]", "", t)

def make_pos_train_set(vocab):
    words = vocab["Etruscan"].apply(only_alpha)
    tags = vocab["TAG"]
    return [[(i,j)] for i,j in zip(words, tags)]

def simple_tokenizer(x):
    tmp = [i.strip() for i in re.split(r"[:• ]", x.lower())]
    return [i for i in tmp if len(i) != 0]

In [42]:
def count_none(tagger, docs):
    toks = docs["Etruscan"].apply(simple_tokenizer)
    tot = 0
    none = 0
    for i in toks:
        tags = tagger.tag(i)
        tot += len(tags)
        for j in tags:
            if j[1] is None or j[1] is np.nan:
                none += 1
    return none, tot

def tag(tagger, docs):
    toks = docs["Etruscan"].apply(simple_tokenizer)
    return [tagger.tag(i) for i in toks]    
    

In [38]:
tagger = nltk.tag.UnigramTagger(make_pos_train_set(vocab), verbose=True) # Use the entire vocab, just for testing
tagger.tag(simple_tokenizer(docs.iloc[0]["Etruscan"]))

[Trained Unigram tagger: size=959, backoff=6.77%, pruning=0.00%]


[('cleusinas', 'NOUN'),
 ('laris', 'NOUN'),
 ('larisal', 'NOUN'),
 ('clan', 'NOUN')]

In [40]:
none, tot = count_none(tagger, docs)
print("Tot tokens:", tot)
print("None tags:", none)
print("None ratio:", none/tot)

Tot tokens: 10340
None tags: 8132
None ratio: 0.7864603481624758


In [41]:
print("--ETP--")
none, tot = count_none(tagger, docs[docs["key"].isna()])
print("Tot tokens:", tot)
print("None tags:", none)
print("None ratio:", none/tot)

print("--CIEP--")
none, tot = count_none(tagger, docs.dropna(subset="key"))
print("Tot tokens:", tot)
print("None tags:", none)
print("None ratio:", none/tot)

--ETP--
Tot tokens: 2546
None tags: 1039
None ratio: 0.40809112333071484
--CIEP--
Tot tokens: 7794
None tags: 7093
None ratio: 0.9100590197587888


## Test other taggers

### Brill: no

In [81]:
nltk.tbl.Template._cleartemplates()
templates = [nltk.tbl.Template(nltk.tag.brill.Pos([-1])), nltk.tbl.Template(nltk.tag.brill.Pos([-1]), nltk.tag.brill.Word([0]))]
trainer = nltk.tag.BrillTaggerTrainer(nltk.tag.RegexpTagger([]), templates, trace=3) # As doc says, UnigramTagger does not work
brill_tagger = trainer.train(tag(tagger, docs), min_score=0, max_rules=20)


TBL train (fast) (seqs: 7139; tokens: 10340; tpls: 2; min score: 0; min acc: None)
Finding initial useful rules...
    Found 924 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
  24  24   0   0  | None->NOUN if Pos:None@[-1] & Word:a@[0]
  20  20   0   0  | None->NOUN if Pos:None@[-1] & Word:clan@[0]
  20  20   0   0  | None->NOUN if Pos:None@[-1] & Word:v@[0]
  20  20   0   0  | None->PRON if Pos:None@[-1] & Word:mi@[0]
  20  20   0   0  | None->VERB if Pos:None@[-1] & Word:turce@[0]
  14  14   0   0  | None->NOUN if Pos:None@[-1] & Word:arnthial@[0]
  14  14   0   0  | None->NOUN if Pos:None@[-1] & Word:avils@[0]
  14  14   0   0  | 

In [82]:
brill_tagger.print_template_statistics()

TEMPLATE STATISTICS (TRAIN)  2 templates, 20 rules)
TRAIN (  10340 tokens) initial  2585 0.7500 final:  2311 0.7765
#ID | Score (train) |  #Rules     | Template
--------------------------------------------
001 |   249   0.909 |  18   0.900 | Template(Pos([-1]),Word([0]))
000 |    25   0.091 |   2   0.100 | Template(Pos([-1]))

UNUSED TEMPLATES (0)



In [83]:
brill_tagger.tag(simple_tokenizer(docs["Etruscan"].iloc[0]))

[('cleusinas', None), ('laris', 'NOUN'), ('larisal', None), ('clan', 'NOUN')]

### Bigram

In [88]:
bigram_tagger = nltk.tag.BigramTagger(tag(tagger, docs), verbose=True)
bigram_tagger.tag(simple_tokenizer(docs.iloc[0]["Etruscan"]))

[Trained Unigram tagger: size=7237, backoff=0.00%, pruning=0.00%]


[('cleusinas', 'NOUN'),
 ('laris', 'NOUN'),
 ('larisal', 'NOUN'),
 ('clan', 'NOUN')]

### Affix

In [94]:
affix_tagger = nltk.tag.AffixTagger(tag(tagger, docs), verbose=True, backoff=tagger, min_stem_length=1)
affix_tagger.tag(simple_tokenizer(docs.iloc[0]["Etruscan"]))

[Trained Unigram tagger: size=60, backoff=93.30%, pruning=95.75%]


[('cleusinas', 'NOUN'),
 ('laris', 'NOUN'),
 ('larisal', 'NOUN'),
 ('clan', 'NOUN')]

### Trigram

In [95]:
trigram_tagger = nltk.tag.TrigramTagger(tag(tagger, docs), verbose=True)
trigram_tagger.tag(simple_tokenizer(docs.iloc[0]["Etruscan"]))

[Trained Unigram tagger: size=7497, backoff=0.00%, pruning=0.00%]


[('cleusinas', 'NOUN'),
 ('laris', 'NOUN'),
 ('larisal', 'NOUN'),
 ('clan', 'NOUN')]

## To Category

In [32]:
def get_categories(multi_case=True):
    cols = utils.tags.copy()
    if not multi_case:
        cases = ["gen", "abl", "pert"]
        for i in ["1st", "2nd"]:
            for j in cases:
                cols.remove(f"{i} {j}")
        cols += cases    
    return cols

def make_category_train_set(vocab, multi_case=True):
    cols = get_categories(multi_case)
    words = vocab["Etruscan"].apply(only_alpha)
    tags = vocab[cols].itertuples(index=False, name=None)
    return [[(i,j)] for i,j in zip(words, tags)]

def category_description(cat: Tuple, to_dict:bool=False) -> Union[Dict[str, Union[float, bool]], str]:
    if len(cat) == 54:
        names = get_categories(multi_case=True)
    elif len(cat) == 51:
        names = get_categories(multi_case=False)
    else:
        raise Exception("Invalid category length")
    
    d = dict(zip(names, cat))
    if to_dict:
        return d
    else:
        lines = [f"{i}: {j}" for i,j in d.items()]
        return "\n".join(lines)

In [33]:
cat_tagger = nltk.tag.UnigramTagger(make_category_train_set(vocab), verbose=True) # Use the entire vocab, just for testing
cat_tagger.tag(simple_tokenizer(docs.iloc[0]["Etruscan"]))

[Trained Unigram tagger: size=959, backoff=14.53%, pruning=0.00%]


[('cleusinas',
  (nan,
   nan,
   nan,
   nan,
   nan,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   1.0,
   0.0,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   nan,
   nan,
   nan,
   nan)),
 ('laris',
  (nan,
   nan,
   nan,
   nan,
   nan,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   nan,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   nan,
   nan,
   nan,
   nan)),
 ('larisal',
  (nan,
   nan,
   nan,
   nan,
   nan,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.