# ETP - Part of Speech

In [1]:
import numpy as np
import pandas as pd
import nltk
from typing import Tuple, List
from tqdm import tqdm
import re

In [2]:
names = "ETPNames.txt"
words = "ETPWords.txt"

In [3]:
# Additional properties
# !: inferred
# -: suffix
# +: suffix attached
# abbrev. of: abbreviation

In [4]:
replacements = {    
    "s'": "ś",
    "s*": "σ",    
    "s^x" : "š",
    "0": "θ",
    "‘": "'",
    "’": "'",
    "ç": "ς",    
    "x*": "χ",
    "f*": "φ",
    #"s^+": 
    "e'": "ê",
    "|" : " | ",
    # ":" : " : ",
    "  ": " "
}
def preprocess(l):
    for old, new in replacements.items():
        l = l.replace(old, new)
    return l

## Parse the files

In [5]:
def get_lines(path):
    with open(path) as f:
        lines = f.readlines()
    return [preprocess(i.lower().strip()) for i in lines]

In [6]:
name_lines = get_lines(names)
word_lines = get_lines(words)

In [7]:
def process_line(l: str) -> Tuple[str, List[str], str, bool, bool, bool]:
    """
    Args:
        l: line from the data file (assume no white spaces)
    Returns:
        Tuple with Etruscan text, list of translations, pos tags (str), flag for inferred word, tag for suffix, tag for abbreviation
    """
    optional = re.compile("\([a-zA-Z]+\)")

    abbreviation = l.startswith("abbrev. of")
    inferred = l.startswith("!")
    suffix = l.startswith("+") or l.startswith("-")
    etruscan = ""
    translations = []
    pos = ""
    if abbreviation: 
        etruscan = l.split("abbrev. of")[1].strip()
    else:
        if inferred or l.startswith("+"):        
            l = l.split(" ", 1)[1]
        if l.startswith("-"):
            l = l[1:]
        # Process word and translation
        l = l.split("\t")
        l = [i.strip() for i in l if i.strip() != ""] # Len 2: 0 -> word and translation, 1 -> pos
        tmp = l[0].split(" ", 1)
        
        etruscan = tmp[0]
        if len(tmp) >= 2:
            translations = tmp[1].split(",")

        # Process translations
        tmp = []
        for i in translations:
            # Remove unknown translations
            if i != "?": 
                sure = "(?)" not in i
                i = i.replace("(?)", "").strip()
                if optional.search(i) is not None:
                    tmp.append((sure, optional.sub(i, "").strip()))
                    tmp.append((sure, i.replace("(", "").replace(")", "").strip()))
                else:
                    tmp.append((sure, i))
        translations = tmp       
        
        # Process pos
        if len(l) >= 2:
            pos = l[1]
    return (etruscan, tuple(translations), pos, inferred, suffix, abbreviation)

In [8]:
def process_lines(lines: List[str]) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Args:
        lines: list of lines to parse
    Returns:
        DataFrame with the words. 
        The columns are "Etruscan", "Translations", "POS", "Is inferred", "Is suffix", "Is abbreviation", "Suffix indexes"
    """
    suffixes = []
    words = []
    tot = len(lines)
    for i in tqdm(range(tot), total=tot):
        l = lines[i]
        if not l.startswith("+"):
            suff = []
            abbr = []
            j = i + 1
            # Find suffixes and abbreviations
            while j < tot and (lines[j].startswith("+") or lines[j].startswith("-") or lines[j].startswith("abbrev.")):
                if lines[j].startswith("+") or lines[j].startswith("-"):
                    suff.append(lines[j])                
                else:
                    abbr.append(lines[j])
                j += 1
            
            # Process suffixes
            # Add extra column for suff. indexes (not used -> None)
            # suff = [(*process_line(i), None) for i in suff]        
            suff = [(*process_line(i)[:-1], None, None) for i in suff]

            # Process abbreviations     
            # Take only the Etruscan text
            abbr = [process_line(i) for i in abbr]
            if len(abbr) > 1:
                print(f"Multiple abbreviation found for {i}:{l}")
            if len(abbr) == 0:
                abbr = None
            else:
                abbr = abbr[0][0] # Etruscan        
            

            # Manage suffix indexes 
            suff_indexes = list(range(len(suffixes), len(suffixes) + len(suff)))
            # suff_indexes = len(suffixes) # Only 1 suffix <- THIS IS WRONG
            # if len(suff) > 1:
                # print(f"Multiple suffixes found for {i}:{l}")
            if len(suff) == 0:
                suff_indexes = None  
            else:          
                suff_indexes = tuple(suff_indexes)

            try:            
                et, tr, pos, inf, s, _ = process_line(l)
            except:
                print(f"Generic error in {i}:{l}")
            # Make new items: Etruscan, translations, pos, suffix indexes, inferred, abbreviation
            l = (et, tr, pos, s, inf, abbr, suff_indexes) # Include the abbreviation in the list
            
            # Add to list
            suffixes.extend(suff)
            words.append(l)
    
    # Make dataframes
    # tmp = suffixes + words
    columns=["Etruscan", "Translations", "POS", "Is inferred", "Is suffix", "Abbreviation of", "Suffix indexes"]
    df_words = pd.DataFrame.from_records(words, columns=columns)
    df_suff = pd.DataFrame.from_records(suffixes, columns=columns)
    return df_words, df_suff

In [9]:
def merge(df_words_1: pd.DataFrame, df_words_2: pd.DataFrame, df_suff_1: pd.DataFrame, df_suff_2: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Merge the dataframes of actual words and the dataframes of suffixes

    Args:
        df_words_1: first dataFrame
        df_words_2: seconds dataFrame
        df_suff_1: suffixes for the first dataFrame
        df_suff_2: suffixes for the second dataFrame 
    """

    # Shift the index of the 
    offset = len(df_suff_1)
    shift = lambda t: tuple(i + offset for i in t) if t is not None else None
    df_words_2["Suffix indexes"] = df_words_2["Suffix indexes"].map(shift)
    
    df_words = pd.concat([df_words_1, df_words_2], ignore_index=True)
    df_suff = pd.concat([df_suff_1, df_suff_2], ignore_index=True)
    return df_words, df_suff

def remove_duplicated(df_words: pd.DataFrame, df_suff: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Remove duplicate words and suffixes. Map words to the right suffix

    Args:
        df_words: DataFrame with words
        df_suff: DataFrame with suffixes
    Return:
        df_words, df_suff
    """
    unique = list({tuple(i) for i in df_suff.iloc})
    
    # Remap the indexes
    def remap(suff_indexes):
        if suff_indexes is not None:
            new_indexes = []
            for index in suff_indexes:
                suff = tuple(df_suff.iloc[index])
                new_indexes.append(unique.index(suff))
            return tuple(new_indexes)
        return None
    df_words["Suffix indexes"] = df_words["Suffix indexes"].map(remap)
    df_words = df_words[~df_words.duplicated()]
    # Recreate the suffix dataframe
    df_suff = pd.DataFrame.from_records(unique, columns=df_suff.columns)
    
    return df_words, df_suff

def concat_words_suffixes(df_words: pd.DataFrame, df_suff: pd.DataFrame) -> pd.DataFrame:
    """
    Combine the suffix and the word dataframe
    """
    return pd.concat([df_suff, df_words], ignore_index=True)

In [10]:
df_names, df_name_suff = process_lines(name_lines)
df_words, df_word_suff = process_lines(word_lines)

df_all_words, df_all_suff = merge(df_names, df_words, df_name_suff, df_word_suff)
df_all_words, df_all_suff = remove_duplicated(df_all_words, df_all_suff)
df = concat_words_suffixes(df_all_words, df_all_suff)

df

100%|██████████| 646/646 [00:00<00:00, 92091.65it/s]
100%|██████████| 551/551 [00:00<00:00, 89243.96it/s]


Unnamed: 0,Etruscan,Translations,POS,Is inferred,Is suffix,Abbreviation of,Suffix indexes
0,isa,"((True, the),)",def art nom,False,True,,
1,iσ'a,"((True, the),)",def art nom,False,True,,
2,x,"((True, and),)",enclitic conj,False,True,,
3,iσ'la,"((True, the),)",def art 2nd gen,False,True,,
4,cle,"((True, the),)",dem pro loc,False,True,,
...,...,...,...,...,...,...,...
1117,flereσ,"((True, divine spirit), (True, divinity))",1st gen,False,False,,
1118,flerθrce,(),past act,False,False,,
1119,fratuce,"((False, incised),)",past act,False,False,,
1120,frontac,"((True, of lightning),)",nom acc,False,False,,


## Find possible POS tags

In [11]:
# "dubious" is used for those categories that are unclear: e.g., nom (?)
# When used as bool, "dubious" is True

In [12]:
pos = set(df["POS"])

In [13]:
all_tags = [i.split(" ") for i in pos]
all_tags = {i.strip() for j in all_tags for i in j}
# "" and (?) -> empty and not sure
# Double check for typos
# Typos: abbrev.
print(sorted(all_tags))

['', '(?)', '1st', '2nd', '3rd', 'abl', 'acc', 'act', 'adv', 'anim', 'art', 'as-part', 'city', 'cogn', 'conj', 'def', 'deictic', 'dem', 'enclitic', 'epithet', 'fem', 'gen', 'impv', 'inanim', 'indef', 'jussive', 'loc', 'masc', 'name', 'nas-part', 'nasa-part', 'necess', 'neg', 'nom', 'nomen', 'non-past', 'num', 'particle', 'pass', 'past', 'pers', 'pert', 'pl', 'place', 'post', 'prae', 'pro', 'rel', 'subord', 'theo', 'u-part', 'θ-impv', 'θ-part', 'θas-part']


In [14]:
# None if not specified
# True if specified
# 1/2/3 if 1st/2nd/3rd
# False: if surely false
tags = [
    "city name",
    "place name",
    "name", # Unspecified (?) name
    
    "epithet",
    'theo', # Theonomin
    "cogn", # Cognomen
    'prae', # Praenomen
    "nomen", # Nomen

    "nom", # Nominative
    "acc", # Accusative
    
    "masc", # Masculine
    "fem", # Feminine  

    "nas-part", 
    "nasa-part", 
    'u-part', 
    'θ-impv', # θ-Imperative
    'θ-part', 
    'θas-part',
    "as-part",

    "act", # Active
    "pass", # Passive
    
    "non-past",
    "past", # Past
    
    "impv", # Imperative
    "jussive",
    "necess",
    
    "inanim", # Inanimate
    "anim", # Animate
    
    "indef", # Indefinite (pronoun)
    "def", # Definite (article)
    
    "deictic particle",
    "enclitic particle", 
    "enclitic conj",    

    "dem", # Demonstrative
    "adv", # Adverb    
    "art", # Article
    "conj", # Conjunction
    'post', # Post-position
    'pro', # Pronoun
    'rel', # Relative
    'subord', # Subordinator 
    "neg",
    "num", # Numeral
    # "particle", 

    "1st gen", # Genitive (1st/2nd)    
    "2nd gen", 
    "1st abl", # Ablative (1st/2nd)
    "2nd abl", 
    "loc", # Locative
    "1st pert", # Pertinentive
    "2nd pert",

    "1st pers", # 1st/2nd/3rd Person/Personal
    "2nd pers",
    "3rd pers",

    "pl", # Plural
]

incompatible = [ # Only one can be true
    ["city name", "place name", "name",],
    ["masc", "fem"],
    ["act", "pass"],
    ["past", "non-past"],
    ["cogn", 'prae', "nomen"],
    ["anim", "inanim"],
    ["def", "indef"], 
    ["1st gen", "2nd gen", "1st abl", "2nd abl", "loc", "1st pert", "2nd pert", "acc"], # Nom and acc can be toghether
    ["1st gen", "2nd gen", "1st abl", "2nd abl", "loc", "1st pert", "2nd pert", "nom"],
    ["as-part", "nas-part", "nasa-part", 'u-part', 'θ-impv', 'θ-part', 'θas-part']
]

In [15]:
def extract_pos(df: pd.DataFrame) -> pd.DataFrame:   
    pos = df["POS"].copy()

    # Add columns
    for tag in tags:
        dubious_tag = f"{tag} (?)"
        
        # Find dubious tags
        tmp1 = pos.map(lambda x: "dubious" if dubious_tag in x else None)
        
        # Remove the tags
        pos = pos.map(lambda x: x.replace(dubious_tag, ""))

        # Find tags
        tmp2 = pos.map(lambda x: True if tag in x else None)

        # Remove the tags
        pos = pos.map(lambda x: x.replace(tag, ""))
          
        tmp1[tmp1.isna()] = tmp2

        df[tag] = tmp1 #.astype(bool_cat)     
            
    # Check incompatible tags
    for i in incompatible:
        for j in i: # Fix j
            for k in i: # Compare with k
                if j != k:
                    df[k][df[j].astype(bool)] = False

    # Additional tags: gen, abl, pert
    additional = ["gen", "abl", "pert"]
    for i in additional:
        df[i] = df[f"1st {i}"].astype(bool) | df[f"2nd {i}"].astype(bool)
    return df


In [16]:
df = extract_pos(df)
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[k][df[j].astype(bool)] = False


Unnamed: 0,Etruscan,Translations,POS,Is inferred,Is suffix,Abbreviation of,Suffix indexes,city name,place name,name,...,loc,1st pert,2nd pert,1st pers,2nd pers,3rd pers,pl,gen,abl,pert
0,isa,"((True, the),)",def art nom,False,True,,,,,,...,False,False,False,,,,,False,False,False
1,iσ'a,"((True, the),)",def art nom,False,True,,,,,,...,False,False,False,,,,,False,False,False
2,x,"((True, and),)",enclitic conj,False,True,,,,,,...,,,,,,,,False,False,False
3,iσ'la,"((True, the),)",def art 2nd gen,False,True,,,,,,...,False,False,False,,,,,True,False,False
4,cle,"((True, the),)",dem pro loc,False,True,,,,,,...,True,False,False,,,,,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,flereσ,"((True, divine spirit), (True, divinity))",1st gen,False,False,,,,,,...,False,False,False,,,,,True,False,False
1118,flerθrce,(),past act,False,False,,,,,,...,,,,,,,,False,False,False
1119,fratuce,"((False, incised),)",past act,False,False,,,,,,...,,,,,,,,False,False,False
1120,frontac,"((True, of lightning),)",nom acc,False,False,,,,,,...,False,False,False,,,,,False,False,False


In [17]:
df.to_csv("ETP_POS.csv", columns=df.columns)

## Map to universal tags

**NOT USED**

From: [https://www.nltk.org/api/nltk.tag.mapping.html](https://www.nltk.org/api/nltk.tag.mapping.html)

Tags from Petrov, Das, & McDonald.

Tags
* VERB: verbs 
* NOUN: nouns
* PRON: pronouns
* ADJ: adjectives 
* ADV: adverbs 
* ADP: adpositions (prepositions and postpositions) 
* CONJ: conjunctions 
* DET: determiners 
* NUM: cardinal numbers 
* PRT: particles or other function words 
* X: other

In [18]:
# verbs = [
#     "act",
#     "pass",
#     "past",
#     "non-past",
#     "impv",
#     "jussive",
#     "necess",
#     "as-part",
#     "nas-part", 
#     "nasa-part", 
#     "u-part", 
#     'θ-impv', 
#     "θ-part", 
#     "θas-part", 
# ]

# nouns = [
#     "city name",
#     "place name",
#     "name",    
#     "masc",
#     "fem",
# ]

# pronouns = [
#     "indef", # 1 case: indef. pronoun
#     "1st pers",
#     "2nd pers",
#     "3rd pers",
#     "dem",
#     "pro",
#     "rel",
# ]

# adjectives = []

# adverbs = [
#     "adv",
# ]

# tags = [
    
#     # NOUN if not another tag
#     "nom", 
#     "acc", 
    
#     "epithet": "NOUN",
#     "theo": "NOUN",
#     "cogn": "NOUN",
#     "prae": "NOUN",
#     "nomen": "NOUN",

#     "anim",
#     "inanim", # Inanimate
    
#     "def": "DET", # Always article
    

#     "deictic particle": "PRT",
#     "enclitic particle": "PRT", 
#     "enclitic conj": "CONJ",    


#     "art": "DET", 
#     "conj": "CONJ",
#     "post": "ADP", 
    
#     "subord": "CONJ",
#     "neg": "PRT",
#     "num": "NUM",
#     # "particle", 

#     # ADJ/NOUN if not another tag
#     "1st gen", 
#     "2nd gen", 
#     "1st abl", 
#     "2nd abl", 
#     "loc", 
#     "1st pert",
#     "2nd pert",

#     # NOUN if not another tag
#     "pl", # NOUN or PRON    
# }

# Fix CSV & Add POS tags

In [19]:
import pandas as pd
import numpy as np
from ast import literal_eval

In [20]:
xlsx = "TAGS.xlsx"  # it has a columns with pos tags
csv =  "ETP_POS.csv"

In [21]:
df_xlsx = pd.read_excel(xlsx)
df_xlsx["TAG"].replace({" NOUN":"NOUN", 0: np.nan}, inplace=True)
df_xlsx

Unnamed: 0,Column1,Etruscan,Translations,POS,TAG,Is inferred,Is suffix,Abbreviation of,Suffix indexes,city name,...,loc,1st pert,2nd pert,1st pers,2nd pers,3rd pers,pl,gen,abl,pert
0,0.0,ri,"((True, 'on behalf of'),)",post,ADP,0.0,1.0,,,,...,,,,,,,,0.0,0.0,0.0
1,1.0,ca,"((True, 'the'),)",dem pro nom acc,PRON,0.0,1.0,,,,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
2,2.0,pi,(),enclitic particle,PRT,0.0,1.0,,,,...,,,,,,,,0.0,0.0,0.0
3,3.0,ς'va,"((True, 'the'),)",def art pl nom,DET,0.0,1.0,,,,...,0.0,0.0,0.0,,,,1.0,0.0,0.0,0.0
4,4.0,ti,"((True, 'in'),)",post,ADP,0.0,1.0,,,,...,,,,,,,,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1118,1118.0,flerθrce,(),past act,VERB,0.0,0.0,,,,...,,,,,,,,0.0,0.0,0.0
1119,1119.0,fratuce,"((False, 'incised'),)",past act,VERB,0.0,0.0,,,,...,,,,,,,,0.0,0.0,0.0
1120,1120.0,frontac,"((True, 'of lightning'),)",nom acc,ADJ,0.0,0.0,,,,...,0.0,0.0,0.0,,,,,0.0,0.0,0.0
1121,1121.0,[---]e,(),,,0.0,0.0,,,,...,,,,,,,,0.0,0.0,0.0


In [22]:
set(df_xlsx["TAG"].to_list())

{'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', nan}

In [23]:
df_csv = pd.read_csv(
    "ETP_POS.csv",
    converters={"Translations": literal_eval},
    true_values=["True", "TRUE"],
    false_values=["False", "FALSE"],
    index_col=0
    )
df_csv

Unnamed: 0,Etruscan,Translations,POS,Is inferred,Is suffix,Abbreviation of,Suffix indexes,city name,place name,name,...,loc,1st pert,2nd pert,1st pers,2nd pers,3rd pers,pl,gen,abl,pert
0,isa,"((True, the),)",def art nom,False,True,,,,,,...,False,False,False,,,,,False,False,False
1,iσ'a,"((True, the),)",def art nom,False,True,,,,,,...,False,False,False,,,,,False,False,False
2,x,"((True, and),)",enclitic conj,False,True,,,,,,...,,,,,,,,False,False,False
3,iσ'la,"((True, the),)",def art 2nd gen,False,True,,,,,,...,False,False,False,,,,,True,False,False
4,cle,"((True, the),)",dem pro loc,False,True,,,,,,...,True,False,False,,,,,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,flereσ,"((True, divine spirit), (True, divinity))",1st gen,False,False,,,,,,...,False,False,False,,,,,True,False,False
1118,flerθrce,(),past act,False,False,,,,,,...,,,,,,,,False,False,False
1119,fratuce,"((False, incised),)",past act,False,False,,,,,,...,,,,,,,,False,False,False
1120,frontac,"((True, of lightning),)",nom acc,False,False,,,,,,...,False,False,False,,,,,False,False,False


In [24]:
# BUG: they are not in the same order for some reason
# df_csv["TAG"] = df_xlsx["TAG"]

In [None]:
# Slow inefficient O(n^2) merge
tags = []
for row_csv in df_csv.iloc:
    missing = True
    for row_xlsx in df_xlsx.iloc:
        if row_xlsx["Etruscan"].strip() == row_csv["Etruscan"].strip():
            tags.append(row_xlsx["TAG"])
            missing = False
            break    
    if missing:
        tags.append(None)
df_csv["TAG"] = tags

In [25]:
df_csv

Unnamed: 0,Etruscan,Translations,POS,Is inferred,Is suffix,Abbreviation of,Suffix indexes,city name,place name,name,...,1st pert,2nd pert,1st pers,2nd pers,3rd pers,pl,gen,abl,pert,TAG
0,isa,"((True, the),)",def art nom,False,True,,,,,,...,False,False,,,,,False,False,False,ADP
1,iσ'a,"((True, the),)",def art nom,False,True,,,,,,...,False,False,,,,,False,False,False,PRON
2,x,"((True, and),)",enclitic conj,False,True,,,,,,...,,,,,,,False,False,False,PRT
3,iσ'la,"((True, the),)",def art 2nd gen,False,True,,,,,,...,False,False,,,,,True,False,False,DET
4,cle,"((True, the),)",dem pro loc,False,True,,,,,,...,False,False,,,,,False,False,False,ADP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1117,flereσ,"((True, divine spirit), (True, divinity))",1st gen,False,False,,,,,,...,False,False,,,,,True,False,False,NOUN
1118,flerθrce,(),past act,False,False,,,,,,...,,,,,,,False,False,False,VERB
1119,fratuce,"((False, incised),)",past act,False,False,,,,,,...,,,,,,,False,False,False,VERB
1120,frontac,"((True, of lightning),)",nom acc,False,False,,,,,,...,False,False,,,,,False,False,False,ADJ


In [None]:
# Added later: use 0-0.5-1 instead of bools
columns = ['city name', 'place name', 'name', 'epithet', 'theo', 'cogn', 'prae', 'nomen', 'nom', 'acc', 'masc', 'fem',
        'nas-part', 'nasa-part', 'u-part', 'θ-impv', 'θ-part', 'θas-part',
        'as-part', 'act', 'pass', 'non-past', 'past', 'impv', 'jussive',
        'necess', 'inanim', 'anim', 'indef', 'def', 'deictic particle',
        'enclitic particle', 'enclitic conj', 'dem', 'adv', 'art', 'conj',
        'post', 'pro', 'rel', 'subord', 'neg', 'num', '1st gen', '2nd gen',
        '1st abl', '2nd abl', 'loc', '1st pert', '2nd pert', '1st pers',
        '2nd pers', '3rd pers', 'pl', 'gen', 'abl', 'pert']
df_csv[columns] = df_csv[columns].replace({"dubious": 0.5, True: 1, False: 0, "True": 1, "False": 0})

In [26]:
df_csv.to_csv("ETP_POS.csv", columns=df_csv.columns)