In [3]:
# Load and test LatinCy
import spacy
# Load the Latin pipeline once; the resulting `nlp` object is reused below
# whenever a sentence has to be POS-tagged.
nlp = spacy.load("la_core_web_lg")

# Short sample line used as a smoke test for the pipeline
text = "arma virumque cano Troiae qui primus ab urbis"

# Run the full pipeline on the sample
doc = nlp(text)

# One row per token: surface form, coarse POS tag, fine-grained tag,
# each left-aligned in a fixed-width column
for token in doc:
    print(f"{token.text:15}", f"{token.pos_:10}", f"{token.tag_:10}")


arma            NOUN       noun      
virum           NOUN       noun      
que             CCONJ      conjunction
cano            VERB       verb      
Troiae          PROPN      proper_noun
qui             PRON       pronoun   
primus          ADJ        adjective 
ab              ADP        preposition
urbis           NOUN       noun      


In [9]:
# This script performs NER using manually curated lists of names and extracts the sentences in which they occur
import json

# Load the tab-separated text file
def load_text(filename: str) -> list:
    """Read a tab-separated text file into a list of rows.

    Each non-blank line is split at its first tab, so a well-formed line
    ``"<number>\\t<sentence>"`` becomes ``[number, sentence]``.

    Args:
        filename: Path of the UTF-8 encoded input file.

    Returns:
        A list with one list of strings per non-blank line. A line that
        contains no tab yields a one-element row.

    Raises:
        OSError: If the file cannot be opened.
    """
    rows = []
    with open(filename, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Drop the line terminator so it does not end up inside the
            # sentence text (and later in the JSON output).
            line = raw_line.rstrip("\r\n")
            # Blank lines carry no data and would produce rows that cannot
            # be unpacked into (number, sentence).
            if not line.strip():
                continue
            # Split only at the first tab: any further tabs belong to the
            # sentence itself.
            rows.append(line.split("\t", 1))
    return rows

# Dictionaries with names
# Name forms searched for in the "aeneid" text.
# Each key is the canonical name written to the output; its list holds the
# inflected forms and alternative names that count as a mention.
names_aeneid = {
    # Mortal characters
    "Aeneas": ["Aeneas", "Aenean", "Aeneae", "Aenea", "Anchisiades", "Anchisiaden", "Anchisiadeo", "Anchisiade", "Anchisiadae"],
    "Turnus": ["Turnus", "Turnum", "Turni", "Turno", "Turne"],
    "Anchises": ["Anchises", "Anchisen", "Anchisae", "Anchisa", "Anchiseo"],
    "Dido": ["Dido", "Didonem", "Didonis", "Didoni", "Didone", "Didus", "Elissa", "Elissae", "Elissam"],
    "Ascanius": ["Ascanius", "Ascanium", "Ascanii", "Ascani", "Ascanio", "Iulus", "Iulum", "Iuli", "Iulo"],
    "Achates": ["Achates", "Achaten", "Achati", "Achata", "Achate", "Achatae"],
    "Lavinia": ["Lavinia", "Laviniam", "Laviniae"],
    # "Camillae" was previously misspelled "Caemillae" and could never match.
    "Camilla": ["Camilla", "Camillae", "Camillam"],
    "Mnestheus": ["Mnestheus", "Mnestheo", "Mnesthei", "Mnestheum", "Mnesthea"],
    # Deities
    "Iuppiter": ["Iuppiter", "Iovem", "Iovis", "Iovi", "Iove"],
    "Iuno": ["Iuno", "Iunonem", "Iunonis", "Iunoni", "Iunone"],
    "Venus": ["Venus", "Venerem", "Veneris", "Veneri", "Venere"],
    # Key was previously misspelled "Minverva", which leaked into the output.
    "Minerva": ["Minerva", "Minervam", "Minervae", "Pallada", "Palladem", "Palladis", "Palladi", "Pallade"],
    "Apollo": ["Apollo", "Apollonem", "Apollonis", "Apolloni", "Apollone", "Phoebus", "Phoebum", "Phoebi", "Phoebo"],
    "Mars": ["Mars", "Martem", "Martis", "Marti", "Marte", "Mavors", "Mavortem", "Mavortis", "Mavorti", "Mavorte", "Gradivus", "Gradivum", "Gradivi", "Gradivo"],
}

# Name forms searched for in the "pharsalia" text: canonical name -> list of
# inflected forms and alternative names that count as a mention.
names_pharsalia = {
    # Historical figures
    "Caesar": ["Caesar", "Caesarem", "Caesaris", "Caesari", "Caesare"],
    "Pompeius": [
        "Pompeius", "Pompeium", "Pompeii", "Pompei", "Pompeio",
        "Magnus", "Magnum", "Magni", "Magno", "Magne",
    ],
    "Cornelia": ["Cornelia", "Corneliam", "Corneliae"],
    "Cato": ["Cato", "Catonem", "Catonis", "Catoni", "Catone"],
    "Cleopatra": ["Cleopatra", "Cleopatram", "Cleopatrae"],
    "Curio": ["Curio", "Curionem", "Curionis", "Curioni", "Curione"],
    "Scaeva": ["Scaeva", "Scaevam", "Scaevae"],
    # Deities
    "Iuppiter": ["Iuppiter", "Iovem", "Iovis", "Iovi", "Iove"],
    "Apollo": [
        "Apollo", "Apollonem", "Apollonis", "Apolloni", "Apollone",
        "Phoebus", "Phoebum", "Phoebi", "Phoebo",
    ],
    "Mars": [
        "Mars", "Martem", "Martis", "Marti", "Marte",
        "Mavors", "Mavortem", "Mavortis", "Mavorti", "Mavorte",
        "Gradivus", "Gradivum", "Gradivi", "Gradivo",
    ],
    "Iuno": ["Iuno", "Iunonem", "Iunonis", "Iunoni", "Iunone"],
    "Venus": ["Venus", "Venerem", "Veneris", "Veneri", "Venere"],
}

# Name forms searched for in the "argonautica" text: canonical name -> list
# of inflected forms and alternative names that count as a mention.
names_argonautica = {
    "Iason": ["Iason", "Iasonem", "Iasona", "Iasonis", "Iasoni", "Iasone", "Aesonides", "Aesoniden", "Aesonidis", "Aesonidi", "Aesonide"],
    "Medea": ["Medea", "Medeam", "Medeae"],
    "Iuppiter": ["Iuppiter", "Iovem", "Iovis", "Iovi", "Iove"],
    "Iuno": ["Iuno", "Iunonem", "Iunonis", "Iunoni", "Iunone"],
    "Venus": ["Venus", "Venerem", "Veneris", "Veneri", "Venere"],
    "Apollo": ["Apollo", "Apollonem", "Apollonis", "Apolloni", "Apollone", "Phoebus", "Phoebum", "Phoebi", "Phoebo"],
    "Mars": ["Mars", "Martem", "Martis", "Marti", "Marte", "Mavors", "Mavortem", "Mavortis", "Mavorti", "Mavorte", "Gradivus", "Gradivum", "Gradivi", "Gradivo"],
    "Aeetes": ["Aeetes", "Aeeten", "Aeetis", "Aeeti", "Aeete"],
    "Hercules": ["Hercules", "Herculem", "Herculis", "Herculi", "Herculeo", "Hercule", "Alcides", "Alciden", "Alcidis", "Alcidi", "Alcide"],
    "Hylas": ["Hylas", "Hylan", "Hyladis", "Hyladi", "Hylade"],
    # The oblique stem of Pollux is "Polluc-"; the previous "Pollic-" forms
    # belong to "pollex" (thumb) and matched the wrong word.
    "Pollux": ["Pollux", "Pollucem", "Pollucis", "Polluci", "Polluce"],
    "Pelias": ["Pelias", "Pelian", "Peliae", "Pelia"],
}

# Name forms searched for in the "thebaid" text: canonical name -> list of
# inflected forms and alternative names that count as a mention.
names_thebaid = {
    # Deities
    "Iuppiter": ["Iuppiter", "Iovem", "Iovis", "Iovi", "Iove"],
    "Apollo": ["Apollo", "Apollonem", "Apollonis", "Apolloni", "Apollone", "Phoebus", "Phoebum", "Phoebi", "Phoebo"],
    "Mars": ["Mars", "Martem", "Martis", "Marti", "Marte", "Mavors", "Mavortem", "Mavortis", "Mavorti", "Mavorte", "Gradivus", "Gradivum", "Gradivi", "Gradivo"],
    "Iuno": ["Iuno", "Iunonem", "Iunonis", "Iunoni", "Iunone"],
    "Venus": ["Venus", "Venerem", "Veneris", "Veneri", "Venere"],
    # Other characters
    # "Capanea" was previously misspelled "Capenea" and could never match.
    "Capaneus": ["Capaneus", "Capanea", "Capaneos", "Capaneo", "Capaneu"],
    "Hippomedon": ["Hippomedon", "Hippomedonta", "Hippomedontis", "Hippomedonti", "Hippomedonte"],
    "Tydeus": ["Tydeus", "Tydea", "Tydeo", "Tydeos"],
    "Creon": ["Creon", "Creonta", "Creontis", "Creonti", "Creonte"],
    "Adrastus": ["Adrastus", "Adrastum", "Adrasti", "Adrasto"],
    "Amphion": ["Amphion", "Amphiona", "Amphionis", "Amphioni", "Amphione"],
    "Tisiphone": ["Tisiphone", "Tisiphones"],
    "Antigone": ["Antigone", "Antigonen", "Antigones"],
    "Polynices": ["Polynices", "Polynicen", "Polynicis", "Polynici", "Polynice"],
    # The nominative was missing here although every other entry lists it.
    "Eteocles": ["Eteocles", "Eteoclis", "Eteoclea", "Eteocleos"],
    "Iocasta": ["Iocasta", "Iocastam", "Iocastae"],
    "Amphiaraus": ["Amphiaraus", "Amphiarae"],
    "Theseus": ["Theseus", "Thesea", "Theseos", "Theseu"],
}

# Name forms searched for in the "punica" text: canonical name -> list of
# inflected forms and alternative names that count as a mention.
names_punica = {
    "Hannibal": [
        "Hannibal", "Hannibalem", "Hannibalis", "Hannibali", "Hannibale",
        "Poenus", "Poenum", "Poeni", "Poeno",
    ],
    "Scipio": ["Scipio", "Scipionem", "Scipionis", "Scipioni", "Scipione"],
    "Fabius": ["Fabius", "Fabium", "Fabii", "Fabi", "Fabio", "Cunctator"],
    # Deities
    "Iuppiter": ["Iuppiter", "Iovem", "Iovis", "Iovi", "Iove"],
    "Apollo": [
        "Apollo", "Apollonem", "Apollonis", "Apolloni", "Apollone",
        "Phoebus", "Phoebum", "Phoebi", "Phoebo",
    ],
    "Mars": [
        "Mars", "Martem", "Martis", "Marti", "Marte",
        "Mavors", "Mavortem", "Mavortis", "Mavorti", "Mavorte",
        "Gradivus", "Gradivum", "Gradivi", "Gradivo",
    ],
    "Iuno": ["Iuno", "Iunonem", "Iunonis", "Iunoni", "Iunone"],
    "Venus": ["Venus", "Venerem", "Veneris", "Veneri", "Venere"],
    # Further characters
    "Mago": ["Mago", "Magonem", "Magonis", "Magoni", "Magone"],
    "Varro": ["Varro", "Varronem", "Varronis", "Varroni", "Varrone"],
    "Marcellus": ["Marcellus", "Marcellum", "Marcelli", "Marcello"],
    "Fulvius": ["Fulvius", "Fulvium", "Fulvii", "Fulvi", "Fulvio"],
    "Regulus": ["Regulus", "Regulum", "Reguli", "Regulo"],
    "Hasdrubal": [
        "Hasdrubal", "Hasdrubalem", "Hasdrubalis", "Hasdrubali", "Hasdrubale",
    ],
    "Flaminius": ["Flaminius", "Flaminium", "Flamini", "Flaminii", "Flaminio"],
}

# Lookup table from a work's title to the name dictionary used for that work.
all_names_dict = dict(
    aeneid=names_aeneid,
    pharsalia=names_pharsalia,
    thebaid=names_thebaid,
    argonautica=names_argonautica,
    punica=names_punica,
)

def ner(text: list, name_dict: dict) -> dict:
    """Collect, for every name, the sentences in which it is mentioned.

    A name counts as mentioned when one of its forms occurs in the sentence
    (substring match). Forms that are also ordinary Latin words ("Poenus",
    "Magnus" and their inflections) only count when the module-level ``nlp``
    pipeline tags a matching token as NOUN or PROPN. If the sentence also
    contains an unambiguous form of the same name, it is accepted without
    tagging.

    Each sentence is recorded at most once per name.

    Args:
        text: Iterable of ``(number, sentence)`` pairs.
        name_dict: Mapping of canonical name -> list of forms to search for.

    Returns:
        Mapping of canonical name ->
        ``{"name": name, "occurrences": [{"sentence": ..., "number": ...}]}``.
        Names without any mention are absent.
    """
    # Forms that double as common words and therefore need a POS check.
    ambiguous_forms = {
        "Poenus", "Poenum", "Poeni", "Poeno", "Poene",
        "Magnus", "Magnum", "Magni", "Magno", "Magne",
    }
    allowed_pos = {"NOUN", "PROPN"}

    # Dict to store the relevant sentences
    person_occur_dict = {}

    for number, sentence in text:
        for name, forms in name_dict.items():
            present = [form for form in forms if form in sentence]
            if not present:
                continue

            if any(form not in ambiguous_forms for form in present):
                # An unambiguous form settles the matter; no tagging needed.
                mentioned = True
            else:
                # Only ambiguous forms were found: accept the sentence only
                # if one of them is used as a (proper) noun.
                candidates = set(present)
                mentioned = any(
                    token.text in candidates and token.pos_ in allowed_pos
                    for token in nlp(sentence)
                )

            if not mentioned:
                continue

            # Create the entry on first sight, then record the sentence once.
            entry = person_occur_dict.setdefault(
                name, {"name": name, "occurrences": []}
            )
            entry["occurrences"].append({"sentence": sentence, "number": number})

    return person_occur_dict


def occurences_in_dict(p_dict: dict) -> dict:
    """Count the recorded occurrences per person, most frequent first.

    Args:
        p_dict: Mapping of name -> entry whose ``"occurrences"`` value is a
            list.

    Returns:
        Mapping of name -> length of that list, ordered by descending count.
        Names with equal counts keep their relative order from ``p_dict``.
    """
    tallies = [(person, len(entry["occurrences"])) for person, entry in p_dict.items()]
    # Stable sort, so ties stay in their original order.
    tallies.sort(key=lambda pair: pair[1], reverse=True)
    return dict(tallies)

def main(input: str, output_freq: str, output_dict: str, title: str):
    """Run the name extraction for one work and write both result files.

    Args:
        input: Path of the text file passed to ``load_text``.
        output_freq: Path of the text file receiving one ``name: count``
            line per person, most frequent first.
        output_dict: Path of the JSON file receiving the full occurrence
            dictionary.
        title: Key into ``all_names_dict`` selecting the name dictionary.
    """
    sentences = load_text(input)
    occurrences = ner(sentences, all_names_dict[title])
    frequencies = occurences_in_dict(occurrences)

    # Frequency report: one "name: count" line per person
    report = "".join(f"{person}: {count}\n" for person, count in frequencies.items())
    with open(output_freq, 'w', encoding='utf-8') as freq_file:
        freq_file.write(report)

    # Full occurrence dictionary; keep non-ASCII characters readable
    with open(output_dict, 'w', encoding='utf-8') as json_file:
        json.dump(occurrences, json_file, ensure_ascii=False, indent=4)

# Process every work: read its cleaned text and write the frequency list
# and the occurrence dictionary next to the other analysis results.
works = ["aeneid", "pharsalia", "thebaid", "argonautica", "punica"]
for epic in works:
    main(
        input=f"../texts/{epic}_clean.txt",
        output_freq=f"../analysis/{epic}_freq.txt",
        output_dict=f"../analysis/{epic}_dict.json",
        title=epic,
    )