# Extraction and classification of classical references from the ConDÉ corpus

Script written by Morgane Pica for a submission to the symposium ["Lire les classiques en Normandie"](https://rmblf.be/2022/02/04/appel-a-contribution-lire-les-classiques-en-normandie/) (oct 2022), to be written by herself and Mathieu Goux.

## Imports & declarations

In [27]:
from tqdm.notebook import tqdm #tqdm est bibliothèque qui permet d'avoir une barre de progression
import xml.etree.ElementTree as ET
import csv

ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.register_namespace('xml','http://www.w3.org/XML/1998/namespace')

# Not all witnesses were enriched with reference identification.
witnesses = ["basnage","berault","merville","pesnelle","terrien"]
binpath = "/home/mpica/Progs/perso/CONDE/editions/base-version/"
einpath = "_base.xml"

listfile = "authors.csv"
tablefile = "mentions.csv"
checklist = "checklist.xml"

## FUNCTION: extract authors and store in a dictionary

In [22]:
authors = {}

def extract(witness, path):
    print(witness)
    
    """
    Function taking the name and path to a TEI-XML text file and
    analyzing the references, doing two things:
        - returning an XML element named after the current witness,
            itself containing the copy of each reference declaration,
        - completing the general author dictionnary with new elements,
            whether new authors or authors whose informations were
            incomplete.
            
    :param witness: Name (=id) of the current witness as str (no space)
    :param path: Path to the current witness TEI-XML file.
    
    """
    
    # Create the witness element.
    liste = ET.Element(witness)
    
    # Open and parse TEI-XML file.
    with open(path) as filein:
        tree = ET.parse(filein)
        root = tree.getroot()
        
        # Look for all declared authors.
        for author in tqdm(root.findall('.//{http://www.tei-c.org/ns/1.0}listPerson/{http://www.tei-c.org/ns/1.0}person')):
            
            # Get <birth> element.
            fullbirth = author.find('.//{http://www.tei-c.org/ns/1.0}birth')
            fulldeath = author.find('.//{http://www.tei-c.org/ns/1.0}death')
            
            try:
                # Get current author identifier.
                ident = author.get("{http://www.w3.org/XML/1998/namespace}id")
                
                if ident not in authors.keys():
                    # Create a dict. entry for the current author.
                    authors[ident] = {}
            
                    try:
                        # Get current author birth date.
                        
                        if "when" in fullbirth.attrib.keys():
                            authors[ident]["mbirth"] = "certain"
                            authors[ident]["birth"] = fullbirth.get("when")
                        else:
                            authors[ident]["mbirth"] = fullbirth.get("notBefore")
                            authors[ident]["birth"] = fullbirth.get("notAfter")
                            
                    except:
                        authors[ident]["birth"] = "none"
                        liste.append(author)
                    
                    try:
                        # Get current author death date.
                        
                        if "when" in fullbirth.attrib.keys():
                            authors[ident]["mdeath"] = "certain"
                            authors[ident]["death"] = fulldeath.get("when")
                        else:
                            authors[ident]["mdeath"] = fulldeath.get("notBefore")
                            authors[ident]["death"] = fulldeath.get("notAfter")
                            
                    except:
                        authors[ident]["birth"] = "none"
                        liste.append(author)
                        

                    try:
                        # Create a dict. to store all recorded names for current author.
                        lg = {}
                        
                        # Loop on names, store their language.
                        for name in author.findall('.//{http://www.tei-c.org/ns/1.0}persName'):
                            namelang = name.get("{http://www.w3.org/XML/1998/namespace}lang")
                            
                            if name.text:
                                # If name is not split into <forename>/<surname> elements,
                                # there is text directly into <persName> element and we
                                # make this the current language text.
                                lg[namelang] = name.text
                                
                            else:
                                # If name is split, the order is unsure, therefore
                                # we store each kind into its own entry
                                # within names dict. and make a final str out of it.
                                names = {}
                                for nchild in name.findall('*'):
                                    if nchild.tag == "{http://www.tei-c.org/ns/1.0}forename":
                                        names["fn"] = nchild.text
                                    elif nchild.tag == "{http://www.tei-c.org/ns/1.0}surname":
                                        names["sn"] = nchild.text
                                    
                                lg[namelang] = names["sn"] + ", " + names["fn"]
                                
                        
                        # Setting an order of preference for final display of name:
                        # preferably French, if not, Latin, and if neither, English.
                        # (These are the only three name languages within the corpus.)
                        if "fr" in lg.keys():
                            authors[ident]["name"] = lg["fr"]
                        elif "la" in lg.keys():
                            authors[ident]["name"] = lg["la"]
                        elif "eng" in lg.keys():
                            authors[ident]["name"] = lg["eng"]
                            
                    except:
                        authors[ident]["name"] = "none"
                        liste.append(author)
                    
                # If the author was recorded in a previous witness but has no name,
                # we try to make a name string again with this witness.
                
                elif authors[ident]["name"] == "none":
                    
                    try:
                        lg = {}
                        for name in author.findall('.//{http://www.tei-c.org/ns/1.0}persName'):
                            namelang = name.get("{http://www.w3.org/XML/1998/namespace}lang")
                            if name.text:
                                lg[namelang] = name.text
                                
                            else:
                                names = {}
                                for nchild in name.findall('*'):
                                    if nchild.tag == "{http://www.tei-c.org/ns/1.0}forename":
                                        names["fn"] = nchild.text
                                    elif nchild.tag == "{http://www.tei-c.org/ns/1.0}surname":
                                        names["sn"] = nchild.text
                                        
                                lg[namelang] = names["sn"] + ", " + names["fn"]
                                                            
                        if "fr" in lg.keys():
                            authors[ident]["name"] = lg["fr"]
                        elif "la" in lg.keys():
                            authors[ident]["name"] = lg["la"]
                        elif "eng" in lg.keys():
                            authors[ident]["name"] = lg["eng"]
                            
                    except:
                        authors[ident]["name"] = "none"
                        liste.append(author)
                    
            except:
                continue
    
    return liste

## FUNCTION : sort extracted authors: keep those born before year 550

In [20]:
def sort(dico):
    
    """
    Function taking a dictionary of authors shaped like so:
    {'authorID': {'birth':'0000', 'name':'AuthorName'}}
    and returning the same dictionnary where all authors with a
    number greater than 550 as a birthdate were removed.
    
    :param dico: dict
    
    """
    print("Now sorting authors.")
    # Looping on author identifiers (=keys of dict.)
    for author in tqdm(dico.keys()):
        
        birth = authors[author]["birth"]
        
        try:
            # If the author was born before 550, the entry is added
            # to the new dictionary.
            
            if int(birth) < 550:
                final[author] = authors[author]
        except:
            
            # If there is an error (no birth date), print the author
            # as we want to know if they are interesting now
            # (if so, we can correct the XML itself).
            
            print(authors[author])
    
    # Show me the final dictionary to assess the data.
    print(final)
    
    return final

## Function: get locations of all references

In [40]:
def get_refs(witness, path, authors):
    print(witness)
    """
    Function taking the name and path to a witness, as well
    as a list of author identifiers previously extracted, and
    fills the mentions dict. with identification information
    about each mention of each author in the list.
    
    
    :param authors: list of xml identifiers.
    """
    
    partcount = 0
    chptcount = 0
    sctcount = 0
    
    # Open and parse TEI-XML file.
    with open(path) as filein:
        tree = ET.parse(filein)
        root = tree.getroot()
        
        # Enter each part, chapter and section, keeping track of
        # their respective numbers. Chapter and section counts do not
        # start anew with each new parent, so as to have a unique number
        # within the document.
        for part in tqdm(root.findall('.//{http://www.tei-c.org/ns/1.0}div[@type="part"]')):
            partcount += 1
            
            for chapter in part.findall('.//{http://www.tei-c.org/ns/1.0}div[@type="chapter"]'):
                chptcount += 1
                
                for section in chapter.findall('.//{http://www.tei-c.org/ns/1.0}div[@type="section"]'):
                    sctcount += 1
                    
                    for ref in section.findall('.//{http://www.tei-c.org/ns/1.0}ref'):
                        
                        # Only work if the mention as an @corresp (there are others and
                        # these are of no interest here).
                        
                        if ref.get('corresp'):
                            ident = ref.get('corresp').replace("#","")
                            
                            if ident in authors and ident in mentions:
                                mentions[ident] += [[witness, str(partcount), str(chptcount), str(sctcount)]]
                            
                            elif ident in authors:
                                mentions[ident] = [[witness, str(partcount), str(chptcount), str(sctcount)]]

## Function : make a CSV file out of all this

In [35]:
def initial_csv(mentions, info):
    
    """
    Function writing the final CSV compiling author information
    and author mentions.
    
    :param mentions: a dictionary with author identifiers as keys
        (and a list of his mentions within the corpus as value)
        
    :param info: a dictionary with author identifiers as keys
        (and a dictionary of his personal information as value)
    """
    
    # Columns for the new CSV file.
    
    columns = [
        "Author",
        "Birth start",
        "Birth stop",
        "Death start",
        "Death stop",
        "Witness",
        "Part",
        "Chapter",
        "Section"
    ]
    
    
    # Open and prepare CSV file.
    with open(tablefile, 'w') as csvtobe:
        csvwriting = csv.DictWriter(csvtobe, fieldnames = columns)
        csvwriting.writeheader()
    
        # Loop on author keys in information dict.
        # and keep the associated value in "local" var.
        for author in info.keys():
            local = info[author]
            
            # Loop on all mentions of current author,
            # and combine with author information.
            for mention in mentions[author]:
                
                csvwriting.writerow({
                    "Author": local["name"],
                    "Birth start": local["mbirth"],
                    "Birth stop" : local["birth"],
                    "Death start" : local["mdeath"],
                    "Death stop" : local["death"],
                    "Witness" : mention[0],
                    "Part" : mention [1],
                    "Chapter" : mention [2],
                    "Section" : mention[3]
                })

## Using the previously declared functions

In [23]:
# Initiate the root element for the XML debugging file.
listroot = ET.Element("people")

# Loop on witnesses: construct the path from initial vars,
# then trigger extract() function on current witness,
# so as to both make the according element for the XML debugging file,
# and fill the general author dictionary.

for witness in witnesses:
    fullpath = binpath + witness + einpath
    
    listroot.append(extract(witness, fullpath))

# Start a new dictionary for the sorted authors.
final = {}

# Fill the dictionary with only desired authors.
final = sort(authors)

# Write XML debugging file.
with open(checklist, "w") as failures:
    a_ecrire = ET.tostring(listroot, encoding="unicode", method="xml")
    failures.write(a_ecrire)

basnage


  0%|          | 0/360 [00:00<?, ?it/s]

berault


  0%|          | 0/145 [00:00<?, ?it/s]

merville


  0%|          | 0/38 [00:00<?, ?it/s]

pesnelle


  0%|          | 0/207 [00:00<?, ?it/s]

terrien


  0%|          | 0/92 [00:00<?, ?it/s]

Now sorting authors.


  0%|          | 0/489 [00:00<?, ?it/s]

{'birth': 'none', 'name': 'Anian'}
{'birth': 'none', 'name': 'Herold, Johann Berthold'}
{'birth': 'none', 'name': 'Arq.'}
{'birth': 'none', 'name': 'Goncanus'}
{'birth': 'none', 'name': 'Febur.'}
{'birth': 'none', 'name': '\u2028Michel (saint)\u2028'}
{'birth': 'none', 'name': 'Masurier'}
{'birth': 'none', 'name': 'Chartier, Alexis'}
{'birth': 'none', 'name': 'Apulée'}
{'birth': 'none', 'name': 'Ald. Manut.'}
{'birth': 'none', 'name': 'Frigentius'}
{'birth': 'none', 'name': 'Fillesac'}
{'birth': 'none', 'name': 'Isidore'}
{'birth': 'none', 'name': 'Adrien (pape) '}
{'birth': 'none', 'name': '\u2028Matthieu (saint)\u2028'}
{'birth': 'none', 'name': 'Fortin'}
{'birth': 'none', 'name': 'Adrien'}
{'birth': 'none', 'name': 'Antipater de Thessalonique'}
{'birth': 'none', 'name': 'Moïse'}
{'birth': 'none', 'name': 'Froben'}
{'birth': 'none', 'name': '\u2028Thomas (saint)\u2028'}
{'birth': 'none', 'name': 'Josephe'}
{'birth': 'none', 'name': 'Balduin'}
{'birth': 'none', 'name': 'Jean-Baptiste 

In [38]:
# Start a general dictionary to store any mention information.
mentions = {}

# Once again loop on witnesses and construct path from initial vars,
# then trigger the get_refs() function to collect mention information.
for witness in witnesses:
    fullpath = binpath + witness + einpath
    get_refs(witness, fullpath, final.keys())

# Show the obtained dict. for checking.
print(mentions)

basnage


  0%|          | 0/4 [00:00<?, ?it/s]

berault


  0%|          | 0/6 [00:00<?, ?it/s]

merville


  0%|          | 0/4 [00:00<?, ?it/s]

pesnelle


  0%|          | 0/3 [00:00<?, ?it/s]

terrien


  0%|          | 0/16 [00:00<?, ?it/s]

{'marculphe': [['basnage', '1', '1', '1'], ['basnage', '1', '1', '1'], ['basnage', '1', '1', '13'], ['basnage', '1', '1', '13'], ['basnage', '1', '1', '13'], ['basnage', '1', '9', '105'], ['basnage', '1', '9', '109'], ['basnage', '1', '9', '109'], ['basnage', '1', '9', '114'], ['basnage', '1', '9', '147'], ['basnage', '1', '9', '147'], ['basnage', '1', '11', '241'], ['basnage', '1', '11', '241'], ['basnage', '1', '11', '241'], ['basnage', '1', '11', '255'], ['basnage', '1', '11', '265'], ['basnage', '1', '11', '267'], ['basnage', '1', '13', '338'], ['basnage', '1', '15', '377'], ['basnage', '1', '15', '377'], ['basnage', '1', '15', '400'], ['basnage', '1', '15', '402'], ['pesnelle', '1', '2', '1'], ['pesnelle', '1', '2', '18'], ['pesnelle', '1', '10', '117'], ['pesnelle', '1', '12', '247'], ['pesnelle', '1', '12', '255'], ['pesnelle', '1', '12', '267'], ['pesnelle', '1', '12', '267'], ['pesnelle', '1', '18', '379'], ['pesnelle', '1', '18', '383'], ['pesnelle', '1', '18', '383'], ['pesn

In [39]:
# Write compilation CSV.
initial_csv(mentions, final)

['basnage', '1', '1', '3']
['basnage', '1', '9', '156']
['basnage', '1', '11', '242']
['basnage', '1', '11', '242']
['basnage', '1', '15', '380']
['basnage', '1', '15', '380']
['basnage', '1', '15', '380']
['basnage', '1', '15', '380']
['basnage', '1', '15', '400']
['basnage', '1', '15', '419']
['basnage', '1', '16', '423']
['basnage', '1', '17', '443']
['basnage', '1', '21', '551']
['terrien', '1', '1', '2']
['terrien', '1', '1', '2']
['terrien', '1', '1', '2']
['terrien', '1', '1', '2']
['terrien', '2', '14', '48']
['terrien', '2', '14', '48']
['terrien', '11', '175', '685']
['terrien', '11', '175', '685']
['basnage', '1', '1', '1']
['basnage', '1', '1', '1']
['basnage', '1', '1', '13']
['basnage', '1', '1', '13']
['basnage', '1', '1', '13']
['basnage', '1', '9', '105']
['basnage', '1', '9', '109']
['basnage', '1', '9', '109']
['basnage', '1', '9', '114']
['basnage', '1', '9', '147']
['basnage', '1', '9', '147']
['basnage', '1', '11', '241']
['basnage', '1', '11', '241']
['basnage', 

KeyError: 'appius-claudius'