# Extraction and classification of classical references from the ConDÉ corpus

Script written by Morgane Pica for a submission to the symposium ["Lire les classiques en Normandie"](https://rmblf.be/2022/02/04/appel-a-contribution-lire-les-classiques-en-normandie/) (oct 2022), to be written by herself and Mathieu Goux.

## Imports & declarations

In [13]:
from tqdm.notebook import tqdm #tqdm est bibliothèque qui permet d'avoir une barre de progression
import xml.etree.ElementTree as ET

ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.register_namespace('xml','http://www.w3.org/XML/1998/namespace')

# Not all witnesses were enriched with reference identification.
witnesses = ["basnage","berault","merville","pesnelle","terrien"]
binpath = "/home/mpica/Progs/perso/CONDE/editions/base-version/"
einpath = "_base.xml"

listfile = "authors.csv"
tablefile = "mentions.csv"

## Extract authors and store in a dictionary

In [33]:
authors = {}

def extract(path):
    print(path)
    
    with open(path) as filein:
        tree = ET.parse(filein)
        root = tree.getroot()
        
        for author in tqdm(root.findall('.//{http://www.tei-c.org/ns/1.0}listPerson/{http://www.tei-c.org/ns/1.0}person')):
            fullbirth = author.find('.//{http://www.tei-c.org/ns/1.0}birth')
            
            try:
                ident = author.get("{http://www.w3.org/XML/1998/namespace}id")
                if ident not in authors.keys():
                    authors[ident] = {}
            
                    try:
                        if "when" in fullbirth.attrib.keys():
                            authors[ident]["birth"] = fullbirth.get("when")
                        else:
                            authors[ident]["birth"] = "?" + fullbirth.get("notBefore")
                            
                    except:
                        authors[ident]["birth"] = "none"

                    try:
                        lg = {}
                        for name in author.findall('.//{http://www.tei-c.org/ns/1.0}persName'):
                            try:
                                lg[name.get("{http://www.w3.org/XML/1998/namespace}lang")] = name.find("./{http://www.tei-c.org/ns/1.0}surname").text + ", " + name.find("./{http://www.tei-c.org/ns/1.0}firstname").text
                            except:
                                lg[name.get("{http://www.w3.org/XML/1998/namespace}lang")] = name.text
                        if "fr" in lg.keys():
                            authors[ident]["name"] = lg["fr"]
                        elif "la" in lg.keys():
                            authors[ident]["name"] = lg["la"]
                        else:
                            authors[ident]["name"] = lg["en"]
                            
                    except:
                        authors[ident]["name"] = "none"
            except:
                continue
            

for witness in witnesses:
    fullpath = binpath + witness + einpath
    
    extract(fullpath)

final = {}
for author in authors.keys():
    
    if "?" in authors[author]["birth"]:
        birth = authors[author]["birth"].replace("?","")
    try:
        if int(birth) < 550:
            final[author] = authors[author]
    except:
        print(authors[author])

print(final)

/home/mpica/Progs/perso/CONDE/editions/base-version/basnage_base.xml


  0%|          | 0/360 [00:00<?, ?it/s]

0it [00:00, ?it/s]

/home/mpica/Progs/perso/CONDE/editions/base-version/berault_base.xml


  0%|          | 0/145 [00:00<?, ?it/s]

0it [00:00, ?it/s]

/home/mpica/Progs/perso/CONDE/editions/base-version/merville_base.xml


  0%|          | 0/38 [00:00<?, ?it/s]

0it [00:00, ?it/s]

/home/mpica/Progs/perso/CONDE/editions/base-version/pesnelle_base.xml


  0%|          | 0/207 [00:00<?, ?it/s]

0it [00:00, ?it/s]

/home/mpica/Progs/perso/CONDE/editions/base-version/terrien_base.xml


  0%|          | 0/92 [00:00<?, ?it/s]

0it [00:00, ?it/s]

{'pausanias': {'birth': '?0100', 'name': 'Pausanias'}, 'lampride': {'birth': '?0300', 'name': 'Lampride'}, 'bergier': {'birth': '1567', 'name': None}, 'andegav': {'birth': '1536', 'name': None}, 'connan': {'birth': '1508', 'name': None}, 'homere': {'birth': '?-0899', 'name': 'Homère'}, 'virgile': {'birth': '-0070', 'name': 'Virgile'}, 'goncanus': {'birth': 'none', 'name': 'Goncanus'}, 'bodin': {'birth': '1530', 'name': None}, 'martini': {'birth': '1614', 'name': None}, 'hotman': {'birth': '1524', 'name': None}, 'febur': {'birth': 'none', 'name': 'Febur.'}, 'bald': {'birth': '1327', 'name': 'Balde'}, 'herauld': {'birth': '1579', 'name': None}, 'pommeraye': {'birth': '1617', 'name': None}, 'caesar': {'birth': '-0100', 'name': None}, 's-germain': {'birth': '?0490', 'name': ' Germain (saint) '}, 'artemidore': {'birth': '?0100', 'name': "Artemidore d'Éphèse"}, 'turnebe': {'birth': '1512', 'name': None}, 'spartian': {'birth': '?0300', 'name': None}, 'const': {'birth': '0905', 'name': 'Consta