# Extraction and classification of classical references from the ConDÉ corpus

Script written by Morgane Pica for a submission to the symposium ["Lire les classiques en Normandie"](https://rmblf.be/2022/02/04/appel-a-contribution-lire-les-classiques-en-normandie/) (oct 2022), to be written by herself and Mathieu Goux.

## Imports & declarations

In [6]:
from tqdm.notebook import tqdm #tqdm est bibliothèque qui permet d'avoir une barre de progression
import xml.etree.ElementTree as ET

ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.register_namespace('xml','http://www.w3.org/XML/1998/namespace')

# Not all witnesses were enriched with reference identification.
witnesses = ["basnage","berault","merville","pesnelle","terrien"]
binpath = "/home/mpica/Progs/perso/CONDE/editions/base-version/"
einpath = "_base.xml"

listfile = "authors.csv"
tablefile = "mentions.csv"
checklist = "checklist.xml"

## Extract authors and store in a dictionary

In [5]:
authors = {}

def extract(witness, path):
    print(witness)
    
    liste = ET.Element(witness)
    
    with open(path) as filein:
        tree = ET.parse(filein)
        root = tree.getroot()
        
        for author in tqdm(root.findall('.//{http://www.tei-c.org/ns/1.0}listPerson/{http://www.tei-c.org/ns/1.0}person')):
            fullbirth = author.find('.//{http://www.tei-c.org/ns/1.0}birth')
            
            try:
                ident = author.get("{http://www.w3.org/XML/1998/namespace}id")
                if ident not in authors.keys():
                    authors[ident] = {}
            
                    try:
                        if "when" in fullbirth.attrib.keys():
                            authors[ident]["birth"] = fullbirth.get("when")
                        else:
                            authors[ident]["birth"] = "?" + fullbirth.get("notBefore")
                            
                    except:
                        authors[ident]["birth"] = "none"
                        liste.append(author)

                    try:
                        lg = {}
                        for name in author.findall('.//{http://www.tei-c.org/ns/1.0}persName'):
                            try:
                                lg[name.get("{http://www.w3.org/XML/1998/namespace}lang")] = name.find("./{http://www.tei-c.org/ns/1.0}surname").text + ", " + name.find("./{http://www.tei-c.org/ns/1.0}firstname").text
                            except:
                                lg[name.get("{http://www.w3.org/XML/1998/namespace}lang")] = name.text
                        if "fr" in lg.keys():
                            authors[ident]["name"] = lg["fr"]
                        elif "la" in lg.keys():
                            authors[ident]["name"] = lg["la"]
                        else:
                            authors[ident]["name"] = lg["en"]
                            
                    except:
                        authors[ident]["name"] = "none"
                        liste.append(author)
            except:
                continue
    
    return liste
            

listroot = ET.Element("people")    

for witness in witnesses:
    fullpath = binpath + witness + einpath
    
    listroot.append(extract(witness, fullpath))

final = {}

for author in authors.keys():
    
    if "?" in authors[author]["birth"]:
        birth = authors[author]["birth"].replace("?","")
    else:
        birth = authors[author]["birth"]
    try:
        if int(birth) < 550:
            final[author] = authors[author]
    except:
        print(authors[author])

print(final)

with open(checklist, "w") as failures:
    a_ecrire = ET.tostring(listroot, encoding="unicode", method="xml")
    failures.write(a_ecrire)

basnage


  0%|          | 0/360 [00:00<?, ?it/s]

berault


  0%|          | 0/145 [00:00<?, ?it/s]

merville


  0%|          | 0/38 [00:00<?, ?it/s]

pesnelle


  0%|          | 0/207 [00:00<?, ?it/s]

terrien


  0%|          | 0/92 [00:00<?, ?it/s]

{'birth': 'none', 'name': 'Anian'}
{'birth': 'none', 'name': None}
{'birth': 'none', 'name': 'Arq.'}
{'birth': 'none', 'name': 'Goncanus'}
{'birth': 'none', 'name': 'Febur.'}
{'birth': 'none', 'name': '\u2028Michel (saint)\u2028'}
{'birth': 'none', 'name': 'Masurier'}
{'birth': 'none', 'name': None}
{'birth': 'none', 'name': 'Apulée'}
{'birth': 'none', 'name': 'Ald. Manut.'}
{'birth': 'none', 'name': 'Frigentius'}
{'birth': 'none', 'name': 'Fillesac'}
{'birth': 'none', 'name': 'Isidore'}
{'birth': 'none', 'name': 'Adrien (pape) '}
{'birth': 'none', 'name': '\u2028Matthieu (saint)\u2028'}
{'birth': 'none', 'name': 'Fortin'}
{'birth': 'none', 'name': 'Adrien'}
{'birth': 'none', 'name': 'Antipater de Thessalonique'}
{'birth': 'none', 'name': 'Moïse'}
{'birth': 'none', 'name': 'Froben'}
{'birth': 'none', 'name': '\u2028Thomas (saint)\u2028'}
{'birth': 'none', 'name': 'Josephe'}
{'birth': 'none', 'name': 'Balduin'}
{'birth': 'none', 'name': 'Jean-Baptiste (saint)\u2028'}
{'birth': 'none', '