# Extraction and classification of classical references from the ConDÉ corpus

Script written by Morgane Pica for a submission to the symposium ["Lire les classiques en Normandie"](https://rmblf.be/2022/02/04/appel-a-contribution-lire-les-classiques-en-normandie/) (oct 2022), to be written by herself and Mathieu Goux.

This script contains several functions, in this exact order in the file:
* `dateprep(first, second, third, fourth)` -> Function taking four years as strings and returning them as `YYYY-01-01` date strings, with defaults for missing values.
* `extract(witness, path)` -> Function taking the identifier of a TEI-XML file and the path to that file within the TEI-XML corpus in need of analysis. Each new author declaration will be added to a general dictionary external to the function.
* `get_w_text(word)` -> Function taking a `<tei:w>` element and returning its compiled textual content.
* `title_str(div)` -> Function taking a `<tei:div>` element with lemmatized text and returning its title, if any.
* `p_tokens(parag)` -> Function taking a `<tei:p>` element and returning all its words as a dictionary.
* `dict_to_str(tokendict)` -> Function taking a dictionary as produced by `p_tokens()` and returning the corresponding string.
* `get_context(parag, refnbs)` -> Function taking a dictionary as produced by `p_tokens()` and the list of the numbers of the tokens of a particular `<tei:ref>` element inside that paragraph, and sorting the tokens between left context, relevant text and right context.
* `concordances(authorid, bookpath)` -> Function taking the identifier of an author and the path to a TEI-XML file, and using the previous functions to gather relevant information on all mentions of this author in this file. It returns a dictionary for this author.

## Imports & declarations

In [1]:
from tqdm.notebook import tqdm #tqdm est bibliothèque qui permet d'avoir une barre de progression
from SPARQLWrapper import SPARQLWrapper, JSON
import xml.etree.ElementTree as ET
from datetime import datetime
import csv
import json
import os

# Serialize TEI elements without a prefix and keep the standard "xml" prefix.
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.register_namespace('xml','http://www.w3.org/XML/1998/namespace')

# Timestamp (to the minute) used to name the output folder and files.
dt = datetime.now()
tmsp = dt.strftime("%Y%m%d_%H%M")

# Not all witnesses were enriched with reference identification.
# Change paths to fit your own folder organization.
# A witness file path is built as: binpath + witness + einpath.
witnesses = ["basnage","berault","merville","pesnelle","terrien"]
binpath = "/home/mpica/Progs/perso/CONDE/editions/base-version/"
einpath = "_base.xml"


# Construct the path for outputs.
current_dir = os.getcwd()
new_dir = f"{current_dir}/output/extraction_{tmsp}"
# makedirs (rather than mkdir) also creates the parent "output" folder on a
# first run, and exist_ok avoids a crash when re-running within the same minute.
os.makedirs(new_dir, exist_ok=True)

# Change output paths here if you like.
# Make CSV path now.
idrefd_csv = f'{new_dir}/idref_authors_{tmsp}.csv'
idrefd_json = f'{new_dir}/idref_authors_{tmsp}.json'
classic_idrefd = f'{new_dir}/idref_classics_{tmsp}.csv'
all_classic_occurrences = f'{new_dir}/all_classic_occurrences_{tmsp}.csv'

# Lists of characters to be treated particularly when tokens are joined:
# noLspace -> no space between the previous token and this character,
# noRspace -> no space between this character and the next token.
noLspace = ",.)/]-'"
noRspace = "(/[]-'"
# insecable = ";:"

### FUNCTION: Prepare dates

In [2]:
def dateprep(first, second, third, fourth):
    """
    Function taking four years as strings and returning them as
    "YYYY-01-01" date strings, filling in missing values.

    A value counts as missing when it is None, "" or "none".
    Missing `first`/`second` default to "0000"; missing `third`/`fourth`
    default to `first`/`second` plus 100 years, zero-padded to 4 digits.

    :param first: year as str, or a missing value
        (presumably the earliest birth year -- TODO confirm with callers)
    :param second: year as str, or a missing value
    :param third: year as str, or a missing value
    :param fourth: year as str, or a missing value
    :return: tuple of four "YYYY-01-01" strings
    :raises ValueError: if a default has to be computed from a year
        which is not an integer string.
    """

    def _is_missing(value):
        return value is None or value == "none" or value == ""

    # These two used to be comparisons (==) instead of assignments, which
    # left the missing value in place and crashed further down.
    if _is_missing(first):
        first = "0000"
    if _is_missing(second):
        second = "0000"
    if _is_missing(third):
        third = "{:04}".format(int(first) + 100)
    if _is_missing(fourth):
        fourth = "{:04}".format(int(second) + 100)

    return first + "-01-01", second + "-01-01", third + "-01-01", fourth + "-01-01"

## FUNCTION: extract authors and store in a dictionary

In [3]:
# General author dictionary, filled by extract() across all witnesses.
authors = {}

_TEI = "{http://www.tei-c.org/ns/1.0}"
_XML = "{http://www.w3.org/XML/1998/namespace}"


def _date_bounds(event):
    """Return (earliest, latest) for a <tei:birth> or <tei:death> element."""
    if "when" in event.attrib:
        return event.get("when"), event.get("when")
    return event.get("notBefore"), event.get("notAfter")


def _display_name(person):
    """Return the preferred name of a <tei:person>, or None if none is usable."""
    by_lang = {}

    for name in person.findall(f".//{_TEI}persName"):
        lang = name.get(f"{_XML}lang")

        # If the name is not split into <forename>/<surname> elements, there
        # is text directly inside <persName>. Whitespace-only text (plain
        # indentation before the children) does not count as a name.
        if name.text and name.text.strip():
            by_lang[lang] = name.text.strip()
            continue

        # If the name is split, the order of the children is unsure,
        # therefore each kind is stored before the final string is built.
        parts = {}
        for child in name:
            if child.tag == f"{_TEI}forename":
                parts["fn"] = child.text
            elif child.tag == f"{_TEI}surname":
                parts["sn"] = child.text

        # A split name is only usable when both parts are present.
        if parts.get("fn") and parts.get("sn"):
            by_lang[lang] = parts["fn"] + " " + parts["sn"]

    # Order of preference for final display of name:
    # preferably French, if not, Latin, and if neither, English.
    # (These are the only three name languages within the corpus.)
    for lang in ("fr", "la", "eng"):
        if lang in by_lang:
            return by_lang[lang]
    return None


def extract(witness, path):
    """
    Function taking the name and path to a TEI-XML text file and
    analyzing the person declarations, doing two things:
        - returning an XML element named after the current witness,
            itself containing each person declaration for which a
            birth, a death or a name could not be found,
        - completing the general author dictionary with new elements,
            whether new authors or authors whose name was still
            missing after the previous witnesses.

    Each entry of the author dictionary has the keys "bnf" (value of
    @sameAs, None when absent), "earliest-birth", "latest-birth",
    "earliest-death", "latest-death" and "name"; the string "none"
    marks missing data.

    :param witness: Name (=id) of the current witness as str (no space)
    :param path: Path to the current witness TEI-XML file.
    :return: ET.Element named after the witness.

    """

    print("Extracting authors on -> "+ witness)

    # Create the witness element.
    liste = ET.Element(witness)

    # Let ElementTree open the file itself, so that the encoding comes from
    # the XML declaration and not from the locale.
    root = ET.parse(path).getroot()

    # Look for all declared authors.
    for author in tqdm(root.findall(f".//{_TEI}listPerson/{_TEI}person")):

        # Get current author identifier; without one there is no usable key.
        ident = author.get(f"{_XML}id")
        if ident is None:
            print("Weird guy here, not finding their ID.")
            continue

        try:
            if ident not in authors:
                incomplete = False
                entry = {"bnf": author.get("sameAs")}

                # Birth, then death: each one is read from its own element.
                for event in ("birth", "death"):
                    element = author.find(f".//{_TEI}{event}")
                    if element is None:
                        earliest = latest = "none"
                        incomplete = True
                    else:
                        earliest, latest = _date_bounds(element)
                    entry[f"earliest-{event}"] = earliest
                    entry[f"latest-{event}"] = latest

                name = _display_name(author)
                if name is None:
                    name = "none"
                    incomplete = True
                entry["name"] = name

                authors[ident] = entry

                # Keep incomplete declarations (once) for the debugging file.
                if incomplete:
                    liste.append(author)

            # If the author was recorded in a previous witness but has no name,
            # we try to make a name string again with this witness.
            elif authors[ident]["name"] == "none":
                name = _display_name(author)
                if name is None:
                    liste.append(author)
                else:
                    authors[ident]["name"] = name

            else:
                print(f"{ident} is already in the dict, so I'm not making it.")

        except Exception as err:
            # Best effort: one faulty declaration must not stop the extraction.
            print("I'm extracting authors for the first time. There is a problem with one.", repr(err))
            continue

    return liste

## FUNCTION: extract text from tei:w element

In [4]:
def get_w_text(word):

    """
    Function taking a <tei:w> element and
    returning its compiled textual content.

    Handled children: <height>, <supplied>, <c> and <hi> (own text),
    <lb> (no text of its own), <choice> (text of its second child,
    <reg> or <expan>) and <add> (own text plus its <lb>/<choice>
    children). The text following a handled child is always added.
    Any other child is ignored, along with the text following it.
    Missing text counts as an empty string.

    :param word: ET.Element('{http://www.tei-c.org/ns/1.0}w')
    :return: str
    :raises IndexError: if a <choice> has fewer than two children.

    """

    tei = "{http://www.tei-c.org/ns/1.0}"
    own_text = (tei + "height", tei + "supplied", tei + "c", tei + "hi")

    # Text directly inside <w>, before the first child.
    pieces = [word.text or ""]

    # Loop on all current <w> children.
    for item in word:

        # Children contributing their own text, then the text following them.
        # "or ''" keeps an empty element from adding the string "None"
        # or raising a TypeError.
        if item.tag in own_text:
            pieces.append(item.text or "")
            pieces.append(item.tail or "")

        # <tei:lb> only contributes the text following it.
        elif item.tag == tei + "lb":
            pieces.append(item.tail or "")

        # <tei:choice>: second child (<tei:reg> or <tei:expan>),
        # then the text following <choice>.
        elif item.tag == tei + "choice":
            pieces.append(item[1].text or "")
            pieces.append(item.tail or "")

        # <tei:add>: own text, then the same checks on its children,
        # then the text following <add>.
        elif item.tag == tei + "add":
            pieces.append(item.text or "")

            for subitem in item:
                if subitem.tag == tei + "lb":
                    pieces.append(subitem.tail or "")
                elif subitem.tag == tei + "choice":
                    pieces.append(subitem[1].text or "")
                    pieces.append(subitem.tail or "")

            pieces.append(item.tail or "")

    return "".join(pieces)

## FUNCTION: Make title string

In [5]:
def title_str(div):

    """
    Function taking a <tei:div> element with lemmatized text
    and returning its title, if any.

    The title is, in this order of preference:
        - the text of the <tei:head> tokens, when the first child
            of the div is a <tei:head>,
        - the value of @subtype,
        - the value of @type, unless it is 'part', 'chapter'
            or 'section',
        - the string "Aucun titre.".
    When the text of the head cannot be built, a message is printed
    and the string "Pas réussi." is returned.

    :param div: ET.Element('{http://www.tei-c.org/ns/1.0}div')
    :return: str

    """

    tei = "{http://www.tei-c.org/ns/1.0}"

    # Lists of characters to be treated particularly.
    noLspace = ",.)/]-'"
    noRspace = "(/[]-'"
    # insecable = ";:"

    # A div without any child has no first child: it simply has no head.
    first_child = div.find('./*[1]')

    if first_child is not None and first_child.tag == tei + "head":

        # List of strings to be filled.
        divlist = []
        # Last token seen, kept for the error message.
        word = None

        try:
            # Loop on each <tei:w> word token.
            for word in div.findall(f'./{tei}head/{tei}w'):

                # Compile the text of current <tei:w> element.
                wtxt = get_w_text(word)

                # If the list is empty, add the current word to the list.
                if not divlist:
                    divlist.append(wtxt)

                # Glue the token to the last entry when no space is wanted:
                # the token is a character not preceded by a space, or the
                # last entry is (or ends with) a character not followed by one.
                elif (wtxt in noLspace
                        or divlist[-1] in noRspace
                        or divlist[-1][-1] in noRspace):
                    divlist[-1] += wtxt

                # Otherwise, just add the token as a new list entry.
                else:
                    divlist.append(wtxt)

        # Just a marker to spot errors within final output.
        except Exception as e:
            culprit = "(no token)" if word is None else ET.tostring(word).decode('utf-8')
            # str() because a div does not necessarily have an @xml:id.
            print(e, "-> Could not construct string for: "+ culprit + " in " + str(div.get('{http://www.w3.org/XML/1998/namespace}id')))
            return "Pas réussi."

        # Make the return string by adding a space between each list entry.
        return " ".join(divlist)

    # If there is no title to the div but it has an @subtype,
    # its value makes the return string.
    if div.get('subtype') is not None:
        return div.get('subtype')

    # A @type other than the structural ones can stand for a title.
    divtype = div.get('type')
    if divtype is not None and divtype not in ['part','chapter','section']:
        return divtype

    return "Aucun titre."

## FUNCTION: Make a dictionary from the tokens in a paragraph

In [6]:
def p_tokens(parag):

    """
    Function taking a <tei:p> element and returning its word tokens
    as a dictionary.

    Tokens are the <tei:w> elements which are direct children of the
    paragraph, or direct children of a <tei:ref> or <tei:add> which is
    itself a direct child of the paragraph, in document order.

    :param parag: ET.Element('{http://www.tei-c.org/ns/1.0}p')
    :return: dict mapping the position of the token in the paragraph
        (starting at 1) to {'nb': value of @n, 'text': text of the token}

    """

    tei = "{http://www.tei-c.org/ns/1.0}"
    w_tag = tei + "w"
    wrappers = (tei + "ref", tei + "add")

    # First gather the tokens in document order...
    tokens = []
    for node in parag:
        if node.tag == w_tag:
            tokens.append(node)
        elif node.tag in wrappers:
            tokens.extend(node.findall("./" + w_tag))

    # ... then number them and compile their text.
    return {
        position: {'nb': token.get('n'), 'text': get_w_text(token)}
        for position, token in enumerate(tokens, start=1)
    }

## FUNCTION: Compile a token dict into a string

In [7]:
def dict_to_str(tokendict):

    """
    Function taking a dictionary as produced by p_tokens()
    and returning the corresponding string.

    Tokens are taken in the sorted order of the dictionary keys and
    separated by a space, except around the characters listed in
    noLspace (no space before them) and noRspace (no space after them).

    :param tokendict: dict whose values have a 'text' entry
    :return: str

    """

    chunks = []

    for key in sorted(tokendict):
        token = tokendict[key]['text']

        # The very first token always opens the list.
        if not chunks:
            chunks.append(token)
            continue

        previous = chunks[-1]

        # No space when the token is a character which sticks to the previous
        # word, or when the previous entry is (or ends with) a character
        # which sticks to the next word.
        if token in noLspace or previous in noRspace or previous[-1] in noRspace:
            chunks[-1] = previous + token
        else:
            chunks.append(token)

    # One space between each remaining entry.
    return " ".join(chunks)

## FUNCTION: Construct the context

In [8]:
def get_context(parag, refnbs):

    """
    Function sorting the tokens of a paragraph between left context,
    mention and right context.

    Tokens are read in the iteration order of the dictionary. Every
    token whose number is in refnbs goes to the mention, until the
    token bearing the last number of refnbs has been read; every token
    after that one goes to the right context; the others go to the
    left context.

    :param parag: dict as produced by p_tokens()
    :param refnbs: list of the @n values of the tokens of the mention
    :return: dict with the keys 'left', 'mention' and 'right' (str each)

    """

    before, inside, behind = {}, {}, {}
    mention_closed = False

    for position, token in parag.items():
        number = token['nb']

        if mention_closed:
            behind[position] = token
        elif number in refnbs:
            inside[position] = token
            # The last token of the mention closes it.
            mention_closed = number == refnbs[-1]
        else:
            before[position] = token

    return {
        'left': dict_to_str(before),
        'mention': dict_to_str(inside),
        'right': dict_to_str(behind),
    }

## FUNCTION: Concordancer for ONE author.

In [9]:
def _collect_mentions(container, refstr, location, authordict):
    """Add to authordict each mention of refstr found in the paragraphs of container."""

    tei = "{http://www.tei-c.org/ns/1.0}"

    # Only the paragraphs having at least one <tei:ref> as a direct child.
    for interesting in container.findall(f".//{tei}p[{tei}ref]"):

        # The paragraph is only tokenized if it holds a relevant mention,
        # and only once whatever the number of mentions.
        whole_p = None

        for ref in interesting.findall(f"./{tei}ref"):

            if ref.get('corresp') != refstr:
                continue

            if whole_p is None:
                whole_p = p_tokens(interesting)

            refnbs = [refw.get('n') for refw in ref.findall(f'./{tei}w')]
            sortedp = get_context(whole_p, refnbs)

            # Mentions are numbered from 1, in order of discovery.
            authordict[len(authordict) + 1] = {
                'ctxtg':sortedp['left'],
                'mention':sortedp['mention'],
                'ctxtd':sortedp['right'],
                'p':dict_to_str(whole_p),
                **location
            }


def concordances(authorid, bookpath):

    """
    Function taking the identifier of an author and the path to a
    TEI-XML file, and gathering information on all mentions of this
    author in this file.

    A mention is a <tei:ref> whose @corresp is "#" + authorid and which
    is a direct child of a <tei:p>. Paragraphs are looked for in the
    <tei:div> children of <tei:front>, then in the sections of the
    chapters of the parts of the text.

    :param authorid: identifier of the author as str (without "#")
    :param bookpath: path to the TEI-XML file
    :return: dict mapping the number of the mention (starting at 1) to
        a dict with the keys 'ctxtg' (left context), 'mention',
        'ctxtd' (right context), 'p' (whole paragraph), 'part',
        'chpt' and 'sct' (titles locating the mention).

    """

    tei = "{http://www.tei-c.org/ns/1.0}"
    refstr = "#" + authorid
    authordict = {}

    # Let ElementTree open the file itself, so that the encoding comes from
    # the XML declaration and not from the locale.
    root = ET.parse(bookpath).getroot()

    # Front matter: the title of the div is recorded as chapter title.
    for frontdiv in root.findall(f'.//{tei}front/{tei}div'):
        location = {'part':"<front>", 'chpt':title_str(frontdiv), 'sct':""}
        _collect_mentions(frontdiv, refstr, location, authordict)

    # Text: parts > chapters > sections.
    for part in root.findall(f'.//{tei}div[@type="part"]'):
        partitle = title_str(part)

        for chapter in part.findall(f'.//{tei}div[@type="chapter"]'):
            chaptitle = title_str(chapter)

            for section in chapter.findall(f'.//{tei}div[@type="section"]'):
                location = {'part':partitle, 'chpt':chaptitle, 'sct':title_str(section)}
                _collect_mentions(section, refstr, location, authordict)

    return authordict

## Getting the author list with information

In [10]:
# Initiate the root element for the XML debugging file.
# Root element of the XML debugging file: it receives one child per witness.
listroot = ET.Element("people")

# For each witness, build the path to its file from the initial variables
# and run extract() on it. This fills the general author dictionary, and the
# element returned by extract() goes into the XML debugging file.
for witness in witnesses:
    fullpath = f"{binpath}{witness}{einpath}"
    witness_report = extract(witness, fullpath)
    listroot.append(witness_report)

Extracting authors on -> basnage


  0%|          | 0/360 [00:00<?, ?it/s]

Extracting authors on -> berault


  0%|          | 0/145 [00:00<?, ?it/s]

ulpien is already in the dict, so I'm not making it.
cassiod is already in the dict, so I'm not making it.
aulus is already in the dict, so I'm not making it.
solon is already in the dict, so I'm not making it.
papon is already in the dict, so I'm not making it.
cicero is already in the dict, so I'm not making it.
plutarque is already in the dict, so I'm not making it.
verro is already in the dict, so I'm not making it.
bartole is already in the dict, so I'm not making it.
tite-live is already in the dict, so I'm not making it.
platon is already in the dict, so I'm not making it.
charondas is already in the dict, so I'm not making it.
bouteiller is already in the dict, so I'm not making it.
guenois is already in the dict, so I'm not making it.
afflito is already in the dict, so I'm not making it.
rebuffe is already in the dict, so I'm not making it.
alciat is already in the dict, so I'm not making it.
bohier is already in the dict, so I'm not making it.
aufreri is already in the dict, 

  0%|          | 0/38 [00:00<?, ?it/s]

le-rouille is already in the dict, so I'm not making it.
davir is already in the dict, so I'm not making it.
terrien is already in the dict, so I'm not making it.
godefroy is already in the dict, so I'm not making it.
berault is already in the dict, so I'm not making it.
littleton is already in the dict, so I'm not making it.
dudo is already in the dict, so I'm not making it.
chopin is already in the dict, so I'm not making it.
du-moulin is already in the dict, so I'm not making it.
justinien is already in the dict, so I'm not making it.
cujas is already in the dict, so I'm not making it.
rheginon is already in the dict, so I'm not making it.
aimoin is already in the dict, so I'm not making it.
coquille is already in the dict, so I'm not making it.
beaumanoir is already in the dict, so I'm not making it.
scaliger is already in the dict, so I'm not making it.
ragueau is already in the dict, so I'm not making it.
seneca is already in the dict, so I'm not making it.
tiraqueau is already i

  0%|          | 0/207 [00:00<?, ?it/s]

matt-paris is already in the dict, so I'm not making it.
du-chesne is already in the dict, so I'm not making it.
spelman is already in the dict, so I'm not making it.
vossius is already in the dict, so I'm not making it.
menage is already in the dict, so I'm not making it.
ragueau is already in the dict, so I'm not making it.
du-cange is already in the dict, so I'm not making it.
glanville is already in the dict, so I'm not making it.
breton is already in the dict, so I'm not making it.
littleton is already in the dict, so I'm not making it.
bracton is already in the dict, so I'm not making it.
stanford is already in the dict, so I'm not making it.
selden is already in the dict, so I'm not making it.
couvellus is already in the dict, so I'm not making it.
afflito is already in the dict, so I'm not making it.
skenaeus is already in the dict, so I'm not making it.
tacite is already in the dict, so I'm not making it.
marculphe is already in the dict, so I'm not making it.
bignon is alread

  0%|          | 0/92 [00:00<?, ?it/s]

aristote is already in the dict, so I'm not making it.
gratien is already in the dict, so I'm not making it.
s-paul is already in the dict, so I'm not making it.
severe is already in the dict, so I'm not making it.
le-rouille is already in the dict, so I'm not making it.
cicero is already in the dict, so I'm not making it.
augustin is already in the dict, so I'm not making it.
justinien is already in the dict, so I'm not making it.
paulus is already in the dict, so I'm not making it.
ulpien is already in the dict, so I'm not making it.
pline is already in the dict, so I'm not making it.
virgile is already in the dict, so I'm not making it.
plutarque is already in the dict, so I'm not making it.
quintilien is already in the dict, so I'm not making it.
bald is already in the dict, so I'm not making it.
rebuffe is already in the dict, so I'm not making it.
bud is already in the dict, so I'm not making it.
tiraqueau is already in the dict, so I'm not making it.
imbert is already in the dic

## Query IdRef on authors and make general lists

In [11]:
csv_output = []
info_output = []

# The two halves of the SPARQL query; the BnF URI of the author goes in between.
# The query asks IdRef for the person declared as the same as this BnF person,
# and for the preferred label of that person.
querystart = """
            PREFIX foaf: <http://xmlns.com/foaf/0.1/> 
            PREFIX dcterms: <http://purl.org/dc/terms/> 
            PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
            PREFIX owl: <http://www.w3.org/2002/07/owl#>

            SELECT ?prefLabel ?personne
            WHERE {
              ?personne a foaf:Person ;
                        owl:sameAs <"""

queryend = """#foaf:Person> ;
                        skos:prefLabel ?prefLabel .
            }"""

# Specify the IdRef endpoint, and ask for results in JSON format.
# The same client is used for all authors.
sparql = SPARQLWrapper("https://data.idref.fr/sparql")
sparql.setReturnFormat(JSON)

for author in tqdm(authors.keys()):

    # Start from a "not found" record; it is completed if the query succeeds.
    current = {
        "id":author,
        "ConDÉ name":authors[author].get("name", "none"),
        "found":"no",
        "IdRef name":"",
        "BnF":"",
        "IdRef":"",
        "earliest birth":authors[author]["earliest-birth"],
        "latest birth":authors[author]["latest-birth"],
        "earliest death":authors[author]["earliest-death"],
        "latest death":authors[author]["latest-death"]
    }

    try:
        # Turn the link to the BnF catalogue into the matching data.bnf.fr URI.
        link = authors[author]["bnf"]
        bnf=link.replace("catalogue", "data").replace("https","http")

        fullquery = querystart + bnf + queryend

        # Query for the IdRef person matching the current author.
        sparql.setQuery(fullquery)
        result = sparql.query().convert()
        output = result["results"]["bindings"][0]

    except Exception:
        # Deliberate best effort: no BnF link, failed query or no match all
        # mean "not found". Exception (and not a bare except) so that the
        # loop can still be interrupted from the keyboard.
        csv_output.append(current)
        continue

    current["found"] = "yes"
    current["BnF"] = bnf

    try:
        idref_name = output["prefLabel"]["value"]
        idref_uri = output["personne"]["value"]
    except (KeyError, TypeError):
        # Incomplete answer: the author is found but both fields stay empty.
        idref_name = ""
        idref_uri = ""

    current["IdRef name"] = idref_name
    current["IdRef"] = idref_uri
    csv_output.append(current)
    

  0%|          | 0/489 [00:00<?, ?it/s]

## Make the basic author files

In [12]:
# Both author tables (all authors / classical authors only) share this column layout.
author_fields = [
    "id", "ConDÉ name", "found", "IdRef name", "BnF", "IdRef",
    "earliest birth", "latest birth", "earliest death", "latest death",
]

# newline="" is what the csv module expects (otherwise blank rows appear on
# Windows); explicit UTF-8 because names and headers contain non-ASCII text.
with open(idrefd_csv, "w", newline="", encoding="utf-8") as csvfile:
    csvwriting = csv.DictWriter(csvfile, fieldnames=author_fields)
    csvwriting.writeheader()
    csvwriting.writerows(csv_output)

# ensure_ascii=False writes raw accented characters, so the file must be UTF-8.
with open(idrefd_json, "w", encoding="utf-8") as jsonfile:
    json.dump(csv_output, jsonfile, ensure_ascii=False)

# Keep only the authors whose earliest birth year is 560 or before.
# Authors whose birth year cannot be read as an integer are reported by id
# and left out of the selection.
for author in tqdm(csv_output):
    try:
        birth_year = int(author["earliest birth"])
    except (TypeError, ValueError):
        print(author["id"])
        continue
    if birth_year <= 560:
        info_output.append(author)

with open(classic_idrefd, "w", newline="", encoding="utf-8") as authorfile:
    csvwriting = csv.DictWriter(authorfile, fieldnames=author_fields)
    csvwriting.writeheader()
    csvwriting.writerows(info_output)

  0%|          | 0/489 [00:00<?, ?it/s]

anian
herold
arq
goncanus
febur
s-michel
masurier
chartier
ald
frigentius
fillesac
adrien-pape
s-matthieu
fortin
adrien
moise
froben
s-thomas
josephe
balduin
s-jeanbap
josue
bannes
evariste
montholon
bonneton
bam
colombel
pape-lucius
marcellus
s-luc
angel
nicephore
muncer
iudaus
anchar
timarchus
skinner
lauriere
perard
robertson
gillet
camus
rigord
nicod
duplessis
chantreau
gousset
vedel
morgues
everard
renauldon
bertheaume
maillard
gouget
avezan


## Get mentions of all relevant authors and output one table each

In [14]:
# CSV columns:
columns_l = ["ID","Auteur", "BnF", "IdRef", "Naissance (min)", "Naissance (max)", "Mort (min)", "Mort (max)", "N°", "Témoin", "Contexte G", "Mention", "Contexte D", "Paragraphe", "Partie", "Chapitre", "Section"]
all_occs = []              # one {'id', 'nb'} summary entry per author
all_authors_one_file = []  # every mention row, all authors together

for author in tqdm(info_output):
    authorpath = os.path.join(new_dir, author['id'] + ".csv")
    author_output = []
    birthmin, birthmax, deathmin, deathmax = dateprep(author["earliest birth"], author["latest birth"], author["earliest death"], author["latest death"])

    # Gather the mentions of this author in every witness of the corpus.
    for witness in witnesses:
        fullpath = binpath + witness + einpath
        bookdict = concordances(author['id'], fullpath)

        for occurrence, occ in bookdict.items():
            author_output.append({
                "ID": author['id'],
                "Auteur": author['IdRef name'],
                "BnF": author['BnF'],
                "IdRef": author['IdRef'],
                "Naissance (min)": birthmin,
                "Naissance (max)": birthmax,
                "Mort (min)": deathmin,
                "Mort (max)": deathmax,
                "N°": occurrence,
                "Témoin": witness,
                "Contexte G": occ['ctxtg'],
                "Mention": occ['mention'],
                "Contexte D": occ['ctxtd'],
                "Paragraphe": occ['p'],
                "Partie": occ['part'],
                "Chapitre": occ['chpt'],
                "Section": occ['sct'],
            })

    # The same rows feed both the per-author table and the global table.
    all_authors_one_file.extend(author_output)
    all_occs.append({'id': author['id'], 'nb': str(len(author_output))})

    # newline="" is what the csv module expects; explicit UTF-8 because the
    # headers and the quoted text contain accented characters.
    with open(authorpath, 'w', newline="", encoding="utf-8") as csvtobe:
        csvwriting = csv.DictWriter(csvtobe, fieldnames=columns_l)
        csvwriting.writeheader()
        csvwriting.writerows(author_output)

# One report line per author, in processing order.
occstr = "".join(
    f"{author['id']} was spotted {author['nb']} times.\n" for author in all_occs
)
   

  0%|          | 0/94 [00:00<?, ?it/s]

In [15]:
# Plain-text report: one line per author with its number of mentions.
with open(os.path.join(new_dir, "rapport.txt"), "w", encoding="utf-8") as txtfile:
    txtfile.write(occstr)

# Single table holding every mention of every classical author.
# newline="" is what the csv module expects; explicit UTF-8 because the
# headers and the quoted text contain accented characters.
with open(all_classic_occurrences, "w", newline="", encoding="utf-8") as csvfile:
    csvwriting = csv.DictWriter(csvfile, fieldnames=columns_l)
    csvwriting.writeheader()
    csvwriting.writerows(all_authors_one_file)