# Concordancer for the authors documented by identifier

Script written by Morgane Pica for a submission to the symposium ["Lire les classiques en Normandie"](https://rmblf.be/2022/02/04/appel-a-contribution-lire-les-classiques-en-normandie/) (oct 2022), to be written by herself and Mathieu Goux.

This script contains several functions, in this exact order in the file:
* `get_w_text(word)` -> Function taking a `<tei:w>` element and returning its compiled textual content.
* `title_str(div, dtype, dcount)` -> Function taking a `<tei:div>` element with lemmatized text and returning its title, if any.
* `p_tokens(parag)` -> Function taking a `<tei:p>` element and returning all its words as a dictionary.
* `dict_to_str(tokendict)` -> Function taking a dictionary as produced by `p_tokens()` and returning the corresponding string.
* `get_context(parag, refnbs)` -> Function taking a dictionary as produced by `p_tokens()` and the list of the `@n` numbers of the tokens of a particular `<tei:ref>` element inside that paragraph, and sorting the tokens into left context, mention and right context.
* `concordances(authorid, bookpath)` -> Function taking the identifier of an author and the path to a TEI-XML file, and using the previous functions to gather relevant information on all mentions of this author in this file. It returns a dictionary of the mentions of this author.
    

**À FAIRE** :
* Renuméroter les tokens pour que tous aient un numéro.

## Imports & declarations

In [1]:
from tqdm.notebook import tqdm #tqdm est bibliothèque qui permet d'avoir une barre de progression
import xml.etree.ElementTree as ET
from datetime import datetime
import csv
import json
import os

# Serialize TEI elements without a prefix and keep the reserved "xml" prefix
# whenever an element is written back out with ET.tostring().
ET.register_namespace("", "http://www.tei-c.org/ns/1.0")
ET.register_namespace('xml', 'http://www.w3.org/XML/1998/namespace')

# Get current time (minute precision); it names the output folder of this run.
dt = datetime.now()
tmsp = dt.strftime("%Y%m%d_%H%M")

# Not all witnesses were enriched with reference identification.
# Change paths to fit your own folder organization.
# The file of a witness is read from: binpath + witness + einpath.
witnesses = ["basnage", "berault", "merville", "pesnelle", "terrien"]
binpath = "/home/mpica/Progs/perso/CONDE/editions/base-version/"
einpath = "_base.xml"

# Get Json documenting interesting authors.
# The file is opened in binary mode so that json detects the encoding itself
# (UTF-8/16/32) instead of depending on the locale of the machine.
with open("authors.json", "rb") as jsonf:
    authors = json.load(jsonf)

# Construct the path for outputs.
# makedirs() also creates the parent "output" folder when it is missing, and
# exist_ok=True lets the cell be run again within the same minute.
current_dir = os.getcwd()
new_dir = f"{current_dir}/output/{tmsp}"
os.makedirs(new_dir, exist_ok=True)

# Make CSV path now.
csvpath = f'{new_dir}/occurrences.csv'

# Change output paths here if you like.
# NOTE(review): listfile and authortable point to the same file, and none of
# these paths is used in the visible part of the notebook -- confirm.
listfile = f"{new_dir}/authors.csv"
tablefile = f"{new_dir}/mentions.csv"
authortable = f"{new_dir}/authors.csv"
authorjson = f"{new_dir}/authors.json"
checklist = f"{new_dir}/checklist.xml"

# Lists of characters to be treated particularly when tokens are joined:
# noLspace -> no space between the previous token and these characters;
# noRspace -> no space between these characters and the next token.
noLspace = ",.)/]-'"
noRspace = "(/[]-'"
# insecable = ";:"

## FUNCTION: extract text from tei:w element

In [2]:
def get_w_text(word):

    """
    Function taking a <tei:w> element and
    returning its compiled textual content.

    The text found directly inside <w> comes first, then every direct
    child contributes according to its tag:

    * <tei:height>, <tei:supplied>, <tei:c>, <tei:hi>: their own text;
    * <tei:lb>: nothing;
    * <tei:choice>: the text of its second child (<tei:reg> or <tei:expan>);
    * <tei:add>: its own text, then for each of its children the tail of
      a <tei:lb>, or the second child's text and the tail of a <tei:choice>.

    The text following each of these children (their tail) is added as
    well. Children with any other tag are ignored together with their tail.
    A missing text or tail counts as an empty string.

    :param word: ET.Element('{http://www.tei-c.org/ns/1.0}w')
    :return: the text of the token, as a string
    :raises IndexError: if a <tei:choice> has fewer than two children

    """

    tei = '{http://www.tei-c.org/ns/1.0}'
    lb_tag = tei + 'lb'
    choice_tag = tei + 'choice'
    add_tag = tei + 'add'
    # Children whose own text is simply copied.
    own_text_tags = (tei + 'height', tei + 'supplied', tei + 'c', tei + 'hi')

    # Pieces of the return string; `x or ""` turns a missing text (None)
    # into an empty string instead of the literal "None".
    pieces = [word.text or ""]

    # Loop on all current <w> children.
    for item in word:

        if item.tag in own_text_tags:
            pieces.append(item.text or "")

        elif item.tag == lb_tag:
            # A line break has no text of its own; only what follows counts.
            pass

        elif item.tag == choice_tag:
            # The second child of <choice> holds the regularized/expanded form.
            pieces.append(item[1].text or "")

        elif item.tag == add_tag:
            pieces.append(item.text or "")
            # Inside <add>, only <lb> and <choice> children are considered.
            for subitem in item:
                if subitem.tag == lb_tag:
                    pieces.append(subitem.tail or "")
                elif subitem.tag == choice_tag:
                    pieces.append(subitem[1].text or "")
                    pieces.append(subitem.tail or "")

        else:
            # Unknown child: skip it, and the text following it.
            continue

        # Text following the current child still belongs to the word.
        pieces.append(item.tail or "")

    return "".join(pieces)

## FUNCTION: Make title string

In [3]:
def title_str(div, dtype, dcount):

    """
    Function taking a <tei:div> element with lemmatized text
    and returning its title, if any.

    The title is built from the <tei:w> tokens of the <tei:head> children
    of the div when the first child of the div is a <tei:head>. Otherwise
    the value of @subtype is returned, or "Aucun titre." when the div has
    no @subtype either. If anything goes wrong, a message is printed and
    "Pas réussi." is returned, so that errors can be spotted in the output.

    :param div: ET.Element('{http://www.tei-c.org/ns/1.0}div')
    :param dtype: div.get('type') as string (not used by the function)
    :param dcount: integer (not used by the function)
    :return: the title as a string

    """

    # Lists of characters to be treated particularly.
    noLspace = ",.)/]-'"
    noRspace = "(/[]-'"

    # List of strings to be filled.
    divlist = []

    # Last token handled; kept for the error message. It stays None when
    # the error happens before any token was read.
    word = None

    try:
        # A div without any child has no first element: find() returns None.
        first = div.find('./*[1]')

        # If you do find a title as first child of div, make its text.
        if first is not None and first.tag == "{http://www.tei-c.org/ns/1.0}head":

            # Loop on each <tei:w> word token.
            for word in div.findall('./{http://www.tei-c.org/ns/1.0}head/{http://www.tei-c.org/ns/1.0}w'):

                # Compile the text of current <tei:w> element.
                wtxt = get_w_text(word)

                # If the list is empty, add the current word to the list.
                if not divlist:
                    divlist.append(wtxt)

                # Glue the token to the last entry when no space is wanted:
                # the token is a punctuation character not preceded by a
                # space, or the last entry is (or ends with) a character
                # not followed by a space. The order of the tests matters:
                # an empty last entry is caught by the second test, so the
                # third one never indexes an empty string.
                elif (wtxt in noLspace
                      or divlist[-1] in noRspace
                      or divlist[-1][-1] in noRspace):
                    divlist[-1] += wtxt

                # Otherwise, just add the token as a new list entry.
                else:
                    divlist.append(wtxt)

            # Once you have treated every token in the title, make the
            # return string by adding a space between each list entry.
            title = " ".join(divlist)

        # If there is no title to the div but it has an @subtype,
        # its value makes the return string.
        elif div.get('subtype') is not None:
            title = div.get('subtype')

        else:
            title = "Aucun titre."

    # Just a marker to spot errors within final output.
    except Exception as e:
        # Neither the token nor the @xml:id is guaranteed to exist here.
        culprit = ET.tostring(word).decode('utf-8') if word is not None else "(no token)"
        div_id = div.get('{http://www.w3.org/XML/1998/namespace}id') or "(no xml:id)"
        print(e, "-> Could not construct string for: " + culprit + " in " + div_id)
        title = "Pas réussi."

    return title

## FUNCTION: Make a dictionary from the tokens in a paragraph

In [4]:
def p_tokens(parag):

    """
    Function taking a <tei:p> element and returning all its words
    as a dictionary.

    The <tei:w> tokens taken into account are the direct children of the
    paragraph and the direct children of its <tei:ref> and <tei:add>
    children, in document order.

    :param parag: ET.Element('{http://www.tei-c.org/ns/1.0}p')
    :return: dictionary mapping the position of each token (starting at 1)
        to {'nb': value of @n, 'text': text of the token}

    """

    w_tag = "{http://www.tei-c.org/ns/1.0}w"
    # Children of the paragraph whose own <w> children are tokens too.
    wrapper_tags = (
        "{http://www.tei-c.org/ns/1.0}ref",
        "{http://www.tei-c.org/ns/1.0}add",
    )

    # First gather the <w> elements in reading order...
    tokens = []
    for node in parag.findall("./*"):
        if node.tag == w_tag:
            tokens.append(node)
        elif node.tag in wrapper_tags:
            tokens.extend(node.findall("./" + w_tag))

    # ...then number them from 1 and describe each of them.
    return {
        position: {'nb': token.get('n'), 'text': get_w_text(token)}
        for position, token in enumerate(tokens, start=1)
    }

## FUNCTION: Compile a token dict into a string

In [5]:
def dict_to_str(tokendict):

    """
    Function taking a dictionary as produced by p_tokens() and
    returning the corresponding string.

    Tokens are read in ascending order of their keys and separated by one
    space, except around the characters listed in the module-level strings
    noLspace (no space before them) and noRspace (no space after them).

    :param tokendict: dictionary whose values hold the token text under 'text'
    :return: the compiled string

    """

    chunks = []

    for key in sorted(tokendict):

        token_text = tokendict[key]['text']

        # The very first token always opens the list.
        if not chunks:
            chunks.append(token_text)
            continue

        previous = chunks[-1]

        # No space is wanted when the token is a character that sticks to
        # what precedes it, or when the previous chunk is, or ends with,
        # a character that sticks to what follows it.
        sticks = (
            token_text in noLspace
            or previous in noRspace
            or previous[-1] in noRspace
        )

        if sticks:
            chunks[-1] = previous + token_text
        else:
            chunks.append(token_text)

    # One space between the chunks that were kept apart.
    return " ".join(chunks)

## FUNCTION: Construct the context

In [6]:
def get_context(parag, refnbs):

    """
    Function sorting the tokens of a paragraph between left context,
    mention and right context.

    Tokens are read in the order of the dictionary. Those whose 'nb' is in
    refnbs form the mention; once the token whose 'nb' equals the last
    entry of refnbs has been reached, every following token belongs to the
    right context. All the other tokens belong to the left context.

    :param parag: dictionary as produced by p_tokens()
    :param refnbs: list of the @n values of the tokens of one <tei:ref>
    :return: {'left': str, 'mention': str, 'right': str}

    """

    before = {}
    inside = {}
    following = {}

    # Becomes True as soon as the last token of the mention has been stored.
    mention_closed = False

    for position, token in parag.items():
        number = token['nb']

        if mention_closed:
            following[position] = token
        elif number not in refnbs:
            before[position] = token
        else:
            inside[position] = token
            if number == refnbs[-1]:
                mention_closed = True

    return {
        'left': dict_to_str(before),
        'mention': dict_to_str(inside),
        'right': dict_to_str(following),
    }

## FUNCTION: Concordancer for ONE author.

In [7]:
def concordances(authorid, bookpath):

    """
    Function gathering information on all mentions of one author in one
    TEI-XML file.

    Only <tei:ref> elements that are direct children of a <tei:p> are
    considered, the paragraph being inside a div of type "section", itself
    inside a div of type "chapter", itself inside a div of type "part".
    A <tei:ref> is a mention of the author when its @corresp is exactly
    "#" + authorid.

    :param authorid: identifier of the author, without the leading "#"
    :param bookpath: path to the TEI-XML file
    :return: dictionary mapping the number of each mention (starting at 1)
        to a dictionary with the keys 'ctxtg' (left context), 'mention',
        'ctxtd' (right context), 'p' (whole paragraph), 'part', 'chpt'
        and 'sct' (titles of the enclosing divs)

    """

    tei = "{http://www.tei-c.org/ns/1.0}"
    refstr = "#" + authorid

    partcount = 0
    chptcount = 0
    sctcount = 0
    refcount = 0

    authordict = {}

    # Give the path to the parser rather than a file opened in text mode:
    # the encoding is then taken from the XML file itself and not from the
    # locale of the machine.
    root = ET.parse(bookpath).getroot()

    for part in root.findall(f'.//{tei}div[@type="part"]'):
        partcount += 1
        partitle = title_str(part, "part", partcount)

        for chapter in part.findall(f'.//{tei}div[@type="chapter"]'):
            chptcount += 1
            chaptitle = title_str(chapter, "chapter", chptcount)

            for section in chapter.findall(f'.//{tei}div[@type="section"]'):
                sctcount += 1
                sectitle = title_str(section, "section", sctcount)

                # Paragraphs having at least one <ref> as direct child.
                for interesting in section.findall(f".//{tei}p[{tei}ref]"):

                    # The paragraph is tokenised only once, and only if it
                    # mentions the author; every mention reuses the result.
                    whole_p = None
                    str_p = None

                    for ref in interesting.findall(f"./{tei}ref"):

                        if ref.get('corresp') != refstr:
                            continue

                        if whole_p is None:
                            whole_p = p_tokens(interesting)
                            str_p = dict_to_str(whole_p)

                        refcount += 1

                        # @n values of the tokens making up this mention.
                        refnbs = [refw.get('n') for refw in ref.findall(f'./{tei}w')]

                        sortedp = get_context(whole_p, refnbs)

                        authordict[refcount] = {
                            'ctxtg': sortedp['left'],
                            'mention': sortedp['mention'],
                            'ctxtd': sortedp['right'],
                            'p': str_p,
                            'part': partitle,
                            'chpt': chaptitle,
                            'sct': sectitle
                        }

    return authordict

## Using the previously declared functions

In [None]:
# CSV columns:
columns = ["ID","Auteur", "N°", "Témoin", "Contexte G", "Mention", "Contexte D", "Paragraphe", "Partie", "Chapitre", "Section"]

# One CSV file per author, gathering the mentions found in every witness.
for author in authors:
    authorname = authors[author]['name']
    authorpath = new_dir + "/" + author + ".csv"
    csv_output = []

    for witness in witnesses:
        fullpath = binpath + witness + einpath
        bookdict = concordances(author, fullpath)

        # The mention numbers ("N°") come from concordances() and start
        # again at 1 for each witness.
        for occurrence, occ in bookdict.items():
            csv_line = {
                "ID": author,
                "Auteur": authorname,
                "N°": occurrence,
                "Témoin": witness,
                "Contexte G": occ['ctxtg'],
                "Mention": occ['mention'],
                "Contexte D": occ['ctxtd'],
                "Paragraphe": occ['p'],
                "Partie": occ['part'],
                "Chapitre": occ['chpt'],
                "Section": occ['sct'],
            }
            csv_output.append(csv_line)

    # newline='' is required by the csv module (otherwise extra blank lines
    # may appear on some platforms); the explicit UTF-8 keeps the accented
    # headers and texts writable whatever the locale of the machine.
    with open(authorpath, 'w', newline='', encoding='utf-8') as csvtobe:
        csvwriting = csv.DictWriter(csvtobe, fieldnames=columns)
        csvwriting.writeheader()
        csvwriting.writerows(csv_output)