# TCOF - CNRTL

**Imports**

In [21]:
import os
import pandas as pd
import re
import requests
import xml.etree.ElementTree as ET
import numpy as np
from bs4 import BeautifulSoup

# **Téléchargement des corpus depuis le site CNRTL**

In [None]:
def corpus_name_cnrtl(number):
    url = "https://tcof.atilf.fr/index.php?r=corpus%2Fview&id={}".format(number)
    response = requests.get(url)
    html_content = response.text
    soup = BeautifulSoup(html_content, "html.parser")
    # Find the h1 tag and extract its text
    h1_tag = soup.find('h1')
    text = h1_tag.get_text()
    # Extract the corpus name from the text
    corpus_name = text.replace('Corpus ', '')
    return corpus_name

In [None]:
for i in range(1,800):
    try:
        url = 'https://tcof.atilf.fr/index.php?r=corpus/download-corpus&id={}'.format(i)
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            filename = 'E:/Corpus/TCOF_CNRTL/{}.zip'.format(corpus_name_cnrtl(i))
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
    except:
        pass

# **Fichiers dans le dossier**

In [2]:
def get_files_cnrtl(path):
    list_corpus_path = [add_path2corpus(path, corpus) for corpus in get_all_folder(path)]
    mydict = {}
    for corpus_path in list_corpus_path:
        dict_corpus = get_corpus_files(corpus_path)
        mydict[corpus_path]=dict_corpus
    df = pd.DataFrame.from_dict(mydict, orient='index').rename_axis('corpus').reset_index()
    return df

In [3]:
def get_corpus_files(path):
    files = {}
    for file in os.listdir(path):
        if file.endswith('.xml'):
            files['xml']=file
        if file.endswith('.trs'):
            pattern = r'(anonymise_[\S]*.trs)'
            match = re.search(pattern, file)
            if match:
                files['anonymise_trs']=file
            else:
                files['trs']=file
        if file.endswith('.wav'):
            pattern = r'(anonymise_[\S]*.wav)'
            match = re.search(pattern, file)
            if match:
                files['anonymise_wav']=file
            else:
                files['wav']=file
        else:
            continue
    return files

In [4]:
def add_path2corpus(path,corpus):
    new = '{}\{}'.format(path, corpus)
    return new

In [5]:
def get_all_folder(path):
    directory = os.listdir(path)
    folder_list = []
    for folder in directory:
        if os.path.isdir(os.path.join(path, folder)):
            folder_list.append(folder)
    return folder_list

In [6]:
def add_corpus_names_col(df):
    wav = df['xml'].tolist()
    corpus_names = []
    for file in wav:
        try: 
            pattern = r'^(.*)\.xml$'
            match = re.search(pattern, file)
            if match:
                corpus = match.group(1)
        except:
            corpus = 'NAN'
        corpus_names.append(corpus)
    df['corpus_name'] = corpus_names
    return df

In [7]:
#mettre le chemin du dossier 
path_cnrtl = r'E:\Corpus\TCOF_CNRTL'
#obtenir le dataframe avec le noms des fichiers du dossier dedans
df_cnrtl = get_files_cnrtl(path_cnrtl)

In [8]:
df_cnrtl

Unnamed: 0,corpus,xml,wav,trs,anonymise_wav,anonymise_trs
0,E:\Corpus\TCOF_CNRTL\theorielinguistique_07,theorielinguistique_07.xml,theorielinguistique_07.wav,theorielinguistique_07.trs,,
1,E:\Corpus\TCOF_CNRTL\testssncf_qab_12,testssncf_qab_12.xml,testssncf_qab_12.wav,testssncf_qab_12.trs,anonymise_testssncf_qab_12.wav,anonymise_testssncf_qab_12.trs
2,E:\Corpus\TCOF_CNRTL\theatre_fle_11,theatre_fle_11.xml,,theatre_fle_11.trs,,
3,E:\Corpus\TCOF_CNRTL\telephone_lam_13,telephone_lam_13.xml,telephone_lam_13.wav,telephone_lam_13.trs,anonymise_telephone_lam_13.wav,anonymise_telephone_lam_13.trs
4,E:\Corpus\TCOF_CNRTL\tel_maz_07,tel_maz_07.xml,tel_maz_07.wav,tel_maz_07.trs,,
...,...,...,...,...,...,...
584,E:\Corpus\TCOF_CNRTL\thibaut1_der,thibaut1_der.xml,thibaut1_der.wav,thibaut1_der.trs,,
585,E:\Corpus\TCOF_CNRTL\thibault1_lev,thibault1_lev.xml,thibault1_lev.wav,thibault1_lev.trs,,
586,E:\Corpus\TCOF_CNRTL\thibault1_cor,thibault1_cor.xml,thibault1_cor.wav,thibault1_cor.trs,,
587,E:\Corpus\TCOF_CNRTL\thibaud1_son,thibaud1_son.xml,thibaud1_son.wav,thibaud1_son.trs,,


In [10]:
df_cnrtl = add_corpus_names_col(df_cnrtl)

In [11]:
#liste de tous les fichiers pour chaque corpus
df_cnrtl 

Unnamed: 0,corpus,xml,wav,trs,anonymise_wav,anonymise_trs,corpus_name
0,E:\Corpus\TCOF_CNRTL\theorielinguistique_07,theorielinguistique_07.xml,theorielinguistique_07.wav,theorielinguistique_07.trs,,,theorielinguistique_07
1,E:\Corpus\TCOF_CNRTL\testssncf_qab_12,testssncf_qab_12.xml,testssncf_qab_12.wav,testssncf_qab_12.trs,anonymise_testssncf_qab_12.wav,anonymise_testssncf_qab_12.trs,testssncf_qab_12
2,E:\Corpus\TCOF_CNRTL\theatre_fle_11,theatre_fle_11.xml,,theatre_fle_11.trs,,,theatre_fle_11
3,E:\Corpus\TCOF_CNRTL\telephone_lam_13,telephone_lam_13.xml,telephone_lam_13.wav,telephone_lam_13.trs,anonymise_telephone_lam_13.wav,anonymise_telephone_lam_13.trs,telephone_lam_13
4,E:\Corpus\TCOF_CNRTL\tel_maz_07,tel_maz_07.xml,tel_maz_07.wav,tel_maz_07.trs,,,tel_maz_07
...,...,...,...,...,...,...,...
584,E:\Corpus\TCOF_CNRTL\thibaut1_der,thibaut1_der.xml,thibaut1_der.wav,thibaut1_der.trs,,,thibaut1_der
585,E:\Corpus\TCOF_CNRTL\thibault1_lev,thibault1_lev.xml,thibault1_lev.wav,thibault1_lev.trs,,,thibault1_lev
586,E:\Corpus\TCOF_CNRTL\thibault1_cor,thibault1_cor.xml,thibault1_cor.wav,thibault1_cor.trs,,,thibault1_cor
587,E:\Corpus\TCOF_CNRTL\thibaud1_son,thibaud1_son.xml,thibaud1_son.wav,thibaud1_son.trs,,,thibaud1_son


# **Fichiers xml**

In [132]:
def xml2df(xml_files):
    list_df =  []
    for xml_file in xml_files:
        xml_ = open('{}'.format(xml_file), 'r')
        dict_file, locuteur = parsexml(xml_)
        df_corpus = get_df(dict_file)
        locuteur_list = list(locuteur.items())
        df_corpus['locuteur'] = [locuteur_list] * len(df_corpus)
        list_df.append(df_corpus)
    df = pd.concat(list_df)
    return df

In [133]:
def get_df(my_dict):
    dict2df_corpus = my_dict.copy()
    df_corpus = pd.DataFrame([dict2df_corpus])
    return df_corpus

In [134]:
def get_general(soup, my_dict, general):
    for tag_name in general:
        tag = soup.find(tag_name)
        if tag is not None:
            tag_value = tag.text.strip()
            tag_name = tag.name
            my_dict[tag_name] = tag_value
    return my_dict

def get_enregistrement(soup, my_dict, enregistrement):
    for tag_name in enregistrement:
        tag = soup.find(tag_name)
        if tag is not None:
            tag_value = tag.text.strip()
            tag_name = tag.name
            my_dict[tag_name] = tag_value
    return my_dict

def get_referencement(soup, my_dict, referencement):
    for tag_name in referencement:
        tag = soup.find(tag_name)
        if tag is not None:
            tag_value = tag.text.strip()
            tag_name = tag.name
            my_dict[tag_name] = tag_value
    return my_dict

def get_transcription(soup, my_dict, transcription):
    for tag_name in transcription:
        tag = soup.find(tag_name)
        if tag is not None:
            tag_value = tag.text.strip()
            tag_name = tag.name
            my_dict[tag_name] = tag_value
    return my_dict

def get_locuteurs(soup, my_dict, locuteur_tag):
    # get all locuteurs
    locuteurs = soup.find_all('locuteur')

    # create a dictionary of locuteurs
    for locuteur in locuteurs[1:]:
        loc_principal = locuteur.get('locuteurPrincipal')
        loc_nb_tour = locuteur.get('nombre_tours')
        loc_identifiant = locuteur.get('identifiant')

        # create a new dictionary for this locuteur
        dict_locuteur = {}
        dict_locuteur['nb_tour'] = loc_nb_tour
        dict_locuteur['locuteur_principal'] = loc_principal

        for tag_name in locuteur_tag:
            tag = locuteur.find(tag_name)
            if tag is not None:
                tag_value = tag.text.strip()
                tag_name = tag.name
                dict_locuteur[tag_name] = tag_value
        # add this locuteur to the dictionary
        my_dict[loc_identifiant] = dict_locuteur
    return my_dict

In [135]:
def parsexml(xml):
    general = ["nomDossier", "responsable_corpus","droit_acces","lien_autre_corpus","mail",'logiciel_alignement','anonymisation',"nombre_locuteurs","relation","type_corpus","modalite_recueil","canal","cadre","degre",'situation_enonciation',"genre","support_dialogue","document_annexe","resume","commentaire","createur_fiche",'date_creation_fiche']
    enregistrement = ["nom_fichier", "responsable", "autorisation", "qualite","taille","duree","date","duree_transcription","debut_timecode_transcription", "dernier_timecode_transcription", "pays","region","ville","arrondissement", "description_lieu","format"]
    transcription = ["nom_fichier", "transcripteur","reviseur","format", "nombre_mots", "convention_transcription"]
    referencement = ["nom_corpus", "responsable", "titre", "laboratoire"]
    locuteur_tag = ['age','sexe','etude','formation','profession_actuelle','profession_anterieure','role','degre','statut_francais','autre_langue','relation_locuteur','naissance','residence','appartenance','particularite','nombre_mots','temps_parole']
  
    soup = BeautifulSoup(xml, "lxml-xml")
    my_dict = {}
    my_dict.update(get_general(soup, my_dict, general))
    my_dict.update(get_enregistrement(soup, my_dict, enregistrement))
    my_dict.update(get_transcription(soup, my_dict,transcription))
    my_dict.update(get_referencement(soup, my_dict, referencement))
    locuteur = {}
    dict_locuteur = get_locuteurs(soup, locuteur, locuteur_tag)
    #my_dict.update(get_locuteurs(soup, my_dict, locuteur_tag))
    return my_dict, dict_locuteur

In [136]:
xml_files_cnrtl = [r'{}\{}.xml'.format(x, y).replace('\\','/') for x,y in list(zip(df_cnrtl['corpus'].tolist(), df_cnrtl['corpus_name'].tolist()))]

In [137]:
df_cnrtl_metadonnees = xml2df(xml_files_cnrtl)

In [138]:
#liste des métadonnées de chaque corpus
df_cnrtl_metadonnees

Unnamed: 0,nomDossier,responsable_corpus,droit_acces,mail,logiciel_alignement,anonymisation,nombre_locuteurs,relation,type_corpus,modalite_recueil,...,description_lieu,format,transcripteur,reviseur,nombre_mots,convention_transcription,nom_corpus,titre,laboratoire,locuteur
0,theorielinguistique_07,Virginie AndrÃ©,Libre,virginie.andre@univ-lorraine.fr,Transcriber,Son + transcription\nBip,3,professionnelle,entre adultes,Inconnue,...,,Wav,,,2423,2008,theorielinguistique_07,TCOF,ATILF,"[(L1, {'nb_tour': '66', 'locuteur_principal': ..."
0,testssncf_qab_12,Virginie AndrÃ©,Libre,virginie.andre@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,lien de parentÃ©,entre adultes,Inconnue,...,,Wav,"MICHEL Annlyse, QABICE Fanny",NASSAU Guillaume,3086,2008,testssncf_qab_12,TCOF,ATILF,"[(L1, {'nb_tour': '70', 'locuteur_principal': ..."
0,theatre_fle_11,Virginie AndrÃ©,Libre,virginie.andre@univ-lorraine.fr,Transcriber,Son + transcription\nBip,8,lien amical,entre adultes,Inconnue,...,,Wav,"FLEURANCE ClÃ©mentine, THOMARDEL Camille, GIRA...",NASSAU Guillaume,3859,2008,theatre_fle_11,TCOF,ATILF,"[(L1, {'nb_tour': '22', 'locuteur_principal': ..."
0,telephone_lam_13,Virginie AndrÃ©,Libre,virginie.andre@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,lien de parentÃ©,entre adultes,Inconnue,...,,Wav,LAMBERT LUCIE,NASSAU Guillaume,2372,2008,telephone_lam_13,TCOF,ATILF,"[(L1, {'nb_tour': '90', 'locuteur_principal': ..."
0,tel_maz_07,Virginie AndrÃ©,Libre,Virginie.Andre@univ-nancy2.fr,Transcriber,Son + transcription\nBip,2,lien amical,entre adultes,Inconnue,...,,Wav,"Mazoyer Mathilde, Gabrion Julie",StÃ©phanie Houin,1169,2008,tel_maz_07,TCOF,ATILF,"[(L1, {'nb_tour': '46', 'locuteur_principal': ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,thibaut1_der,Emmanuelle Canut,Libre,emmanuelle.canut@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,connaissance,adulte-enfant,SollicitÃ©es par un chercheur,...,,Wav,Dereniowski Catherine,G. Nassau,2459,2008,thibaut1_der,TCOF,ATILF,"[(Adulte, {'nb_tour': '77', 'locuteur_principa..."
0,thibault1_lev,Emmanuelle Canut,Libre,emmanuelle.canut@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,connaissance,adulte-enfant,SollicitÃ©es par un chercheur,...,,Wav,Y.Sow,E.Canut,940,2008,thibault1_lev,TCOF,ATILF,"[(Thibault, {'nb_tour': '71', 'locuteur_princi..."
0,thibault1_cor,Emmanuelle Canut,Libre,emmanuelle.canut@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,connaissance,adulte-enfant,SollicitÃ©es par un chercheur,...,,Wav,Y.Sow,E.Canut,728,2008,thibault1_cor,TCOF,ATILF,"[(Adulte, {'nb_tour': '88', 'locuteur_principa..."
0,thibaud1_son,Emmanuelle Canut,Libre,emmanuelle.canut@univ-lorraine.fr,Transcriber,Son + transcription\nBip,2,connaissance,adulte-enfant,SollicitÃ©es par un chercheur,...,,Wav,"Huguin, Sontag EugÃ©nie",G. Nassau,1067,2008,thibaud1_son,TCOF,ATILF,"[(Adulte, {'nb_tour': '55', 'locuteur_principa..."


# **Fichier trs**

**1. Analyse des tags.name**

In [143]:
def get_all_tagsname(xml_file):
    # Parse the XML file into an ElementTree object
    try : 
        tree = ET.parse(xml_file)
    # Get the root element of the tree
        root = tree.getroot()
    # Initialize an empty set to store the tag names
        tag_names = set()
    # Iterate over all elements in the tree and add their tag names to the set
        for elem in root.iter():
            tag_names.add(elem.tag)
    # Return the set of tag names
    except: 
        return "Pas de fichier trouvé à cet emplacement"
    return tag_names

In [151]:
def get_corpus_name(path):
    filename = re.search(r'[^\\\/]*?(?=\.\w+$)', path).group()
    return filename

In [156]:
def get_df_tags(xml_files):
    corpus_name = [get_corpus_name(xml_file) for xml_file in xml_files]
    tags_ = [get_all_tagsname(xml_file) for xml_file in xml_files]
    df = pd.DataFrame(list(zip(corpus_name, tags_)), columns = ['corpus_name','tags'])
    return df

In [176]:
def get_difference(df):
    # prendre la liste des tags + l'intersection de toutes les listes
    lists_tags = df['tags'].tolist()
    results_union = set().union(*lists_tags)
    # trouver les différences entre l'intersection de toutes les listes et la liste des tags d'un corpus donné
    df['is_same'] = df['tags'].apply(lambda x:is_same(results_union,x))
    return results_union, df

In [177]:
def is_same(list1, list2):
    s = set(list1)
    difference = [x for x in list2 if x not in s]
    if difference : 
        return difference
    else: 
        return True

In [178]:
trs_files_cnrtl = [r'{}\{}.trs'.format(x, y).replace('\\','/') for x,y in list(zip(df_cnrtl['corpus'].tolist(), df_cnrtl['corpus_name'].tolist()))]

In [180]:
df_cnrtl_tagstrs = get_df_tags(trs_files_cnrtl)

In [181]:
#liste des tags utilisés pour chaque fichier .trs
df_cnrtl_tagstrs

Unnamed: 0,corpus_name,tags
0,theorielinguistique_07,"{Speaker, Speakers, Trans, Sync, Section, Even..."
1,testssncf_qab_12,"{Speaker, Speakers, Trans, Sync, Section, Even..."
2,theatre_fle_11,"{Speaker, Speakers, Trans, Sync, Section, Even..."
3,telephone_lam_13,"{Speaker, Speakers, Trans, Sync, Section, Turn..."
4,tel_maz_07,"{Speaker, Speakers, Trans, Sync, Section, Even..."
...,...,...
584,thibaut1_der,"{Speaker, Speakers, Topic, Trans, Sync, Topics..."
585,thibault1_lev,"{Speaker, Speakers, Topic, Trans, Sync, Topics..."
586,thibault1_cor,"{Speaker, Speakers, Topic, Trans, Sync, Topics..."
587,thibaud1_son,"{Speaker, Speakers, Topic, Trans, Sync, Topics..."


Comparaison des tags

In [182]:
results_union_trs, df_cnrtl_tagstrs = get_difference(df_cnrtl_tagstrs)
print('Nb de tags communs :', len(results_union_trs), '\n Liste des tags communs :\n', results_union_trs)

Nb de tags communs : 12 
 Liste des tags communs :
 {'Speaker', 'Speakers', 'Sync', 'Topics', 'Section', 'Comment', 'Episode', 'Who', 'Topic', 'Trans', 'Event', 'Turn'}


In [183]:
df_cnrtl_tagstrs

Unnamed: 0,corpus_name,tags,is_same
0,theorielinguistique_07,"{Speaker, Speakers, Trans, Sync, Section, Even...",True
1,testssncf_qab_12,"{Speaker, Speakers, Trans, Sync, Section, Even...",True
2,theatre_fle_11,"{Speaker, Speakers, Trans, Sync, Section, Even...",True
3,telephone_lam_13,"{Speaker, Speakers, Trans, Sync, Section, Turn...",True
4,tel_maz_07,"{Speaker, Speakers, Trans, Sync, Section, Even...",True
...,...,...,...
584,thibaut1_der,"{Speaker, Speakers, Topic, Trans, Sync, Topics...",True
585,thibault1_lev,"{Speaker, Speakers, Topic, Trans, Sync, Topics...",True
586,thibault1_cor,"{Speaker, Speakers, Topic, Trans, Sync, Topics...",True
587,thibaud1_son,"{Speaker, Speakers, Topic, Trans, Sync, Topics...",True


Tous les fichiers trs ont les mêmes tags

**2. Parsage des fichiers trs**

In [231]:
def parse_trs(file):
    # = ["nom_fichier", "responsable", "autorisation", "qualite","taille","duree","date","duree_transcription","debut_timecode_transcription", "dernier_timecode_transcription", "pays","region","ville","arrondissement", "description_lieu","format"]
    transcription = ["nom_fichier", "transcripteur","reviseur","format", "nombre_mots", "convention_transcription"]
    referencement = ["nom_corpus", "responsable", "titre", "laboratoire"]
    locuteur_tag = ['age','sexe','etude','formation','profession_actuelle','profession_anterieure','role','degre','statut_francais','autre_langue','relation_locuteur','naissance','residence','appartenance','particularite','nombre_mots','temps_parole']
    trs = open(file,'r')
    soup = BeautifulSoup(trs, "lxml-xml")
    my_dict = {}
    my_dict.update(get_trans(soup, my_dict))
    my_dict.update(get_speaker(soup, my_dict))
    my_dict.update(get_topics(soup, my_dict))
    my_dict.update(get_section(soup, my_dict))
    my_dict.update(get_turn(soup, my_dict))
    return my_dict

In [259]:
def get_trans(soup, my_dict):
    trans = soup.find('Trans')
    # create a dictionary of locuteurs
    if trans is not None:
        audio_filename = trans.get('audio_filename')
        my_dict['audio_filename'] = audio_filename
    return my_dict

def get_speaker(soup, my_dict):
    speakers = soup.find_all('Speaker')
    # create a dictionary of speakers
    for speaker in speakers:
        scope = speaker.get('scope')
        accent = speaker.get('accent')
        dialect = speaker.get('dialect')
        check = speaker.get('check')
        name = speaker.get('name')
        id_ = speaker.get('id')
        # create a new dictionary for this speaker
        dict_locuteur = {}
        dict_locuteur['scope'] = scope
        dict_locuteur['accent'] = accent
        dict_locuteur['dialect'] = dialect
        dict_locuteur['check'] = check
        dict_locuteur['name'] = name
        dict_locuteur['id'] = id_
        my_dict[id_] = dict_locuteur
    return my_dict

def get_topics(soup, my_dict):
    topics = soup.find_all('Topic')
    # create a dictionary of locuteurs
    for topic in topics:
        id_ = topic.get('id')
        desc = topic.get('desc')
        # create a new dictionary for this speaker
        dict_topic = {}
        dict_topic['desc'] = desc
        dict_topic['id'] = id_
        my_dict[id_] = dict_topic
    return my_dict

def get_section(soup, my_dict):
    sections = soup.find_all('Section')
    # create a dictionary of locuteurs
    for section in sections:
        topic = section.get('topic')
        endTime = section.get('endTime')
        startTime = section.get('startTime')
        type_ = section.get('type')
        # create a new dictionary for this speaker
        dict_section = {}
        dict_section['topic'] = topic
        dict_section['endTime'] = endTime
        dict_section['startTime'] = startTime
        dict_section['type'] = type_
        my_dict['section_{}'.format(topic)] = dict_section
    return my_dict

def get_turn(soup, my_dict):
    turns = soup.find_all('Turn')
    for turn in turns:
        endTime = turn.get('endTime')
        startTime = turn.get('startTime')
        speaker = turn.get('speaker')
        sync_tags = turn.find_all('Sync')
        dict_turn = {}
        dict_turn['speaker'] = speaker
        dict_turn['text']= turn.text
        my_dict['{}_{}'.format(startTime, endTime)] = dict_turn
    return my_dict

In [260]:
def dict_to_dataframe(my_dict):
    df = pd.DataFrame.from_dict([my_dict])
    return df

**parser tous les fichiers trs**

In [262]:
list_df_trs = [dict_to_dataframe(parse_trs(file)) for file in trs_files_cnrtl]

**aperçu d'un résultat**

In [265]:
list_df_trs[10]

Unnamed: 0,audio_filename,spk1,spk2,spk3,to1,section_to1,0_7.176,7.176_7.476,7.476_8.924,8.924_15.556,...,961.832_963.051,963.051_963.144,963.144_964.453,964.453_965.688,965.688_969.265,969.265_970.706,970.706_971.487,971.487_971.987,971.987_973.737,973.737_974.667
0,solene_sofia_cm1_proinf.wav,"{'scope': 'local', 'accent': '', 'dialect': 'n...","{'scope': 'local', 'accent': '', 'dialect': 'n...","{'scope': 'local', 'accent': '', 'dialect': 'n...","{'desc': 'LS31/05/2012', 'id': 'to1'}","{'topic': 'to1', 'endTime': '974.667', 'startT...","{'speaker': 'spk1', 'text': '  voilÃ donc...","{'speaker': 'spk2', 'text': ' ben '}","{'speaker': 'spk1', 'text': ' dans ce qu'elle...","{'speaker': 'spk2', 'text': ' c'est que quand...",...,"{'speaker': 'spk2', 'text': ' et euh aussi il...","{'speaker': 'spk1', 'text': ' < ils pensent '}","{'speaker': 'spk3', 'text': ' donc Ã§a c'est ...","{'speaker': 'spk2', 'text': ' elles ont pas d...","{'speaker': 'spk1', 'text': ' non d'accord...","{'speaker': 'spk2', 'text': ' ah je peux m...","{'speaker': 'spk3', 'text': ' tu peux man- < ...","{'speaker': 'spk2', 'text': '  > '}","{'speaker': 'spk1', 'text': ' et bien je vous...","{'speaker': 'spk2', 'text': ' merci Ã vous ..."


**Pour chaque dataframe, il y a:**
- le nom du fichier audio, 
- le(s) locuteur(s) du fichier,
- le(s) topic(s) du fichier, 
- la(es) section(s) alignées
- les tours de paroles correspondent au reste des colonnes nommées *débutdutour_findutour*, puis pour chaque tour il y a le locuteur et le texte

**Ce qu'il reste à faire :**
- regarder plus en détail lorsqu'il y a plusieurs locuteurs
- regarder l'encodage pour les accents/caractères spéciaux