# Compare les balises TCOF selon la source de téléchargement

Imports

In [18]:
import os
import pandas as pd
import re
import requests
import xml.etree.ElementTree as ET

Fonctions

In [2]:
def get_all_folder(path):
    """Return the names of the immediate sub-directories of ``path``.

    Only the entry names are returned (not full paths), in the order
    ``os.listdir`` yields them. Entries that are not directories are skipped.
    """
    return [
        entry
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    ]

In [3]:
def add_path2corpus(path, corpus):
    """Return the path of the entry ``corpus`` located inside ``path``.

    The two components are joined with ``os.path.join`` so the separator of
    the running platform is used. A hand-written backslash separator only
    works on Windows, and spelling it as ``\\{`` inside a normal string
    literal is an invalid escape sequence that newer Python versions warn
    about.

    Args:
        path: Parent directory.
        corpus: Name of the child entry (folder or file) to append.

    Returns:
        The combined path as a string.
    """
    return os.path.join(path, corpus)

In [7]:
def flatten_list(liste):
    """Flatten one level of nesting.

    Every item of every sub-iterable in ``liste`` is placed, in order, into
    a single new list. Deeper nesting levels are left untouched.
    """
    return [element for group in liste for element in group]

In [8]:
def get_corpus_files(path):
    """Map each known file type present in ``path`` to a file name.

    The directory is listed once. For each of the extensions ``.xml``,
    ``.trs`` and ``.wav`` the returned dict holds, under the key ``'xml'``,
    ``'trs'`` or ``'wav'``, the name of a file with that extension. When
    several files share an extension, the one listed last is kept. Types
    with no matching file are absent from the dict.
    """
    found = {}
    for name in os.listdir(path):
        for extension in ('xml', 'trs', 'wav'):
            if name.endswith('.' + extension):
                found[extension] = name
    return found

In [9]:
def get_files_df(path):
    """Build a dataframe listing the files found in every corpus folder.

    ``path`` is walked two levels deep: each sub-folder of ``path`` is
    treated as a genre folder, and each sub-folder of a genre folder as a
    corpus folder. The result has one row per corpus folder, with a
    ``corpus`` column holding the folder path and one column per file type
    reported by ``get_corpus_files``.
    """
    genre_dirs = [add_path2corpus(path, genre) for genre in get_all_folder(path)]
    corpus_dirs = [
        add_path2corpus(genre_dir, corpus)
        for genre_dir in genre_dirs
        for corpus in get_all_folder(genre_dir)
    ]
    files_by_corpus = {
        corpus_dir: get_corpus_files(corpus_dir) for corpus_dir in corpus_dirs
    }
    frame = pd.DataFrame.from_dict(files_by_corpus, orient='index')
    # Turn the corpus-path index into a regular column named 'corpus'.
    return frame.rename_axis('corpus').reset_index()

In [10]:
def add_corpus_names_col(df):
    """Add a ``corpus_name`` column derived from the ``wav`` column.

    The corpus name is the wav file name without its ``.wav`` suffix. Rows
    whose ``wav`` value is missing (not a string) or does not end in
    ``.wav`` get ``None``. This avoids raising on missing values and avoids
    reusing the name computed for a previous row.

    Args:
        df: Dataframe with a ``wav`` column of file names.

    Returns:
        The same dataframe object, modified in place with the new column.
    """
    wav_pattern = re.compile(r'^(.*)\.wav$')
    corpus_names = []
    for file in df['wav'].tolist():
        # A corpus folder without a wav file shows up as NaN/None here.
        match = wav_pattern.search(file) if isinstance(file, str) else None
        corpus_names.append(match.group(1) if match else None)
    df['corpus_name'] = corpus_names
    return df

In [19]:
def get_all_tagsname(xml_file):
    """Return the set of distinct element tag names used in an XML file.

    The file is parsed with ``xml.etree.ElementTree``. The root element and
    all of its descendants contribute their tag name to the result.
    """
    document_root = ET.parse(xml_file).getroot()
    return {element.tag for element in document_root.iter()}

In [107]:
def is_same(list1, list2):
    """Report which items of ``list2`` are absent from ``list1``.

    Returns ``True`` when every item of ``list2`` is found in ``list1``.
    Otherwise returns the list of missing items, in ``list2`` iteration
    order.
    """
    known = set(list1)
    missing = [item for item in list2 if item not in known]
    # A non-empty list is truthy and is returned as is; an empty one yields True.
    return missing or True

In [26]:
def get_corpus_name(path):
    """Return the file name in ``path`` without directories or final extension.

    Both ``/`` and ``\\`` are treated as directory separators. Only the last
    ``.ext`` suffix is removed. If ``path`` does not end in an extension the
    pattern does not match and an ``AttributeError`` is raised.
    """
    stem_pattern = r'[^\\\/]*?(?=\.\w+$)'
    stem_match = re.search(stem_pattern, path)
    return stem_match.group()

In [28]:
def get_df_tags(xml_files, source):
    """Build a dataframe of the tag names used by each XML file.

    One row is produced per path in ``xml_files``, with the columns
    ``corpus_name`` (from ``get_corpus_name``), ``source`` (the ``source``
    label, repeated on every row) and ``tags`` (the set returned by
    ``get_all_tagsname``).
    """
    records = [
        (get_corpus_name(xml_file), source, get_all_tagsname(xml_file))
        for xml_file in xml_files
    ]
    return pd.DataFrame(records, columns=['corpus_name', 'source', 'tags'])

## Analyse de TCOF ZIP (depuis ortolang.fr)

In [77]:
# Path of the downloaded folder (adjust to the local machine).
path_zip = 'E:/Corpus/tcof/tcof/12/Corpus/Adultes'
# Build the dataframe listing the files found in each corpus folder under that path.
df_zip = get_files_df(path_zip)

In [78]:
# Add the corpus_name column (wav file name without its .wav suffix).
df_zip = add_corpus_names_col(df_zip)

In [85]:
# Display the file inventory of the zip download.
df_zip

Unnamed: 0,corpus,wav,trs,xml,corpus_name
0,E:/Corpus/tcof/tcof/12/Corpus/Adultes\GenreNon...,plaid_haut_07.wav,plaid_haut_07.trs,plaid_haut_07.xml,plaid_haut_07
1,E:/Corpus/tcof/tcof/12/Corpus/Adultes\GenreNon...,sdl_bas_14.wav,sdl_bas_14.trs,sdl_bas_14.xml,sdl_bas_14
2,E:/Corpus/tcof/tcof/12/Corpus/Adultes\GenreNon...,forum_gom_14.wav,forum_gom_14.trs,forum_gom_14.xml,forum_gom_14
3,E:/Corpus/tcof/tcof/12/Corpus/Adultes\GenreNon...,voyage_jus_14.wav,voyage_jus_14.trs,voyage_jus_14.xml,voyage_jus_14
4,E:/Corpus/tcof/tcof/12/Corpus/Adultes\GenreNon...,chev_beu_sd.wav,chev_beu_sd.trs,chev_beu_sd.xml,chev_beu_sd
...,...,...,...,...,...
234,E:/Corpus/tcof/tcof/12/Corpus/Adultes\Conversa...,automobile_gue_08.wav,automobile_gue_08.trs,automobile_gue_08.xml,automobile_gue_08
235,E:/Corpus/tcof/tcof/12/Corpus/Adultes\Conversa...,tourisme_arn_15.wav,tourisme_arn_15.trs,tourisme_arn_15.xml,tourisme_arn_15
236,E:/Corpus/tcof/tcof/12/Corpus/Adultes\Conversa...,photographie_cou_14.wav,photographie_cou_14.trs,photographie_cou_14.xml,photographie_cou_14
237,E:/Corpus/tcof/tcof/12/Corpus/Adultes\Conversa...,voyage_con_15.wav,voyage_con_15.trs,voyage_con_15.xml,voyage_con_15


In [93]:
# Build the list of all XML file paths of the zip download: corpus folder
# plus "<corpus_name>.xml", with every backslash normalised to a forward slash.
xml_files_zip = [
    r'{}\{}.xml'.format(corpus_dir, corpus_name).replace('\\', '/')
    for corpus_dir, corpus_name in zip(
        df_zip['corpus'].tolist(), df_zip['corpus_name'].tolist()
    )
]

In [95]:
# Fill the dataframe with the tag names of every XML file of the zip download.
# NOTE: this rebinds df_zip, replacing the file inventory built earlier.
df_zip = get_df_tags(xml_files_zip, "zip_ortolang")

In [96]:
# Display the per-corpus tag sets of the zip download.
df_zip

Unnamed: 0,corpus_name,source,tags
0,plaid_haut_07,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
1,sdl_bas_14,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
2,forum_gom_14,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
3,voyage_jus_14,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
4,chev_beu_sd,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
...,...,...,...
234,automobile_gue_08,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
235,tourisme_arn_15,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
236,photographie_cou_14,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
237,voyage_con_15,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."


# TCOF REPOSITORY

In [15]:
# Path of the folder the repository files were downloaded to (adjust to the
# local machine). Only the names of its .xml files are kept.
directory_path = r'E:/Corpus/tcof/TCOF_repo/'
directory_files = os.listdir(directory_path)
xml_files_repo = [
    file_name for file_name in directory_files if file_name.endswith('.xml')
]

In [16]:
# Prefix every XML file name with the directory path, which already ends
# with a separator.
xml_files_repo = [directory_path + file_name for file_name in xml_files_repo]

In [32]:
# Collect the tag names of every XML file of the repository download.
df = get_df_tags(xml_files_repo, "repository_ortolang")

In [31]:
# Display the per-corpus tag sets of the repository download.
df

Unnamed: 0,corpus_name,source,tags
0,these_pit_09,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
1,consultation_ger_08,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
2,orthophonie2_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
3,orthophonie_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
4,pedi_gra_06,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
...,...,...,...
234,reunionkholle_moi_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
235,reunionpp_aar_08,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
236,see_reu_mat_08,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
237,taki_cam_13,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."


# TCOF CNRTL : à faire 

# Comparaison des balises

In [99]:
# Gather all the dataframes into a single one.
# DataFrame.append is deprecated and was removed in pandas 2.0; pd.concat is
# its replacement and likewise keeps each frame's original row index.
df = pd.concat([df, df_zip])

  df = df.append(df_zip)


In [100]:
# Display the combined dataframe (repository rows followed by zip rows).
df

Unnamed: 0,corpus_name,source,tags
0,these_pit_09,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
1,consultation_ger_08,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
2,orthophonie2_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
3,orthophonie_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
4,pedi_gra_06,repository_ortolang,"{resume, droit_acces, genre, support, autorisa..."
...,...,...,...
234,automobile_gue_08,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
235,tourisme_arn_15,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
236,photographie_cou_14,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."
237,voyage_con_15,zip_ortolang,"{resume, droit_acces, genre, support, autorisa..."


In [105]:
# Collect the tag set of every corpus, then compute the union of all of them
# (every tag name seen in at least one corpus) -- a union, not an intersection.
lists_tags = df['tags'].tolist()
results_union = set().union(*lists_tags)

In [114]:
# For each corpus, find the tags of the union of all tag sets that are missing
# from its own tag set; is_same returns True when none are missing.
df['is_same'] = df['tags'].apply(lambda x:is_same(x, results_union))

In [116]:
# Show every corpus whose tag set lacks at least one tag of the union
# (for those rows is_same holds the list of missing tags instead of True).
df.loc[df['is_same'] != True]

Unnamed: 0,corpus_name,source,tags,is_same
0,these_pit_09,repository_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
1,consultation_ger_08,repository_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
2,orthophonie2_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
3,orthophonie_san_10,repository_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
9,apiculteur_sd,repository_ortolang,"{nom, association, typologie, naissance, profe...","[transcripteur, reviseur]"
...,...,...,...,...
222,apiculteur_sd,zip_ortolang,"{nom, association, typologie, naissance, profe...","[transcripteur, reviseur]"
223,professeur_cez_08,zip_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
228,lecturecroco_sd,zip_ortolang,"{nom, association, typologie, naissance, profe...","[transcripteur, reviseur]"
233,siderurgie_guy_10,zip_ortolang,"{resume, droit_acces, genre, support, autorisa...",[reviseur]
