In [None]:
import os
import pandas as pd
import re
import requests
import xml.etree.ElementTree as ET
import numpy as np
from bs4 import BeautifulSoup
import chardet
from bs4.element import NavigableString, Tag

In [None]:
def get_all_folder(path):
    """List the sub-directories of ``path`` as slash-terminated paths.

    Parameters
    ----------
    path : str
        Directory to scan. It may or may not end with a path separator.

    Returns
    -------
    list of str
        One entry per name returned by ``os.listdir(path)``: for a
        sub-directory, ``<path>/<name>/``; for any other entry, ``path``
        itself (so ``path`` is repeated once per non-directory entry).

    Raises
    ------
    OSError
        Propagated from ``os.listdir`` when ``path`` cannot be listed.
    """
    # The plain concatenation used before assumed a trailing separator: without
    # one, "root" + "sub" became "rootsub/". Add the missing "/" only when
    # needed, so paths that already end with a separator give the same strings.
    prefix = path if path.endswith(('/', '\\')) else path + '/'
    folder_list = []
    for folder in os.listdir(path):
        if os.path.isdir(os.path.join(path, folder)):
            folder_list.append('{}{}/'.format(prefix, folder))
        else:
            # A non-directory entry contributes the scanned directory itself.
            folder_list.append(path)
    return folder_list

In [None]:
def get_all_files(path):
    """Recursively list every file under ``path`` together with its extension.

    Parameters
    ----------
    path : str
        Root directory handed to ``os.walk``.

    Returns
    -------
    list of tuple(str, str)
        ``(file_path, extension)`` pairs, where ``extension`` is the second
        element of ``os.path.splitext(file_path)`` (leading dot included,
        empty string when there is none).
    """
    return [
        (full_path, os.path.splitext(full_path)[1])
        for root, _dirs, names in os.walk(path)
        for full_path in (os.path.join(root, name) for name in names)
    ]

In [None]:
def flatten_list(liste):
    """Flatten exactly one level of nesting.

    Parameters
    ----------
    liste : iterable of iterables
        Every element is iterated in turn; strings are therefore split into
        their characters.

    Returns
    -------
    list
        The items of every sub-iterable, in their original order.
    """
    return [item for sublist in liste for item in sublist]

# extract all corpus folders

In [None]:
# First level of the corpus tree: one entry per item found in the root folder.
# NOTE(review): hard-coded absolute Windows path — adjust it for the machine
# running the notebook.
liste_dossier = get_all_folder("E:/Corpus/clapi/clapi/2/CorpusComplet/")

In [None]:
# Second level of the tree: apply get_all_folder to every first-level folder,
# merge the results and drop duplicates.
# sorted() keeps the order reproducible: iterating a set of strings changes
# between kernel restarts (string hashes are randomised), which used to shuffle
# everything built from this list.
liste_corpus = sorted({
    corpus_dir
    for dossier in liste_dossier
    for corpus_dir in get_all_folder(dossier)
})

In [None]:
# Show every corpus folder on its own line.
for corpus_dir in liste_corpus:
    print(corpus_dir)

# Create dataframe to store all files inside a corpus

In [None]:
def create_new_key(dict_corpus, extension):
    """Return a key derived from ``extension`` that is not yet in ``dict_corpus``.

    Parameters
    ----------
    dict_corpus : dict
        Mapping whose existing keys must not be reused.
    extension : str
        Base name of the key (a file extension such as ``'.mp3'``).

    Returns
    -------
    str
        ``extension`` itself when it is free, otherwise ``extension`` followed
        by the smallest positive integer that yields an unused key
        (``'.mp31'``, ``'.mp32'``, ...).
    """
    if extension not in dict_corpus:
        return '{}'.format(extension)
    # Unbounded counter: the previous fixed range(1, 100) left the result
    # unassigned (UnboundLocalError) once the 99 suffixed names were all taken.
    suffix = 1
    while '{}{}'.format(extension, suffix) in dict_corpus:
        suffix += 1
    return '{}{}'.format(extension, suffix)

In [None]:
# Build one record (dict) per corpus folder, then create the DataFrame once.
# DataFrame.append no longer exists in pandas 2.x, and growing a frame row by
# row copies it at every iteration.
corpus_name_pattern = re.compile(r'[\S]+\/([\S]+)\/')
corpus_records = []
for x in liste_corpus:
    # The corpus name is the last path component before the trailing slash.
    match = corpus_name_pattern.search(x)
    if match:
        corpus_name = match.group(1)
    else:
        # Path not shaped like ".../<name>/": use its last component instead
        # of failing on match.group.
        corpus_name = os.path.basename(os.path.normpath(x))
    dict_corpus = {'corpus_name': str(corpus_name)}
    for file, extension in get_all_files(x):
        if extension == '.xml' and file.endswith('OLAC.xml'):
            dict_corpus['olac'] = file
        elif extension == '.xml' and file.endswith('TEI.xml'):
            dict_corpus['tei'] = file
        else:
            # Any other file is stored under its extension; repeated
            # extensions get a numeric suffix ('.mp3', '.mp31', ...).
            dict_corpus[create_new_key(dict_corpus, extension)] = file
    corpus_records.append(dict_corpus)
# Keys missing from a record become NaN in the corresponding row.
df = pd.DataFrame(corpus_records)

In [None]:
# Display the table built above: one row per corpus folder, one column per kind of file.
df

# Inspect xml files

In [None]:
def detect_encoding(file):
    """Parse ``file`` as XML, decoding it with the encoding guessed by chardet.

    Despite its name, the function returns the parsed document, not the
    encoding.

    Parameters
    ----------
    file : str
        Path of the file to read.

    Returns
    -------
    BeautifulSoup
        Document built with the ``"lxml-xml"`` parser.
    """
    # First pass: sniff the encoding from the raw bytes.
    with open(file, 'rb') as raw:
        guessed_encoding = chardet.detect(raw.read())['encoding']

    # Second pass: reopen as text with that encoding and build the tree.
    with open(file, 'r', encoding=guessed_encoding) as handle:
        return BeautifulSoup(handle, "lxml-xml")

In [None]:
def get_all_tagsname(xml_file):
    """Collect the distinct element tag names of an XML file.

    Parameters
    ----------
    xml_file : str or file object
        Source handed to ``xml.etree.ElementTree.parse``.

    Returns
    -------
    set of str or str
        The set of ``elem.tag`` values of every element in the tree
        (namespaced tags are reported by ElementTree as ``'{namespace}local'``).
        When the file cannot be opened or parsed, the message string
        ``"Pas de fichier trouvé à cet emplacement"`` is returned instead, so
        callers must check the type of the result.
    """
    try:
        root = ET.parse(xml_file).getroot()
        tag_names = {elem.tag for elem in root.iter()}
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit are no longer swallowed while parsing.
        return "Pas de fichier trouvé à cet emplacement"
    return tag_names

In [None]:
# For every corpus, collect the tag names used in its TEI file.
# Shape of each entry (a later cell flattens one level of it):
#   [[name, name, ...]]  when the file could be parsed,
#   ['']                 when the path is missing or the file is unreadable.
list_tags = []
pattern = r'\{[\S]+\}([\S]+)'  # "{namespace}local" -> captures "local"
for xml in df['tei'].tolist():
    tags = []
    try:
        # Gate: raises when the path is missing or the file cannot be opened or
        # decoded, which sends this corpus to the '' placeholder below.
        soup = detect_encoding(xml)
        tags_file = get_all_tagsname(xml)
        if isinstance(tags_file, str):
            # get_all_tagsname reports a failure with a message string; calling
            # list() on it used to store every character of the message as a tag.
            tags.append('')
        else:
            tags_file = list(tags_file)
            try:
                tag = [re.search(pattern, x).group(1) for x in tags_file]
            except AttributeError:
                # At least one tag has no "{namespace}" prefix: fall back to
                # removing only the two Dublin Core namespaces from every tag.
                tag = [
                    x.replace('{http://purl.org/dc/elements/1.1/}', '').replace('{http://purl.org/dc/terms/}', '')
                    for x in tags_file
                ]
            # Both branches store ONE nested list. The fallback used to store
            # bare strings, which the flatten step then split into characters.
            tags.append(tag)
    except Exception:
        tags.append('')
    list_tags.append(tags)

In [None]:
# Store the TEI tag lists in df (list_tags was built from df['tei'], so it has one entry per row).
df['tei_tags'] = list_tags

In [None]:
# For every corpus, collect the tag names used in its OLAC file.
# Each entry is a FLAT list of names ([''] when the path is missing or the file
# is unreadable): get_difference builds a set directly from these entries, so a
# nested list would fail there as an unhashable element.
list_tags_olac = []
pattern = r'\{[\S]+\}([\S]+)'  # "{namespace}local" -> captures "local"
for xml in df['olac'].tolist():
    tags = []
    try:
        # Gate: raises when the path is missing or the file cannot be opened or
        # decoded, which sends this corpus to the '' placeholder below.
        soup = detect_encoding(xml)
        tags_file = get_all_tagsname(xml)
        if isinstance(tags_file, str):
            # get_all_tagsname reports a failure with a message string; calling
            # list() on it used to store every character of the message as a tag.
            tags.append('')
        else:
            tags_file = list(tags_file)
            try:
                tag = [re.search(pattern, x).group(1) for x in tags_file]
            except AttributeError:
                # At least one tag has no "{namespace}" prefix: fall back to
                # removing only the two Dublin Core namespaces from every tag.
                tag = [
                    x.replace('{http://purl.org/dc/elements/1.1/}', '').replace('{http://purl.org/dc/terms/}', '')
                    for x in tags_file
                ]
            # extend (not append) keeps the entry flat in both branches.
            tags.extend(tag)
    except Exception:
        tags.append('')
    list_tags_olac.append(tags)

In [None]:
# Store the OLAC tag lists in df (list_tags_olac was built from df['olac'], so it has one entry per row).
df['olac_tags'] = list_tags_olac

In [None]:
# NOTE(review): this re-defines the identical flatten_list declared earlier in
# the notebook; one of the two definitions can be removed.
def flatten_list(liste):
    """Return the items of every sub-iterable of ``liste`` as one flat list.

    Only one level of nesting is removed; a string element is split into its
    characters because it is iterated like any other sub-iterable.
    """
    flattened_list = []
    for sublist in liste:
        flattened_list.extend(sublist)
    return flattened_list

In [None]:
def is_same(list1, list2):
    """Report which items of ``list2`` are absent from ``list1``.

    Parameters
    ----------
    list1 : iterable of hashable
        Reference items.
    list2 : iterable
        Items to look up in ``list1``.

    Returns
    -------
    list or bool
        The items of ``list2`` missing from ``list1`` (in ``list2`` order,
        duplicates kept), or ``True`` when nothing is missing.
    """
    known = set(list1)
    missing = [item for item in list2 if item not in known]
    # An empty list is falsy, so `or` yields True exactly when nothing is missing.
    return missing or True

In [None]:
def get_difference(df, col):
    """Compare every row's tag list with the union of all tag lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the tag lists; it is modified in place.
    col : str
        Name of the column containing one list of tags per row.

    Returns
    -------
    tuple(set, pandas.DataFrame)
        The union of all tags found in ``col``, and ``df`` with a new column
        ``'is_same<col>'`` holding, for each row, the result of
        ``is_same(row_tags, union)``.
    """
    column = '{}'.format(col)
    # Union (not intersection) of the tags of every row.
    results_union = set().union(*df[column].tolist())
    # Per row: what the union contains that this row's tag list does not.
    df['is_same{}'.format(col)] = df[column].apply(
        lambda row_tags: is_same(row_tags, results_union)
    )
    return results_union, df

In [None]:
# Union of all OLAC tags; also adds the 'is_sameolac_tags' column to df.
results_union_olac, df = get_difference(df,'olac_tags')

In [None]:
# Remove one level of nesting from each TEI tag entry.
# NOTE: not idempotent — running this cell a second time splits every tag name
# into single characters, because flatten_list iterates over strings too.
df['tei_tags'] = df['tei_tags'].apply(flatten_list)

In [None]:
# Union of all TEI tags; also adds the 'is_sametei_tags' column to df.
results_union_tei, df = get_difference(df,'tei_tags')

In [None]:
# Display df again, now including the tag columns and the is_same* comparison columns.
df

## Liste de tous les tags rencontrés (union) dans les fichiers TEI.xml

In [None]:
# Set returned by get_difference for 'tei_tags': the union of the tags of every row.
results_union_tei

## Liste de tous les tags rencontrés (union) dans les fichiers OLAC.xml

In [None]:
# Set returned by get_difference for 'olac_tags': the union of the tags of every row.
results_union_olac

## DF TRANSCRIPTION

In [None]:
# Copy of df without the media, XML-metadata and tag-analysis columns.
# NOTE(review): the remaining columns are presumably the transcription files
# (plus corpus_name) — confirm. drop() raises KeyError if a listed column is absent.
df_transcription = df.copy().drop(
    columns=[
        # audio / video files
        '.mp3', '.wav', '.mp4', '.mp41', '.wmv', '.mp42', '.avi', '.mp31',
        # XML metadata paths
        'olac', 'tei',
        # tag-analysis columns
        'tei_tags', 'olac_tags', 'is_sameolac_tags', 'is_sametei_tags',
    ]
)

## Analyse du dataframe

In [None]:
# Number of missing values in each column.
df_transcription.isnull().sum()  # author's note: the MAJORITY of the corpora are not transcribed

In [None]:
# Sum of the per-column counts.
df_transcription.isnull().sum().sum()  # total number of missing values in the table

In [None]:
# Summary of the table: column names, non-null counts and dtypes.
df_transcription.info()