In [None]:
#import the required libraries
from bs4 import BeautifulSoup
import re
import pandas as pd
import os
import time

In [None]:
def _subject_from_path(file_path, prefix, suffix):
    """Return the file name of `file_path` with `prefix` and `suffix` removed."""
    # Split on both separators so Windows-style paths work on any platform.
    name = re.split(r'[\\/]', file_path)[-1]
    if prefix and name.startswith(prefix):
        name = name[len(prefix):]
    if suffix and name.endswith(suffix):
        name = name[:-len(suffix)]
    return name


def parse_xml_tei(file_path, prefix='cmr-wikiconflits-', suffix='_discu-tei-v1.xml'):
    """Extract every <post> element of a TEI XML file as a flat record.

    Parameters
    ----------
    file_path : str
        Path of the TEI file; it is opened as UTF-8 and parsed with
        BeautifulSoup's 'xml' parser.
    prefix, suffix : str, optional
        Leading and trailing parts of the file name that are stripped to
        obtain the subject label stored under 'sujet'.

    Returns
    -------
    list of dict
        One dict per <post>, with the keys 'xml_id', 'date', 'n', 'who',
        'texte' and 'sujet'. 'n' is '' when the attribute is absent and
        'date' is None when no attribute name starts with 'when'.

    Notes
    -----
    'xml:id' and 'who' are read with subscript access, so a <post> that
    lacks one of them is not tolerated.
    """
    print(file_path)  # progress trace when many files are parsed in a row
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'xml')

    # The subject only depends on the file name, so compute it once.
    sujet = _subject_from_path(file_path, prefix, suffix)

    results = []
    for post in soup.find_all('post'):
        # The date is the value of the first attribute whose name starts
        # with 'when'.
        date_attrs = [attr for attr in post.attrs if attr.startswith('when')]
        date = post[date_attrs[0]] if date_attrs else None

        texte = ' '.join(p.get_text() for p in post.find_all('p'))
        # Un-escape apostrophes: the pattern must match a literal backslash
        # followed by "'" (the regex \' alone would only match the quote).
        texte = re.sub(r"\\'", "'", texte)
        # Fold a line break and the whitespace that follows it into one space.
        texte = re.sub(r'\n\s+', ' ', texte)

        results.append({
            'xml_id': post['xml:id'],
            'date': date,
            'n': post.get('n', ''),
            'who': post['who'],
            'texte': texte,
            'sujet': sujet,
        })

    return results

In [None]:
def get_files_in_directory(directory):
    """Recursively list the files found under `directory`.

    Returns a list with one path per file, each built by joining the walked
    folder with the file name. The list is empty when `directory` does not
    exist.
    """
    # Nothing to walk when the folder does not exist.
    if not os.path.exists(directory):
        return []

    # Visit every folder below `directory` and collect its files.
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [None]:
def dict_to_dataframe(dictionary):
    """Build a pandas DataFrame from `dictionary`.

    The argument is passed unchanged to ``pd.DataFrame``; in this notebook
    it is called with a list of record dicts.
    """
    return pd.DataFrame(dictionary)

In [None]:
# Parse every file of the wikiconflict corpus folder and export all posts to
# a single tab-separated file.
# Raw strings keep the Windows backslashes literal: no invalid escape
# sequences and no manual doubling before letters such as 'f'.
path = get_files_in_directory(r'F:\Corpus\wikiconflict')
posts = [parse_xml_tei(file_path) for file_path in path]

# One DataFrame per parsed file, stacked into a single frame.
all_ = pd.concat([dict_to_dataframe(x) for x in posts])
all_.to_csv(r'F:\Corpus\finaux\wikiconflict.csv', sep='\t', encoding='utf-8')

In [None]:
# Example: parse a single discussion file.
# Raw string so the backslashes of the Windows path are taken literally.
file_path = r'F:\Corpus\wikiconflict\cmr-wikiconflits-bogdanoff_discu-tei-v1.xml'
posts = parse_xml_tei(file_path)

In [None]:
# `posts` comes from parsing ONE file in the example cell, so it is a flat
# list of record dicts. Building a frame per element would call pd.DataFrame
# on a dict of scalars and fail; convert the whole list at once instead.
all_ = dict_to_dataframe(posts)

In [None]:
import xml.etree.ElementTree as ET

def parse_reddit_xml(file_path):
    """Turn the <utt> elements of a Reddit XML document into flat records.

    Parameters
    ----------
    file_path : str
        Despite its name, this is the XML *content* itself: it is handed
        directly to ``ET.fromstring``, not opened as a path.

    Returns
    -------
    list of dict
        One dict per <utt>, with the keys 'link_id', 'subreddit_id', 'uid',
        'comment_id', 'parent_id', 'score', 'create_utc' and 'text'.
        Attribute values are returned as read from the XML (None when the
        attribute is absent); 'text' is the stripped element text, or ''
        when the element has no text.
    """
    root = ET.fromstring(file_path)

    data = []

    # Only <s> elements directly under the root, and <utt> elements directly
    # under each <s>, are visited.
    for s in root.findall('s'):
        link_id = s.get('link_id')
        subreddit_id = s.get('subreddit_id')

        for utt in s.findall('utt'):
            data.append({
                'link_id': link_id,
                'subreddit_id': subreddit_id,
                'uid': utt.get('uid'),
                'comment_id': utt.get('comment_id'),
                'parent_id': utt.get('parent_id'),
                'score': utt.get('score'),
                'create_utc': utt.get('create_utc'),
                # Element.text is None for an empty element such as <utt/>.
                'text': (utt.text or '').strip(),
            })

    return data


In [None]:
def remove_control_characters(xml_file):
    """Read `xml_file` as UTF-8 text and return it without control characters.

    The characters removed are those in the ranges \\x00-\\x08, \\x0B-\\x0C,
    \\x0E-\\x1F and \\x7F; tab, line feed and carriage return are kept.
    The file itself is not modified.
    """
    with open(xml_file, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Drop the control characters
    return re.sub(r'[\x00-\x08\x0B-\x0C\x0E-\x1F\x7F]', '', raw_text)


In [None]:
def clean_string(text):
    """Normalise whitespace and escaped apostrophes in `text`.

    The substitutions are applied in order and the resulting string is
    returned.
    """
    substitutions = (
        (r'\n+', '\n'),    # collapse runs of line breaks into one
        (r' +', ' '),      # collapse runs of spaces into one
        (r'\\\'', "'"),    # drop the backslash in front of an apostrophe
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

In [None]:
def clean_text_in_dict_list(dict_list):
    """Return new dicts in which the 'text' value went through clean_string.

    Every other key/value pair is copied unchanged; the input dicts are not
    modified.
    """
    cleaned_list = []
    for record in dict_list:
        cleaned_record = {}
        for field, content in record.items():
            if field == 'text':
                cleaned_record[field] = clean_string(content)
            else:
                cleaned_record[field] = content
        cleaned_list.append(cleaned_record)
    return cleaned_list

In [None]:
# Strip the control characters from the Reddit dump, then parse the cleaned
# XML text (parse_reddit_xml receives the content, not the path).
reddit_xml_path = r"F:\Corpus\reddit\archive\final_SPF_2.xml"
clean = remove_control_characters(reddit_xml_path)

d = parse_reddit_xml(clean)

In [None]:
# Clean the 'text' field of every parsed record, tabulate the records and
# export them as a tab-separated file.
cleaned_data_list = clean_text_in_dict_list(d)

df = pd.DataFrame(cleaned_data_list)

reddit_csv_path = r'F:\Corpus\finaux\reddit.csv'
df.to_csv(reddit_csv_path, sep ='\t', encoding='utf8')