# Imports

In [None]:
import re 
import pandas as pd
import os
from bs4 import BeautifulSoup
import codecs

# Chemins des fichiers

In [None]:
# Root folders of the two corpora. Raw strings keep the Windows backslashes
# literal instead of relying on unrecognised escape sequences (\C, \L, \W).
path_lemonde_diplomatique = r'F:\Corpus\Le_Monde_2\W0036-02'
path_lemonde = r'F:\Corpus\Le_Monde_2\W0015'

In [None]:
def convert_encoding(input_file, output_file):
    """Re-encode an ISO-8859-15 text file as UTF-8.

    Both files are opened with ``newline=''`` so line endings are copied
    unchanged; with the default newline handling on the writing side, a
    ``\\r\\n`` read verbatim from the source would be written as ``\\r\\r\\n``
    on Windows.

    Args:
        input_file: Path of the ISO-8859-15 encoded file to read.
        output_file: Path of the UTF-8 file to create (overwritten if present).
    """
    with open(input_file, 'r', encoding='ISO-8859-15', newline='') as source:
        content = source.read()
    with open(output_file, 'w', encoding='utf-8', newline='') as target:
        target.write(content)

In [None]:
def extract_files(folder_path):
    """Return the path of every file found under *folder_path*, recursively.

    Paths are built by joining each directory visited by ``os.walk`` with the
    file names it contains, in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]

# Extraction de tous les fichiers + garder ceux avec les extensions qui nous intéressent

In [None]:
# Le Monde diplomatique: keep only the files named "...-article.html"
files = [
    path
    for path in extract_files(path_lemonde_diplomatique)
    if path.endswith('-article.html')
]

In [None]:
# Le Monde: keep only the files whose name ends with "UTF8.xml"
files2 = [
    path
    for path in extract_files(path_lemonde)
    if path.endswith('UTF8.xml')
]

In [None]:
def read_html_file(file_path):
    """Parse a UTF-8 HTML file with BeautifulSoup's 'html.parser'.

    Returns:
        The parsed soup, or None when the file cannot be decoded as UTF-8
        (the offending path is printed in that case).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            markup = handle.read()
        return BeautifulSoup(markup, 'html.parser')
    except UnicodeDecodeError:
        print('read_html_file erreur : ', file_path)
        return None

In [None]:
def read_XML_file(file_path):
    """Print *file_path*, then parse the UTF-8 file with BeautifulSoup's 'xml' parser.

    Unlike ``read_html_file``, decoding errors are not caught here.
    """
    print(file_path)
    with open(file_path, 'r', encoding='utf-8') as handle:
        return BeautifulSoup(handle.read(), 'xml')

In [None]:
def get_numb(soup):
    """Map each footnote label to the text of its footnote paragraph.

    Looks at every ``<p class="spip_note">``; paragraphs that contain an
    ``<a class="spip_note">`` contribute one entry whose key is the stripped
    text of that link and whose value is the whole paragraph text with
    non-breaking spaces turned into regular spaces.
    """
    notes = {}
    for paragraph in soup.find_all('p', attrs={'class':'spip_note'}):
        anchor = paragraph.find('a', class_='spip_note')
        if anchor is None:
            continue
        label = anchor.get_text().strip()
        notes['{}'.format(label)] = paragraph.get_text().replace('\xa0',' ')
    return notes

In [None]:
def replace(replacement_dict, match):
    """Return the replacement registered for the match's first group.

    When group 1 of *match* is not a key of *replacement_dict*, the full
    matched text is returned unchanged.
    """
    return replacement_dict.get(match.group(1), match.group(0))

In [None]:
def get_spip(soup, text):
    """Inline footnotes into *text* and return the resulting list of words.

    *text* is split on whitespace. A word that starts with a reference such
    as ``(3)`` has each reference whose number is a key of ``get_numb(soup)``
    replaced by ``[<note text>]``; references with unknown numbers and all
    other words are kept as they are.

    The substitution uses a callable so the note text is inserted literally:
    passing it as a replacement *string* to ``re.sub`` would interpret any
    backslash in a note as an escape or group reference.

    Args:
        soup: Parsed article, handed to ``get_numb`` to collect the notes.
        text: Article text in which references are to be expanded.

    Returns:
        The list of (possibly expanded) words, in their original order.
    """
    replacement_dict = get_numb(soup)
    pattern = re.compile(r'\(([0-9]*)\)')

    def _inline_note(match):
        # Each reference gets its own note; unknown numbers stay untouched.
        number = match.group(1)
        if number in replacement_dict:
            return '[' + replacement_dict[number] + ']'
        return match.group(0)

    new_str = []
    for word in text.split():
        # Only words that begin with a reference are rewritten.
        if pattern.match(word):
            word = pattern.sub(_inline_note, word)
        new_str.append(word)
    return new_str

In [None]:
def extract_data(soup, file):
    """Build the record of one Le Monde diplomatique article page.

    Args:
        soup: Parsed ``*-article.html`` page.
        file: Path of that page; it must match the hard-coded
            ``F:\\Corpus\\Le_Monde_2\\W0036-02\\<a>\\<b>\\<c>-article.html``
            layout, whose three components form the record name.

    Returns:
        A dict with the keys ``name``, ``author``, ``date`` and ``text``.
        ``author`` is None when the page has no ``Authors`` meta tag. ``date``
        comes from the ``<td class="date">`` cell, or is built from the path
        components as ``<b>/<a>`` when that cell is missing.

    Raises:
        AttributeError: If the body ``<span class="corpsText">`` is missing
            or *file* does not match the expected layout.
    """
    authors_element = soup.find('meta', attrs={'name': 'Authors'})
    authors = authors_element['content'] if authors_element else None
    text = soup.find('span', attrs={'class':'corpsText'}).get_text().replace('\xa0',' ')
    text = ' '.join(get_spip(soup, text))
    pat = r'F:\\Corpus\\Le_Monde_2\\W0036-02\\([^\\]*)\\([^\\]*)\\([^\\]*)-article\.html'
    match = re.search(pat, file)
    name = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
    # Explicit check instead of a bare ``except:`` so that only the
    # "no date cell" case triggers the path-based fallback.
    date_cell = soup.find('td', attrs={'class': 'date'})
    if date_cell is not None:
        date = date_cell.get_text().replace('\n','').replace('  ','')
    else:
        date = match.group(2)+'/'+ match.group(1)
    return {
        'name': name,
        'author': authors,
        'date': date,
        'text': text,
    }

In [None]:
def extract_data_XML(soup, file):
    """Build the record of one Le Monde XML article.

    Title, authors and text are passed through ``fix_encoding``. When the
    ``SignaturesIndexees`` element is empty, the author is taken from a
    trailing ``. - (NAME)`` at the end of the text, with dots removed.

    Args:
        soup: Parsed XML article.
        file: Path of the article; the record name is built from the path
            components below ``F:\\Corpus\\Le_Monde_2\\W0015\\2007``. Paths
            with only two components get a ``0_`` prefix.

    Returns:
        A dict with the keys ``name``, ``titre``, ``categories``, ``author``,
        ``date`` (``Jour/Mois/Annee``) and ``text``.

    Raises:
        AttributeError: If an expected element is missing or *file* matches
            neither path layout.
    """
    title = fix_encoding(soup.find('Titre').get_text())
    authors = fix_encoding(soup.find('SignaturesIndexees').get_text()).strip()
    # Look the Date element up once and read its three attributes.
    date_tag = soup.find('Date')
    date = date_tag.get('Jour') + '/' + date_tag.get('Mois') + '/' + date_tag.get('Annee')
    text = fix_encoding(soup.find('Texte').get_text().strip())

    if authors == '':
        # Raw string: the pattern contains regex escapes, not string escapes.
        signature = re.search(r".*\. - \((.*)\)$", text)
        if signature:
            authors = signature.group(1).replace('.','')

    pat = r'F:\\Corpus\\Le_Monde_2\\W0015\\2007\\([^\\]*)\\([^\\]*)\\([^\\]*)\.xml'
    match = re.search(pat, file)
    if match:
        name = match.group(1)+'_'+match.group(2)+'_'+match.group(3)
    else:
        pat1 = r'F:\\Corpus\\Le_Monde_2\\W0015\\2007\\([^\\]*)\\([^\\]*)\.xml'
        match1 = re.search(pat1, file)
        name = '0_'+match1.group(1)+'_'+match1.group(2)

    categories = ", ".join([x.get_text() for x in soup.find_all('Categorie')])

    return {
        'name': name,
        'titre': title,
        'categories': categories,
        'author': authors,
        'date': date,
        'text': text,
    }

In [None]:
def create_dict(file_path, ext):
    """Read *file_path* and return its article record.

    ``ext`` selects the reader/extractor pair: ``"XML"`` uses
    ``read_XML_file`` and ``extract_data_XML``, ``"HTML"`` uses
    ``read_html_file`` and ``extract_data``. None is returned for any other
    value of ``ext`` or when the reader yields a falsy result.
    """
    if ext == "XML":
        reader, extractor = read_XML_file, extract_data_XML
    elif ext == "HTML":
        reader, extractor = read_html_file, extract_data
    else:
        return None
    soup = reader(file_path)
    return extractor(soup, file_path) if soup else None

In [None]:
def fix_encoding(input_string):
    """Replace known garbled character sequences by the intended character.

    The (garbled, intended) pairs are applied one after the other, in the
    order listed, each over the whole string, so the result of an earlier
    replacement can take part in a later one.
    NOTE(review): the sequences look like UTF-8 bytes that were decoded with
    a single-byte Latin encoding; confirm against the corpus.
    """
    replacements = (
        ('Ã©', 'é'),
        ('Ãš', 'è'),
        ('Â«', '«'),
        ('Ã\xa0', 'à'),
        ('Â»', '»'),
        ('Ã®', 'î'),
        ('Ãª', 'ê'),
        ('Ã¯', 'ï'),
        ('Ã¢', 'â'),
        ('Ã§', 'ç'),
        ('Ã¹', 'ù'),
        ('ÃŽ', 'ô'),
        ('Ã»', 'û'),
        ('Â\x80', ' '),
        ('Ã«', 'ë'),
        ('Ã\x89', 'É'),
        ('Ã\x80', 'À'),
        ('Ã\x94', 'Ô'),
        ('Â\x8c', 'Œ'),
        ('Ã¶', 'ö'),
        ('Ã\x87', 'Ç'),
        (r'\\\'', '\''),
        ('Ã\x88', 'È'),
        ('Ã\x8a', 'Ê'),
        ('ÃŒ', 'ü'),
        ('Ã\x8e', 'Î'),
        ('Â°', '°'),
    )

    cleaned = input_string
    for garbled, intended in replacements:
        cleaned = cleaned.replace(garbled, intended)
    return cleaned

In [None]:
def index2article(index):
    """Collect the records of every article linked from an index page.

    The links are the ``href`` of each ``<a class="devtextesommaire">`` in
    the index; each one replaces ``index.html`` in the index path (with ``/``
    turned into ``\\``) to locate the article file.

    The records are gathered in a list local to this call. Appending to the
    module-level ``list_dict`` would make every call return the same growing
    list, duplicating rows once the per-index results are flattened.

    Args:
        index: Path of the index page; article paths must match the
            hard-coded ``F:\\Corpus\\Le_Monde_2\\W0036-02\\2004\\...`` layout.

    Returns:
        A list of dicts with the keys ``name``, ``categorie``, ``author``,
        ``date`` and ``text``. The list is empty when the index cannot be
        read; articles that cannot be read are skipped.

    Raises:
        AttributeError: If a readable article lacks an expected element or
            its path does not match the expected layout.
    """
    soup = read_html_file(index)
    if soup is None:
        # read_html_file returns None on a decoding error.
        return []
    articles_link = [x.get('href') for x in soup.find_all('a', attrs={'class': 'devtextesommaire'})]
    news = [index.replace('index.html','{}'.format(x)).replace('/','\\') for x in articles_link]
    pat = r'F:\\Corpus\\Le_Monde_2\\W0036-02\\(2004)\\([^\\]*)\\([^\\]*)\\([^\\]*)'
    articles = []
    for x in news:
        soup1 = read_html_file(x)
        if soup1 is None:
            continue
        authors_element = soup1.find('meta', attrs={'name': 'Authors'})
        authors = authors_element['content'] if authors_element else None
        text = soup1.find('p', attrs={'class':'spip'}).get_text().replace('\xa0',' ')
        text = ' '.join(get_spip(soup1, text))
        match = re.search(pat, x)
        name = match.group(1)+'_'+match.group(2)+'_'+match.group(3)+'_'+match.group(4)
        date = soup1.find('meta', attrs={'name': 'Date'}).get('content')
        rubrique = soup1.find('meta', attrs={'name': 'Rubrique'}).get('content')
        articles.append({
            'name': name,
            'categorie': rubrique,
            'author': authors,
            'date': date,
            'text': text,
        })
    return articles

# Analyse + création d'un df et d'un fichier csv

In [None]:
# Le Monde diplomatique (all years except 2004)
list_dict = [create_dict(x,'HTML') for x in files]
# create_dict returns None for unreadable files; drop those entries.
new = [x for x in list_dict if x]
df = pd.DataFrame(new)
# Raw string so the backslashes of the Windows path stay literal.
df.to_csv(r'F:\Corpus\Le_Monde_2\lemondediplomatique_06_06.csv', sep='\t', encoding='utf-8')

In [None]:
# Le Monde diplomatique, 2004 (articles are reached through index pages)
path2004 = r'F:\Corpus\Le_Monde_2\W0036-02\2004'
files2004 = extract_files(path2004)
files2004 = [x for x in files2004 if x.endswith('.html')]
listlist = [index2article(x) for x in files2004]

# One flat list of records out of the per-file lists
flatten_data = [item for sublist in listlist for item in sublist]
# Convert to dataframe
df2004 = pd.DataFrame(flatten_data, columns=['name', 'categorie', 'author', 'date', 'text'])

# Raw string so the backslashes of the Windows path stay literal.
df2004.to_csv(r'F:\Corpus\Le_Monde_2\lemondediplomatique2004_06_07.csv', sep='\t', encoding='utf-8')

In [None]:
# Le Monde 2007 (XML articles)
list_dict2 = [create_dict(x,'XML') for x in files2]
df2 = pd.DataFrame(list_dict2)
# Raw string so the backslashes of the Windows path stay literal.
df2.to_csv(r'F:\Corpus\Le_Monde_2\lemonde2007_06_05.csv', sep='\t', encoding='utf-8')

In [None]:
# Le Monde 2012: path of the plain-text export file
lemonde2012 = r'F:\Corpus\Le_Monde_2\W0015\2012\LeMonde_20120101-20121231.txt'

In [None]:
def read_file_line_by_line(file_path, encoding=None):
    """Split a text export into articles delimited by the copyright line.

    Lines are stripped and accumulated until a line equal to the copyright
    notice is met; the accumulated lines are then stored under the next
    string index ("0", "1", ...). Separator lines that follow an empty
    accumulation are ignored, and lines found after the last separator are
    not returned.

    Args:
        file_path: Path of the text file to read.
        encoding: Encoding used to decode the file. The default (None) keeps
            the platform default; pass it explicitly (e.g. ``'utf-8'``) when
            the file's encoding differs, otherwise the separator, which
            contains non-ASCII characters, may never match.

    Returns:
        A dict mapping string indices to the list of stripped lines of each
        article (blank lines are kept as empty strings).
    """
    separator = '© SA Le Monde - CEDROM-SNi inc. 2013. Tous droits réservés.'
    dict1 = {}
    ensemble = []
    with open(file_path, 'r', encoding=encoding) as file:
        for line in file:
            stripped = line.strip()
            if stripped != separator:
                ensemble.append(stripped)
            elif ensemble:
                # Keys are consecutive, so the next index is the current size.
                dict1[str(len(dict1))] = ensemble
                ensemble = []
    return dict1

In [None]:
# Split the 2012 export into per-article lists of lines, keyed "0", "1", ...
dict2012 = read_file_line_by_line(lemonde2012)

In [232]:
def get_sub_lists(data):
    """Group the non-empty items of *data*, cutting at every third blank.

    Empty strings are never kept. Each time three empty strings have been
    seen in a row, the current group is closed (even if it is empty) and the
    blank counter restarts. The group still open at the end is always
    appended, so the result has at least one element.
    """
    groups = []
    current = []
    blank_run = 0

    for entry in data:
        if entry != '':
            current.append(entry)
            blank_run = 0
            continue
        blank_run += 1
        if blank_run == 3:
            groups.append(current)
            current = []
            blank_run = 0

    # Append the last group
    groups.append(current)
    return groups

def supprimer_debut_texte(chaine):
    """Drop everything up to and including the first dateline keyword.

    The keyword is the first whitespace-separated word equal, ignoring case,
    to 'correspondant' or 'correspondance'. The remaining words are returned
    joined by single spaces; without a keyword *chaine* is returned as is.
    """
    mots = chaine.split()
    position = next(
        (
            rang
            for rang, mot in enumerate(mots)
            if mot.lower() in ('correspondant', 'correspondance')
        ),
        None,
    )
    if position is None:
        return chaine
    return ' '.join(mots[position + 1:])

In [233]:
def supprimer_debut_texte(chaine):
    """Drop the dateline at the start of an article body.

    The text is split on whitespace and scanned for the first marker, which
    is either a single word ('correspondant', 'correspondance') or the
    two-word phrase 'envoyé spécial', compared case-insensitively. Everything
    up to and including the whole marker is removed and the remaining words
    are joined by single spaces.

    The compound marker is stored in lower case because it is compared with
    lower-cased words, and both of its words are removed when it matches.

    Args:
        chaine: Article text, possibly starting with a dateline.

    Returns:
        The text after the first marker, or *chaine* unchanged when no
        marker is present.
    """
    mots = chaine.split()
    mots_cles = {'correspondant', 'correspondance'}
    mots_cles_composes = {'envoyé spécial'}

    for i, mot in enumerate(mots):
        courant = mot.lower()
        if courant in mots_cles:
            return ' '.join(mots[i + 1:])
        if i < len(mots) - 1:
            mot_compose = courant + ' ' + mots[i + 1].lower()
            if mot_compose in mots_cles_composes:
                # Skip both words of the compound marker.
                return ' '.join(mots[i + 2:])
    return chaine

In [257]:
def extract(data):
    """Turn the lines of one 2012 article into a record.

    The lines are grouped with ``get_sub_lists`` and the groups are read by
    position: group 1 holds the journal name and the date line, group 2 the
    optional category followed by the title, group 3 onwards the body, and
    the last-but-one group the ``key: value`` metadata lines.

    When group 3 occurs a second time among the groups it is taken as the
    author; the body is then the groups between the two occurrences, with
    its dateline removed by ``supprimer_debut_texte``. Otherwise the author
    is empty and the body is groups 3 to -3 concatenated without separator.

    Returns:
        A dict with ``journal``, ``date``, ``titre``, ``categorie``,
        ``auteur`` and ``texte``, plus one entry per metadata line.
    """
    blocs = get_sub_lists(data)
    signature = blocs[3]
    if check_value_occurrences(blocs, signature, 'N'):
        auteur = signature
        fin = int(check_value_occurrences(blocs, signature, 'Y'))
        lignes = [ligne for bloc in blocs[4:fin] for ligne in bloc]
        text = supprimer_debut_texte(' '.join(lignes))
    else:
        auteur = ''
        text = ''.join(''.join(bloc) for bloc in blocs[3:-3])

    date_match = re.search(
        r'([0-9]* [a-z-9áàâäãåçéèêëíìîïñóòôöõúùûüýÿæœ]* [0-9]{4}), .*',
        blocs[1][1],
    )

    entete = blocs[2]
    if len(entete) > 1:
        categorie, titre = ''.join(entete[0]), ' '.join(entete[1:])
    else:
        categorie, titre = '', ' '.join(entete)

    article = {
        'journal': ''.join(blocs[1][0]),
        'date': date_match.group(1),
        'titre': titre,
        'categorie': categorie,
        'auteur': ''.join(auteur),
        'texte': text,
    }

    # Metadata lines: text before the last ':' is the key, the rest the value.
    for ligne in blocs[-2]:
        champ = re.search(r'(.*):(.*)', ligne)
        article['{}'.format(champ.group(1).strip())] = champ.group(2).strip()
    return article

In [258]:
def check_value_occurrences(lst_of_lists, value, response):
    """Report on the second occurrence of *value* in *lst_of_lists*.

    With ``response == 'Y'`` the index of the second element equal to
    *value* is returned (None when there are fewer than two). With any other
    ``response`` the result is a boolean telling whether such a second
    occurrence exists.
    """
    positions = [
        rang for rang, candidate in enumerate(lst_of_lists) if candidate == value
    ]
    second = positions[1] if len(positions) >= 2 else None
    if response == 'Y':
        return second
    return second is not None

In [259]:
# One record per article; the string indices of dict2012 are not needed.
list_dict2012 = [extract(lines) for lines in dict2012.values()]

In [261]:
# Bare expression: displays the extracted records when run as a notebook cell.
list_dict2012

[{'journal': 'Le Monde',
  'date': '31 décembre 2012',
  'titre': "Afghanistan : l'impossible cocorico",
  'categorie': 'ÉDITORIAL',
  'auteur': '',
  'texte': "Faute de jouer dans la cour des grands sur le terrain militaire en Afghanistan, la France rêvait sans doute d'exister sur la scène diplomatique en devenant un acteur de la paix afghane après le retrait des troupes de l'OTAN fin 2014.Depuis la fin 2011, sous couvert de rencontres interafghanes discrètes à Chantilly, la France a jeté les bases d'une véritable négociation de paix entre l'insurrection talibane et les forces politiques de ce pays. L'habileté a consisté à ne pas inquiéter les poids lourds diplomatiques qui pèsent réellement sur l'avenir de l'Afghanistan.L'absence de publicité et le profil bas adopté par le gouvernement français, qui s'abrite derrière une fondation pour organiser en sous-main ces rencontres, ont permis de tenir à distance le Pakistan, les Etats-Unis et l'Inde. La présidence afghane a été conviée à ce 

In [260]:
# One row per article; columns come from the keys of the record dicts.
df2012 = pd.DataFrame(list_dict2012)

In [262]:
# Raw string so the backslashes of the Windows path stay literal.
df2012.to_csv(r'F:\Corpus\Le_Monde_2\lemonde2012_06_07.csv', sep='\t', encoding='utf-8')