# Imports

In [None]:
import re 
import pandas as pd
import os
from bs4 import BeautifulSoup
import codecs

# Chemins des fichiers

In [None]:
# Root folders of the two corpora.
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\C" or "\L".
path_lemonde_diplomatique = r'F:\Corpus\Le_Monde_2\W0036-02'
path_lemonde = r'F:\Corpus\Le_Monde_2\W0015'

In [None]:
def convert_encoding(input_file, output_file):
    """Re-encode a text file from ISO-8859-15 to UTF-8.

    The content of ``input_file`` is decoded as ISO-8859-15 and written to
    ``output_file`` encoded as UTF-8.
    """
    # The source is read completely (and closed) before the destination is
    # opened for writing.
    with codecs.open(input_file, 'r', encoding='ISO-8859-15') as source:
        decoded = source.read()

    with open(output_file, 'w', encoding='utf-8') as target:
        target.write(decoded)

In [None]:
def extract_files(folder_path):
    """Return the path of every file found below ``folder_path``.

    The folder is walked recursively with ``os.walk``; each path is the
    visited directory joined with the file name.
    """
    return [
        os.path.join(current_dir, filename)
        for current_dir, _subdirs, filenames in os.walk(folder_path)
        for filename in filenames
    ]

# Extraction de tous les fichiers + garder ceux avec les extensions qui nous intéressent

In [None]:
# Le Monde diplomatique: keep only the article pages
files = [
    path
    for path in extract_files(path_lemonde_diplomatique)
    if path.endswith('-article.html')
]

In [None]:
# Le Monde: keep only the files whose name ends with 'UTF8.xml'
files2 = [
    path
    for path in extract_files(path_lemonde)
    if path.endswith('UTF8.xml')
]

In [None]:
def read_html_file(file_path):
    """Parse an HTML file read as UTF-8.

    Returns:
        The ``BeautifulSoup`` tree built with the 'html.parser' backend, or
        ``None`` (after printing the path) when the file cannot be decoded
        as UTF-8.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            return BeautifulSoup(handle.read(), 'html.parser')
    except UnicodeDecodeError:
        print('read_html_file erreur : ', file_path)
        return None

In [None]:
def read_XML_file(file_path):
    """Parse an XML file read as UTF-8 and return its ``BeautifulSoup`` tree.

    The path is printed before the file is opened.
    """
    print(file_path)
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'xml')

In [None]:
def get_numb(soup):
    """Map the anchor text of each footnote to the text of the note.

    Every ``<p class="spip_note">`` element of ``soup`` that contains an
    ``<a class="spip_note">`` anchor contributes one entry: the key is the
    stripped text of the anchor, the value is the full text of the note with
    non-breaking spaces turned into regular spaces.

    Returns:
        dict: anchor text -> note text (empty when no note has an anchor).
    """
    numbers = {}
    for spip_note in soup.find_all('p', attrs={'class': 'spip_note'}):
        a_tag = spip_note.find('a', class_='spip_note')
        if a_tag is not None:
            number = a_tag.get_text().strip()
            numbers[number] = spip_note.get_text().replace('\xa0', ' ')
    return numbers

In [None]:
def replace(replacement_dict, match):
    """Return the replacement registered for the first group of ``match``.

    When the captured group is not a key of ``replacement_dict`` the whole
    matched text is returned unchanged.
    """
    captured = match.group(1)
    if captured not in replacement_dict:
        return match.group(0)
    return replacement_dict[captured]

In [None]:
def get_spip(soup, text):
    """Replace "(n)" footnote markers of ``text`` by the note itself.

    ``text`` is split on whitespace. A word that starts with a "(digits)"
    marker whose number is a key of the mapping returned by ``get_numb`` has
    every marker it contains replaced by "[<note text>]"; all other words are
    kept as they are.

    Args:
        soup: parsed page handed to ``get_numb`` to collect the notes.
        text: the article text.

    Returns:
        list[str]: the (possibly rewritten) words, in their original order.
    """
    # get_numb may yield nothing usable; treat that as "no notes".
    replacement_dict = get_numb(soup) or {}
    pattern = re.compile(r'\(([0-9]*)\)')
    new_str = []
    for word in text.split():
        match = pattern.match(word)
        if match and match.group(1) in replacement_dict:
            replacement = '[' + replacement_dict[match.group(1)] + ']'
            # A callable replacement is inserted verbatim, so backslashes in
            # the note text are not interpreted as template escapes.
            word = pattern.sub(lambda _m, repl=replacement: repl, word)
        new_str.append(word)
    return new_str

In [None]:
def extract_data(soup, file):
    """Build the record of one article page of the HTML corpus.

    Args:
        soup: parsed article page (see ``read_html_file``).
        file: path of the page. It must contain three path components below
            the hard-coded W0036-02 corpus folder, the last one ending with
            '-article.html'; they are used for the record name and for the
            fallback date.

    Returns:
        dict: keys 'name', 'author', 'mots_clefs', 'date' and 'text'.

    Raises:
        AttributeError: when the page has no ``<span class="corpsText">`` or
            when ``file`` does not match the expected path layout.
    """
    authors_element = soup.find('meta', attrs={'name': 'Authors'})
    authors = authors_element['content'] if authors_element else None

    keywords_element = soup.find('meta', attrs={'name': 'Keywords'})
    keywords = keywords_element.get('content') if keywords_element is not None else ''

    text = soup.find('span', attrs={'class': 'corpsText'}).get_text().replace('\xa0', ' ')
    all_notes = ' '.join(
        note.get_text().strip()
        for note in soup.find_all('p', attrs={'class': 'spip_note'})
    )
    # NOTE(review): the notes are appended right after the body, without a
    # separator.
    text = text + all_notes

    pat = r'F:\\Corpus\\Le_Monde_2\\W0036-02\\([^\\]*)\\([^\\]*)\\([^\\]*)-article\.html'
    match = re.search(pat, file)
    name = match.group(1) + '_' + match.group(2) + '_' + match.group(3)

    # Prefer the date printed on the page; fall back to the two folder names.
    date_cell = soup.find('td', attrs={'class': 'date'})
    if date_cell is not None:
        date = date_cell.get_text().replace('\n', '').replace('  ', '')
    else:
        date = match.group(2) + '/' + match.group(1)

    return {
        'name': name,
        'author': authors,
        'mots_clefs': keywords,
        'date': date,
        'text': clean(fix_encoding(text)).strip(),
    }

In [None]:
def extract_data_XML(soup, file):
    """Build the record of one article of the XML corpus.

    Args:
        soup: parsed XML document (see ``read_XML_file``); the elements
            ``Titre``, ``SignaturesIndexees``, ``Date`` and ``Texte`` are read,
            as well as every ``Categorie``.
        file: path of the XML file. The two or three path components below
            the hard-coded W0015/2007 folder make up the record name.

    Returns:
        dict: keys 'name', 'titre', 'categories', 'author', 'date', 'text'.

    Raises:
        AttributeError: when a required element is missing or when ``file``
            matches neither expected path layout.
        TypeError: when the ``Date`` element lacks one of its attributes.
    """
    title = fix_encoding(soup.find('Titre').get_text())
    authors = fix_encoding(soup.find('SignaturesIndexees').get_text()).strip()

    date_element = soup.find('Date')
    date = '/'.join(date_element.get(part) for part in ('Jour', 'Mois', 'Annee'))

    text = fix_encoding(soup.find('Texte').get_text().strip())

    if authors == '':
        # No indexed signature: use a trailing ". - (Name)" in the text, if any.
        match2 = re.search(r'.*\. - \((.*)\)$', text)
        if match2:
            authors = match2.group(1).replace('.', '')

    pat = r'F:\\Corpus\\Le_Monde_2\\W0015\\2007\\([^\\]*)\\([^\\]*)\\([^\\]*)\.xml'
    match = re.search(pat, file)
    if match:
        name = match.group(1) + '_' + match.group(2) + '_' + match.group(3)
    else:
        # One folder level less: the missing first component becomes '0'.
        pat1 = r'F:\\Corpus\\Le_Monde_2\\W0015\\2007\\([^\\]*)\\([^\\]*)\.xml'
        match1 = re.search(pat1, file)
        name = '0_' + match1.group(1) + '_' + match1.group(2)

    categories = ", ".join(x.get_text() for x in soup.find_all('Categorie'))

    return {
        'name': name,
        'titre': title,
        'categories': categories,
        'author': authors,
        'date': date,
        # fix_encoding is applied a second time here, as before.
        'text': clean(fix_encoding(text)).strip(),
    }

In [None]:
def create_dict(file_path, ext):
    """Read one corpus file and turn it into a record.

    Args:
        file_path: path of the file to read.
        ext: "XML" or "HTML"; selects the reader and the extractor.

    Returns:
        The dict built by ``extract_data_XML`` / ``extract_data``, or ``None``
        when the reader returned nothing or ``ext`` is neither value.
    """
    if ext not in ("XML", "HTML"):
        return None

    if ext == "XML":
        soup = read_XML_file(file_path)
        return extract_data_XML(soup, file_path) if soup else None

    soup = read_html_file(file_path)
    return extract_data(soup, file_path) if soup else None

In [None]:
def clean(text):
    """Strip footnote markers, URLs, quotation marks and markup from ``text``.

    The steps run in a fixed order because some of them are anchored to the
    start or the end of the string.

    Args:
        text: the text to clean.

    Returns:
        str: the cleaned text.
    """
    # Footnote markers such as "(12)" (an empty "()" matches too)
    text = re.sub(r'\(\d*\)', '', text)
    # A double quote opening the text
    text = re.sub(r'^"', '', text)
    # URLs: scheme://host[/path...]
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    # Guillemets are removed before the end-anchored signature check below,
    # so a closing guillemet after the signature does not prevent the match.
    text = text.replace('«', '').replace('»', '')
    # Trailing signature of the form " - (Xxx.)"
    text = re.sub(r'\s-\s\([^\.]*\.\)$', '', text)
    # Remaining quotation marks, line breaks, asterisks and underscores
    for unwanted in ('"', '“', '”', '\n', '*', '_'):
        text = text.replace(unwanted, '')
    # NOTE(review): a spaced dash becomes ' ,' (space then comma) - confirm
    # that ', ' was not intended.
    text = text.replace(' – ', ' ,')
    # Soft hyphens
    text = text.replace('\xad', '')
    return text

In [None]:
# Two-character mojibake sequences (plus one escaped-apostrophe sequence) and
# the text each one is replaced with. They are applied in this order.
_MOJIBAKE_SEQUENCES = (
    ('Ã©', 'é'),
    ('Ãš', 'è'),
    ('Â«', '«'),
    ('Ã\xa0', 'à'),
    ('Ã\x97', 'à'),
    ('Â\x92', "'"),
    ('Â»', '»'),
    ('Ã®', 'î'),
    ('Ã\x9b', 'Û'),
    ('Ãª', 'ê'),
    ('Ã\x82', 'Â'),
    ('Ã\xad', 'í'),
    ('Â\x96', ','),
    ('Ã¯', 'ï'),
    ('Ã¢', 'â'),
    ('Ã§', 'ç'),
    ('Ã¹', 'ù'),
    ('Ã½', 'ý'),
    ('Ã¼', 'ü'),
    ('Ã»', 'û'),
    ('ÃŽ', 'ô'),
    ('Â\x80', ' '),
    ('Ã«', 'ë'),
    ('Ã\x89', 'É'),
    ('Ã\x80', 'À'),
    ('ÃŸ', 'ß'),
    ('Ã\x94', 'Ô'),
    ('Â\x8c', 'Œ'),
    ('Ã¶', 'ö'),
    ('Ã\x87', 'Ç'),
    (r'\\\'', '\''),
    ('Ã\x88', 'È'),
    ('Ã\x8a', 'Ê'),
    ('ÃŒ', 'ü'),
    ('Ã\x8e', 'Î'),
    ('Â°', '°'),
    ('Ã\x8f', 'Ï'),
    ('Ã±', 'ñ'),
    ('Ã³', 'ó'),
    ('Ã´', 'ô'),
)

# Stray single characters. They are handled only after every sequence above:
# replacing '\x94' or '\x8c' first would break the sequences that contain
# them ('Ã\x94', 'Â\x8c') before those could be recognised.
_STRAY_CHARACTERS = (
    ('\x9c', 'œ'),
    ('\x8c', 'œ'),
    ('\x93', ''),
    ('\x94', ''),
    ('\x1a', ''),
    ('\xa0', ''),
)


def fix_encoding(input_string):
    """Repair wrongly decoded characters in ``input_string``.

    Each known sequence is replaced by the character it stands for, then the
    remaining stray characters are replaced or removed (non-breaking spaces
    included).

    Args:
        input_string: the text to repair.

    Returns:
        str: the repaired text.
    """
    for table in (_MOJIBAKE_SEQUENCES, _STRAY_CHARACTERS):
        for encoded_char, correct_char in table:
            input_string = input_string.replace(encoded_char, correct_char)

    return input_string

In [None]:
def index2article(index):
    """Collect the records of one issue from its ``index.html`` page.

    Two kinds of records are produced:

    * front-page summaries: the ``<p class="spip">`` paragraphs of the
      ``<div class="diplozizi">`` block are read in pairs, the first one
      giving the name and the second one the text;
    * articles: every ``<a class="devtextesommaire">`` link is resolved
      relative to the index page and the linked page is parsed.

    Args:
        index: path of the ``index.html`` page.

    Returns:
        list[dict]: the summary records followed by the article records.
        The list is empty when the index page cannot be decoded; article
        pages that cannot be decoded are skipped.

    Raises:
        AttributeError: when an article page lacks an expected element or
            its path does not match the expected layout.
    """
    soup = read_html_file(index)
    if soup is None:
        # read_html_file has already reported the file.
        return []

    list_dict = []

    # The two numeric folders right above index.html, in reverse order.
    # Both path separators are accepted; a path that does not match is kept
    # unchanged as the date.
    summary_date = re.sub(
        r'F:[\\/]Corpus[\\/]Le_Monde_2[\\/]W0036-02[\\/](\d*)[\\/](\d*)[\\/]index\.html',
        r'\2-\1',
        index,
    )

    resume = soup.find('div', attrs={'class': 'diplozizi'})
    paragraphs = resume.find_all('p', attrs={'class': 'spip'}) if resume is not None else []
    # Paragraphs come in (name, text) pairs; a trailing unpaired one is ignored.
    for heading, body in zip(paragraphs[0::2], paragraphs[1::2]):
        list_dict.append({
            'name': heading.get_text(),
            'text': body.get_text().replace('&nbsp', ' '),
            'categorie': 'Resumé 1ère page',
            'date': summary_date,
        })

    articles_link = [
        link.get('href')
        for link in soup.find_all('a', attrs={'class': 'devtextesommaire'})
    ]
    news = [
        index.replace('index.html', '{}'.format(href)).replace('/', '\\')
        for href in articles_link
    ]

    pat = r'F:\\Corpus\\Le_Monde_2\\W0036-02\\(2004)\\([^\\]*)\\([^\\]*)\\([^\\]*)'
    for article_path in news:
        soup1 = read_html_file(article_path)
        if soup1 is None:
            continue

        authors_element = soup1.find('meta', attrs={'name': 'Authors'})
        authors = authors_element['content'] if authors_element else None

        # Only the first <p class="spip"> is used as the body.
        text = soup1.find('p', attrs={'class': 'spip'}).get_text().replace('\xa0', ' ')
        all_notes = ' '.join(
            note.get_text().strip()
            for note in soup1.find_all('p', attrs={'class': 'spip_note'})
        )
        text = text + all_notes

        match = re.search(pat, article_path)
        name = '_'.join(match.group(1, 2, 3, 4))

        date = soup1.find('meta', attrs={'name': 'Date'}).get('content')
        rubrique = soup1.find('meta', attrs={'name': 'Rubrique'}).get('content')
        keywords_element = soup1.find('meta', attrs={'name': 'Keywords'})
        themes = keywords_element.get('content') if keywords_element is not None else ''

        list_dict.append({
            'name': name,
            'categorie': rubrique,
            'mots_clefs': themes,
            'author': authors,
            'date': date,
            'text': clean(fix_encoding(text)),
        })
    return list_dict

# Analyse + création d'un df et d'un fichier csv

In [None]:
# Le Monde XML files (files2): build one record per file, drop the empty
# results and load the rest into a DataFrame.
# NOTE(review): the previous comment mentioned Le Monde diplomatique, but the
# code below processes files2 with the XML extractor.
list_dict = [create_dict(x,'XML') for x in files2]
new = [x for x in list_dict if x]
df = pd.DataFrame(new)

In [None]:
import codecs

# Check that the exported TSV decodes as UTF-8 from the first to the last line.
# The file is opened in a "with" block so that the handle is always closed.
try:
    with codecs.open(r"F:\Corpus\finaux\Karo\lemonde2007_3col.tsv", encoding='utf-8', errors='strict') as f:
        for line in f:
            pass
    print("Valid utf-8")
except UnicodeDecodeError:
    print("invalid utf-8")

In [None]:
# Full table, tab-separated (raw string: the backslashes of the path are literal)
df.to_csv(r'F:\Corpus\finaux\lemonde2007.tsv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Three-column export: name, author and text, renamed to nom / auteur / text_clean
lemonde2007= df[['name','author','text']]
lemonde2007 = lemonde2007.rename(columns={'name': 'nom', 'author': 'auteur', 'text':'text_clean'})
lemonde2007.to_csv(r'F:\Corpus\finaux\Karo\lemonde2007_3col.tsv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Le Monde diplomatique 2004: one list of records per issue.
# Only the index.html pages are handed to index2article, which derives the
# article paths from the index path; the article pages themselves are reached
# through the links found on each index page.
path2004 = r'F:\Corpus\Le_Monde_2\W0036-02\2004'
files2004 = extract_files(path2004)
files2004 = [x for x in files2004 if x.endswith('index.html')]
listlist = [index2article(x) for x in files2004]

In [None]:
# Merge the per-index lists of records into a single flat list
flatten_data = [item for sublist in listlist for item in sublist]
# Convert to a DataFrame restricted to the listed columns
df2004 = pd.DataFrame(flatten_data, columns=['name', 'categorie', 'author', 'date', 'text'])

In [None]:
# Full 2004 table; the content is tab-separated although the file is named .csv
df2004.to_csv(r'F:\Corpus\finaux\lemonde2004.csv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Three-column export: name, author and text, renamed to nom / auteur / text_clean
df2004_2 = df2004[['name','author','text']]
df2004_2 = df2004_2.rename(columns={'name': 'nom', 'author': 'auteur', 'text':'text_clean'})
df2004_2.to_csv(r'F:\Corpus\finaux\Karo\lemonde2004_3col.tsv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Le Monde 2007: one record per XML file, written as a tab-separated table.
# NOTE(review): create_dict can return None and such entries are not filtered
# out here before the DataFrame is built.
list_dict2 = [create_dict(x,'XML') for x in files2]
df2 = pd.DataFrame(list_dict2)
df2.to_csv(r'F:\Corpus\finaux\lemonde2007.tsv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Le Monde 2012: path of the single text export
lemonde2012 = 'F:\\Corpus\\Le_Monde_2\\W0015\\2012\\LeMonde_20120101-20121231.txt'

In [None]:
def read_file_line_by_line(file_path):
    """Split a text export into groups of lines, one group per copyright notice.

    Lines are stripped and accumulated until a line equal to the copyright
    notice is met; the accumulated group (if not empty) is then stored under
    the next counter value, as a string key starting at '0'. Lines that come
    after the last notice are not returned.

    Returns:
        dict[str, list[str]]: the groups, keyed by their rank.
    """
    notice = '© SA Le Monde - CEDROM-SNi inc. 2013. Tous droits réservés.'
    groups = {}
    pending = []
    with open(file_path, 'r') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if stripped != notice:
                pending.append(stripped)
                continue
            if pending:
                # Keys are consecutive, so the next key is the current size.
                groups[str(len(groups))] = pending
                pending = []

    return groups

In [None]:
# Groups of lines of the 2012 export, keyed by their rank
dict2012 = read_file_line_by_line(lemonde2012)

In [None]:
def get_sub_lists(data):
    """Split ``data`` into sections separated by three consecutive empty items.

    Empty items are never kept. A run of fewer than three of them does not
    start a new section; every third consecutive empty item closes the
    current section (possibly an empty one). The section still open at the
    end is always returned, even when empty.

    Returns:
        list[list]: the sections, in order.
    """
    sections = [[]]
    blank_run = 0

    for entry in data:
        if entry == '':
            blank_run += 1
            if blank_run == 3:
                sections.append([])
                blank_run = 0
        else:
            sections[-1].append(entry)
            blank_run = 0

    return sections

In [None]:
def supprimer_debut_texte(chaine):
    """Drop everything up to and including the first by-line keyword.

    The keywords are the words 'correspondant' and 'correspondance' and the
    two-word expression 'envoyé spécial', all compared case-insensitively.
    The words that follow the first keyword found are joined with single
    spaces.

    Args:
        chaine: the text to trim.

    Returns:
        str: the text after the keyword, or ``chaine`` unchanged when no
        keyword is present.
    """
    mots = chaine.split()
    mots_cles = {'correspondant', 'correspondance'}
    # Stored in lower case because the words are lowered before comparison.
    mots_cles_composes = {'envoyé spécial'}

    for i, mot in enumerate(mots):
        courant = mot.lower()
        if courant in mots_cles:
            return ' '.join(mots[i + 1:])
        if i < len(mots) - 1 and courant + ' ' + mots[i + 1].lower() in mots_cles_composes:
            # Skip both words of the expression.
            return ' '.join(mots[i + 2:])
    return chaine

In [None]:
def extract(data):
    """Turn the lines of one article of the text export into a record.

    ``data`` is split into sections with ``get_sub_lists``. Section 1 holds
    the journal name and the line carrying the date, section 2 the optional
    category followed by the title, and the last-but-one section the
    "key: value" metadata lines, each of which becomes an entry of the record.

    Args:
        data: list of stripped lines of one article.

    Returns:
        dict: keys 'journal', 'date', 'titre', 'categorie', 'auteur', 'texte'
        plus one key per metadata line. 'date' is empty when no date is
        recognised.

    Raises:
        IndexError: when ``data`` has fewer sections than expected.
    """
    sub_lists = get_sub_lists(data)

    # When the 4th section occurs a second time further down, it is taken as
    # the author and the text is what lies between the two occurrences.
    next_ = check_value_occurrences(sub_lists, sub_lists[3], 'Y')
    if next_ is not None:
        auteur = sub_lists[3]
        body_lines = [item for sublist in sub_lists[4:next_] for item in sublist]
        text = supprimer_debut_texte(' '.join(body_lines))
    else:
        auteur = ''
        # Lines are joined with a space so that words of consecutive lines
        # are not glued together.
        text = ' '.join(' '.join(sublist) for sublist in sub_lists[3:-3])

    pat = r'([0-9]* [a-z-9áàâäãåçéèêëíìîïñóòôöõúùûüýÿæœ]* [0-9]{4}), .*'
    match = re.search(pat, sub_lists[1][1])

    if len(sub_lists[2]) > 1:
        categorie = sub_lists[2][0]
        titre = ' '.join(sub_lists[2][1:])
    else:
        categorie = ''
        titre = ' '.join(sub_lists[2])

    dict_ = {
        'journal': sub_lists[1][0],
        # One article with an unrecognised date must not abort the whole run.
        'date': match.group(1) if match else '',
        'titre': titre,
        'categorie': categorie,
        'auteur': ' '.join(auteur),
        'texte': clean(fix_encoding(text)).strip(),
    }

    for sous_liste in sub_lists[-2]:
        match2 = re.search(r'(.*):(.*)', sous_liste)
        if match2 is None:
            # Not a "key: value" line.
            continue
        dict_[match2.group(1).strip()] = match2.group(2).strip()
    return dict_

In [None]:
def check_value_occurrences(lst_of_lists, value, response):
    """Look for the second item of ``lst_of_lists`` equal to ``value``.

    Args:
        lst_of_lists: the sequence to scan.
        value: the item searched for.
        response: 'Y' to get the position, anything else to get a boolean.

    Returns:
        With ``response == 'Y'``: the index of the second occurrence, or
        ``None`` when there are fewer than two. Otherwise: whether a second
        occurrence exists.
    """
    hits = (
        position
        for position, candidate in enumerate(lst_of_lists)
        if candidate == value
    )
    next(hits, None)  # first occurrence, not needed
    second = next(hits, None)

    if response == 'Y':
        return second
    return second is not None

In [None]:
# One record per group of lines of the 2012 export
list_dict2012 = [extract(lines) for lines in dict2012.values()]

In [None]:
# Load the 2012 records into a DataFrame
df2012 = pd.DataFrame(list_dict2012)

In [None]:
# Select and order the columns to export
df2012 = df2012[['journal', 'date', 'titre', 'categorie', 'auteur', 'texte', 'Section','Taille', 'Type d\'article']]

In [None]:
# Show the table as the output of the notebook cell
df2012

In [None]:
# Full 2012 table, tab-separated
df2012.to_csv(r'F:\Corpus\finaux\lemonde2012.tsv', sep='\t', encoding='utf-8', index=False)

In [None]:
# Three-column export: the title is used as the name (nom) and the text
# column is renamed to text_clean; 'auteur' keeps its name
df2012_2 = df2012[['titre','auteur','texte']]
df2012_2 = df2012_2.rename(columns={'titre': 'nom', 'texte':'text_clean'})
df2012_2.to_csv(r'F:\Corpus\finaux\Karo\lemonde2012_3col.tsv', sep='\t', encoding='utf-8', index=False)