In [2]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
from tqdm import tqdm
from datetime import datetime


**Load the newspapers**

In [3]:
base_path = r'\\ent1files.epfl.ch\ElyseeMuseum\PROJECT_DATA\BCU_v2\RawData_newspaper\LLE_temp'
data = []

nb_news_tot = 0
nb_news = 0

def parse_alto_xml(file_path):
    text = ""
    tree = ET.parse(file_path)
    root = tree.getroot()
    ns = {'alto': 'http://schema.ccs-gmbh.com/ALTO'}
    for text_block in root.findall('.//alto:TextBlock', ns):
        for text_line in text_block.findall('.//alto:TextLine', ns):
            for string in text_line.findall('.//alto:String', ns):
                text += string.attrib['CONTENT'] + ' '
            text += '\n'
    return text

# Iterate over directories and use tqdm to show the progress.
for newspaper_dir in tqdm(next(os.walk(base_path))[1], desc="Processing Newspapers"):
    alto_dir = os.path.join(base_path, newspaper_dir, "ALTO")
    newspaper_text = ""
    nb_news_tot += 1
    if nb_news == 5: 
        break
    if os.path.exists(alto_dir):
        nb_news += 1
        alto_files = os.listdir(alto_dir)
        # Wrap the list of ALTO files with tqdm to show progress for each newspaper directory.
        for alto_file in tqdm(alto_files, desc=f"Processing {newspaper_dir}"):
            if alto_file.endswith(".xml"):
                alto_file_path = os.path.join(alto_dir, alto_file)
                newspaper_text += parse_alto_xml(alto_file_path) + "\n\n"
        data.append({"newspaper_name": newspaper_dir, "content": newspaper_text})
    else:
        print(f"ALTO directory not found for {newspaper_dir}")

newspapers = pd.DataFrame(data)


Processing 19571025_01: 100%|██████████| 24/24 [00:01<00:00, 12.70it/s]
Processing 19330526_01: 100%|██████████| 12/12 [00:00<00:00, 19.04it/s]t]
Processing 19330527_01: 100%|██████████| 12/12 [00:00<00:00, 17.19it/s]  
Processing 19330529_01: 100%|██████████| 10/10 [00:00<00:00, 17.12it/s]
Processing 19330530_01: 100%|██████████| 10/10 [00:00<00:00, 39.12it/s]
Processing Newspapers:   0%|          | 5/2102 [00:04<29:54,  1.17it/s]


In [None]:
newspapers.to_csv(r'\Users\kenji\OneDrive\Documents\EPFL\Master Project\newspaper_df.csv', index=False)


In [None]:
newspapers = pd.read_csv(r'\Users\kenji\OneDrive\Documents\EPFL\Master Project\newspaper_df.csv')


In [5]:
newspapers.head(5)

Unnamed: 0,newspaper_name,content
0,19571025_01,Nouvelles du iour \nTournee decisive en France...
1,19330526_01,R 1 : DACTIONET \nADMINISTRATION \nas . Avenue...
2,19330527_01,NOUVELLESDUJOUR \nM . Rooseveltaprononcéladéva...
3,19330529_01,NOUVELLESDUJOUR \nLescontribuablesparisiensenI...
4,19330530_01,· SQ . l 1 ver \n· le \n· la \n\n\n· Altesse \...


**Functions to Process newspapers**

In [6]:
from nltk.tokenize import word_tokenize
import re

# Function to parse and format the date
def parse_date(name):
    date_str = name.split('_')[0]  # Extracts the date part (AAAAMMDD)
    date_obj = datetime.strptime(date_str, '%Y%m%d')  # Parses the date
    return date_obj.strftime('%Y-%m-%d')  # Formats the date as YYYY-MM-DD




# Function to remove numbers, punctuation, and tokenize
def preprocess_and_tokenize(text):
    # Remove numbers
    text_no_numbers = re.sub(r'\d+', '', text)
    # Remove punctuation
    text_no_punctuation = re.sub(r'[^\w\s]', '', text_no_numbers)
    # Tokenize
    tokens = word_tokenize(text_no_punctuation)
    return tokens


#Function to check how many of those words appears at least once in the newspaper
def fribourg_score(content):
    to_check = ["Fribourg", "fribourg", "fribourgeois", "fribourgeoise"]
    score = 0
    for word in to_check:
        if word in content:
            score += 1
    return score



In [7]:
newspapers['date'] = newspapers['newspaper_name'].apply(parse_date)
newspapers['tokenized_content'] = newspapers['content'].apply(preprocess_and_tokenize)
newspapers["num"] = newspapers["newspaper_name"]
newspapers = newspapers[["num", "date", "content", "tokenized_content"]]
newspapers["length"] = newspapers["tokenized_content"].apply(len)
newspapers["score"] = newspapers.tokenized_content.apply(fribourg_score)

In [8]:
newspapers.head()

Unnamed: 0,num,date,content,tokenized_content,length,score
0,19571025_01,1957-10-25,Nouvelles du iour \nTournee decisive en France...,"[Nouvelles, du, iour, Tournee, decisive, en, F...",31859,3
1,19330526_01,1933-05-26,R 1 : DACTIONET \nADMINISTRATION \nas . Avenue...,"[R, DACTIONET, ADMINISTRATION, as, Avenue, deP...",10708,2
2,19330527_01,1933-05-27,NOUVELLESDUJOUR \nM . Rooseveltaprononcéladéva...,"[NOUVELLESDUJOUR, M, Rooseveltaprononcéladéval...",11347,3
3,19330529_01,1933-05-29,NOUVELLESDUJOUR \nLescontribuablesparisiensenI...,"[NOUVELLESDUJOUR, LescontribuablesparisiensenI...",8147,1
4,19330530_01,1933-05-30,· SQ . l 1 ver \n· le \n· la \n\n\n· Altesse \...,"[SQ, l, ver, le, la, Altesse, lieu, choeur, dh...",969,2


**Part to select passages that contains words related to fribourg**

In [48]:
import textwrap

def print_list(passage, width=80):
    wrapper = textwrap.TextWrapper(width=width)
    
    formatted_passage = ' '.join(passage)
    wrapped_passage = wrapper.fill(text=formatted_passage)
    print(wrapped_passage)
    print("\n---\n")

def print_text(text, width=80):
    wrapper = textwrap.TextWrapper(width=width)
    wrapped_passage = wrapper.fill(text=text)
    print(wrapped_passage)
    print("\n---\n")

def cut_content(content, cut_size):
    """
    Selects only the relevant parts of the content that contains specific words related to Fribourg.
    Args:
    - content: tokenized list of words of the newspaper.
    - cut_size: the number of words to cut from around the specific word.

    Returns:
    - cutted: a list of cut parts from the newspapers containing around the specific word.
    """
    to_check = ["Fribourg", "fribourg", "fribourgeois", "fribourgeoise"]
    cutted = []

    max_size = len(content)
    to_cut = int(cut_size / 2)
    last_cut_end = 0  # Track the end position of the last cut

    for i, word in enumerate(content):
        if word in to_check:
            start_pos = max(0, i - to_cut)
            end_pos = min(max_size, i + to_cut)

            # If the start position is within the range of the last cut, adjust it to prevent overlap
            if start_pos < last_cut_end:
                # Adjust start_pos to last_cut_end and end_pos accordingly, ensuring it doesn't exceed max_size
                start_pos = last_cut_end
                end_pos = min(max_size, start_pos + cut_size)

            # Update the last cut's end position
            last_cut_end = end_pos

            # Cut around the word, making sure we don't exceed the size of content
            temp_cut = content[start_pos:end_pos]
            if len(temp_cut)>0:
                cutted.append(temp_cut)

    return cutted


In [10]:
interesting_cut = newspapers.tokenized_content.apply(cut_content, args = (100,))

In [55]:
for passage in interesting_cut[1]:
    print_list(passage)

tIlon eccestastìque aitetc eattuetaccepte es quatre organisateurs ontchoisi
lefutur évêquedelEgliseprotestante unifiéedAlle CestlepasteurFrédéric vonBodel
hwi h di BlfId Wt descwmg qUi Ingea leee en es l d unegraneinstitution deblenfal
fd d d n eelar s np ele ou sce tames demaaes impoents etdedevoyessontducantonde
hébergés M vonBodelschwingh estâgé SIXans ANNONCES PUBLICITASS A Fribourg
RuedeBlimont Téléphone PRIX DESANNONCES Lemillimètresurunecolonne Canton Ysct
SuisseIOct Etranger t ct Réel ct CHINEETJAPON Tokio mai Leporteparole
duministère delaguerre démentlesinformations annonçant laconclu
siondunarmisticesinejaponais Nankin mai Leporteparole dugouvernement commen
tantlanouvelleducompromis sinejaponais dl dans lenord aecaréque commePékinet f
TienTsinétalentmenaces l aallucalmerla population maiscelàne
gouvernementaconcluouseproposedecon apocheclureuncompromisavecleJapon Tokio mai
Laccordslnojaponaisdevaitêtresignéhier Ilauraituncaractèrepréliminaire On
sattendàcequilaboutisseàlaco

In [23]:
avg_nb_passage = int(sum(len(interesting_cut[i]) for i in range (len(interesting_cut)))/len(interesting_cut))
nb_tokens = int(sum(len(interesting_cut[i][j]) for i in range (len(interesting_cut)) for j in range(len(interesting_cut[i]))))
print(f"Nb newspaper having interesting word: {len(interesting_cut)} ~ {(len(interesting_cut)/nb_news)*100}%")
print(f"Average number of interesting passage per newspaper: {avg_nb_passage}")
print(f"Total number of tokens retains: {nb_tokens}")
print(f"Average number of tokens retains per newspaper: {int(nb_tokens/len(interesting_cut))}")


Nb newspaper having interesting word: 5 ~ 100.0%
Average number of interesting passage per newspaper: 22
Total number of tokens retains: 10887
Average number of tokens retains per newspaper: 2177


**Save passages**

In [14]:
import pickle


# Save with pickle
with open('interesting_cut.pkl', 'wb') as f:
    pickle.dump(interesting_cut, f)

with open('interesting_cut.pkl', 'rb') as f:
    loaded_interesting_cut = pickle.load(f)


**OPENAI text correction**

In [34]:
import openai
from openai import OpenAI


In [35]:
client = OpenAI(api_key='YOUR_PRIVATE_KEY')


In [50]:
# La liste des mots ou phrases à nettoyer et réécrire
list_to_process = loaded_interesting_cut[1][0] # premier passage du second journal


# Convertir la liste en une chaîne de caractères formatée pour la requête
text_to_process = " ".join(list_to_process)

print(len(list_to_process))
print_text(text_to_process)


100
tIlon eccestastìque aitetc eattuetaccepte es quatre organisateurs ontchoisi
lefutur évêquedelEgliseprotestante unifiéedAlle CestlepasteurFrédéric vonBodel
hwi h di BlfId Wt descwmg qUi Ingea leee en es l d unegraneinstitution deblenfal
fd d d n eelar s np ele ou sce tames demaaes impoents etdedevoyessontducantonde
hébergés M vonBodelschwingh estâgé SIXans ANNONCES PUBLICITASS A Fribourg
RuedeBlimont Téléphone PRIX DESANNONCES Lemillimètresurunecolonne Canton Ysct
SuisseIOct Etranger t ct Réel ct CHINEETJAPON Tokio mai Leporteparole
duministère delaguerre démentlesinformations annonçant laconclu
siondunarmisticesinejaponais Nankin mai Leporteparole dugouvernement commen
tantlanouvelleducompromis sinejaponais dl dans lenord aecaréque commePékinet f
TienTsinétalentmenaces l aallucalmerla population maiscelàne
gouvernementaconcluouseproposedecon apocheclureuncompromisavecleJapon Tokio mai
Laccordslnojaponaisdevaitêtresignéhier Ilauraituncaractèrepréliminaire On
sattendàcequilaboutisseà

In [52]:
response = client.chat.completions.create(
  model="gpt-3.5-turbo",
  messages=[
        {
            "role": "system",
            "content": "Vous êtes un assistant très intelligent chargé de corriger et de réécrire les textes pour les rendre cohérents et grammaticalement corrects"
        },
        {
            "role": "user",
            "content": f"Corrigez et réécrivez ce texte en francais en utilisant tout les mots dans l'ordre: {text_to_process}"
        }

    ],
  temperature=1,
  max_tokens=300,
  top_p=1,
  frequency_penalty=0,
  presence_penalty=0
)



In [53]:
print("Before processing:\n")
print_text(text_to_process)
print("After processing:\n")
print_text(response.choices[0].message.content)


Before processing:

tIlon eccestastìque aitetc eattuetaccepte es quatre organisateurs ontchoisi
lefutur évêquedelEgliseprotestante unifiéedAlle CestlepasteurFrédéric vonBodel
hwi h di BlfId Wt descwmg qUi Ingea leee en es l d unegraneinstitution deblenfal
fd d d n eelar s np ele ou sce tames demaaes impoents etdedevoyessontducantonde
hébergés M vonBodelschwingh estâgé SIXans ANNONCES PUBLICITASS A Fribourg
RuedeBlimont Téléphone PRIX DESANNONCES Lemillimètresurunecolonne Canton Ysct
SuisseIOct Etranger t ct Réel ct CHINEETJAPON Tokio mai Leporteparole
duministère delaguerre démentlesinformations annonçant laconclu
siondunarmisticesinejaponais Nankin mai Leporteparole dugouvernement commen
tantlanouvelleducompromis sinejaponais dl dans lenord aecaréque commePékinet f
TienTsinétalentmenaces l aallucalmerla population maiscelàne
gouvernementaconcluouseproposedecon apocheclureuncompromisavecleJapon Tokio mai
Laccordslnojaponaisdevaitêtresignéhier Ilauraituncaractèrepréliminaire On
sattendà