Travail sur l'évolution de l'apparition du terme Tramway et Tram dans le corpus afin d'établir une potentielle évolution de l'importance de ce moyen de transport à Bruxelles

PS : Parfois les chiffres ne se suivent pas parce que j'ai recommencé certaines fonctions mais pas celles qui demandaient trop de travail à la machine (sinon elle s'arrêtait et recommancait à 0) 

# TP4 : Récupération des PDFs et transformation en TXT

## Récupération des PDFs

### Imports

In [1]:
import os
import re
import time
import requests

### Créer un répertoire (s'il n'existe pas déjà) pour stocker tous les fichiers

In [2]:
pdf_path = '../data/pdf'
# Créer le dossier s'il n'existe pas
if not os.path.exists(pdf_path):
    os.mkdir(pdf_path)

### Récupérer les URLs des pdf de tous les bulletins communaux

In [3]:
root_url = "https://archives.bruxelles.be/bulletins/date"
resp = requests.get(root_url)
print(f"Status: {resp.status_code}")
print(f"Encoding: {resp.encoding}")
html = resp.text
print(f"Text length: {len(html)}")

pattern = r"https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/.*\.pdf"
urls = re.findall(pattern, html)
print(f"{len(urls)} PDF files found")

Status: 200
Encoding: utf-8
Text length: 821446
2833 PDF files found


In [4]:
#impression de la première URL de 1869 (début du tramway à Bruxelles)
print(urls[198])

https://archief.brussel.be/Colossus/BulletinsCommunaux/Bulletins/Documents/Bxl_1869_Tome_I1_Part_1.pdf


### Télécharger tous les PDFs

In [5]:
start_offset = 0
end_offset = len(urls)
for url in urls:
    filename = url.split("/")[-1]
    if not os.path.exists(os.path.join(pdf_path, filename)):
        print(f"Downloading {filename}...")
        start_time = time.time()
        response = requests.get(url)
        print(f"   done in {(time.time() - start_time):.1f} seconds")
        output_file = open(os.path.join(pdf_path, filename), "wb")
        output_file.write(response.content)
    else:
        print(f"{filename} already downloaded")
print("Done")

Bxl_1847_Tome_I1_Part_1.pdf already downloaded
Bxl_1847_Tome_I1_Part_2.pdf already downloaded
Bxl_1847_Tome_I1_Part_3.pdf already downloaded
Bxl_1847_Tome_I1_Part_4.pdf already downloaded
Bxl_1847_Tome_I1_Part_5.pdf already downloaded
Bxl_1848_Tome_I1_Part_1.pdf already downloaded
Bxl_1848_Tome_I1_Part_2.pdf already downloaded
Bxl_1848_Tome_I1_Part_3.pdf already downloaded
Bxl_1849_Tome_I1_Part_1.pdf already downloaded
Bxl_1849_Tome_I1_Part_2.pdf already downloaded
Bxl_1849_Tome_I1_Part_3.pdf already downloaded
Bxl_1849_Tome_I1_Part_4.pdf already downloaded
Bxl_1849_Tome_I1_Part_5.pdf already downloaded
Bxl_1849_Tome_II1_Part_1.pdf already downloaded
Bxl_1849_Tome_II1_Part_2.pdf already downloaded
Bxl_1849_Tome_II1_Part_3.pdf already downloaded
Bxl_1849_Tome_II1_Part_4.pdf already downloaded
Bxl_1849_Tome_II1_Part_5.pdf already downloaded
Bxl_1849_Tome_II1_Part_6.pdf already downloaded
Bxl_1849_Tome_II1_Part_7.pdf already downloaded
Bxl_1850_Tome_I1_Part_1.pdf already downloaded
Bxl_18

## Conversion de fichiers PDF en fichiers TXT

### Imports

In [6]:
import textract

### Lister les fichiers dans 'data/TPfinal/pdf'

In [7]:
pdf_path = '../data/pdf'

pdfs = []
for f in os.listdir(pdf_path):
    if os.path.isfile(os.path.join(pdf_path, f)):
        pdfs.append(f)
pdfs

['.DS_Store',
 'Bxl_1847_Tome_I1_Part_1.pdf',
 'Bxl_1847_Tome_I1_Part_2.pdf',
 'Bxl_1847_Tome_I1_Part_3.pdf',
 'Bxl_1847_Tome_I1_Part_4.pdf',
 'Bxl_1847_Tome_I1_Part_5.pdf',
 'Bxl_1848_Tome_I1_Part_1.pdf',
 'Bxl_1848_Tome_I1_Part_2.pdf',
 'Bxl_1848_Tome_I1_Part_3.pdf',
 'Bxl_1849_Tome_I1_Part_1.pdf',
 'Bxl_1849_Tome_I1_Part_2.pdf',
 'Bxl_1849_Tome_I1_Part_3.pdf',
 'Bxl_1849_Tome_I1_Part_4.pdf',
 'Bxl_1849_Tome_I1_Part_5.pdf',
 'Bxl_1849_Tome_II1_Part_1.pdf',
 'Bxl_1849_Tome_II1_Part_2.pdf',
 'Bxl_1849_Tome_II1_Part_3.pdf',
 'Bxl_1849_Tome_II1_Part_4.pdf',
 'Bxl_1849_Tome_II1_Part_5.pdf',
 'Bxl_1849_Tome_II1_Part_6.pdf',
 'Bxl_1849_Tome_II1_Part_7.pdf',
 'Bxl_1850_Tome_I1_Part_1.pdf',
 'Bxl_1850_Tome_I1_Part_2.pdf',
 'Bxl_1850_Tome_I1_Part_3.pdf',
 'Bxl_1850_Tome_I1_Part_4.pdf',
 'Bxl_1850_Tome_II1_Part_1.pdf',
 'Bxl_1850_Tome_II1_Part_2.pdf',
 'Bxl_1850_Tome_II1_Part_3.pdf',
 'Bxl_1850_Tome_II1_Part_4.pdf',
 'Bxl_1850_Tome_II1_Part_5.pdf',
 'Bxl_1850_Tome_II1_Part_6.pdf',
 'Bxl_1850_To

### Créer un répertoire txt s'il n'existe pas encore

In [8]:
txt_path = '../data/txt'
if not os.path.exists(txt_path):
    os.mkdir(txt_path)

### Transformer les PDFs en txts

In [10]:
txts = []
for f in os.listdir(txt_path):
    if os.path.isfile(os.path.join(txt_path, f)):
        txts.append(f)
txts[:10]

['Bxl_1847_Tome_I1_Part_1.txt',
 'Bxl_1847_Tome_I1_Part_2.txt',
 'Bxl_1847_Tome_I1_Part_3.txt',
 'Bxl_1847_Tome_I1_Part_4.txt',
 'Bxl_1847_Tome_I1_Part_5.txt',
 'Bxl_1848_Tome_I1_Part_1.txt',
 'Bxl_1848_Tome_I1_Part_2.txt',
 'Bxl_1848_Tome_I1_Part_3.txt',
 'Bxl_1849_Tome_I1_Part_1.txt',
 'Bxl_1849_Tome_I1_Part_2.txt']

In [11]:
#  Lister les fichiers pdf qui n'ont pas encore été convertis en txt
not_converted_pdfs = []
for pdf in pdfs:
    file_name = os.path.splitext(pdf)[0]
    if file_name + ".txt" not in txts:
        not_converted_pdfs.append(pdf)
len(not_converted_pdfs)
print(not_converted_pdfs)

['.DS_Store', 'Bxl_1959_Tome_RptAn_Part_2.pdf', 'Bxl_1959_Tome_RptAn_Part_4.pdf', 'Bxl_1961_Tome_RptAn_Part_2.pdf', 'Bxl_1961_Tome_RptAn_Part_3.pdf', 'Bxl_1961_Tome_RptAn_Part_5.pdf']


In [12]:
for pdf in not_converted_pdfs:
    print(pdf)
    try:
        text = textract.process(os.path.join(pdf_path, pdf))
        with open(os.path.join(txt_path, f'{os.path.splitext(pdf)[0]}.txt'), 'wb') as output_file:
            output_file.write(text)
    except:
        print("Error with following file:", pdf)

.DS_Store
Error with following file: .DS_Store
Bxl_1959_Tome_RptAn_Part_2.pdf
Error with following file: Bxl_1959_Tome_RptAn_Part_2.pdf
Bxl_1959_Tome_RptAn_Part_4.pdf
Error with following file: Bxl_1959_Tome_RptAn_Part_4.pdf
Bxl_1961_Tome_RptAn_Part_2.pdf
Error with following file: Bxl_1961_Tome_RptAn_Part_2.pdf
Bxl_1961_Tome_RptAn_Part_3.pdf
Error with following file: Bxl_1961_Tome_RptAn_Part_3.pdf
Bxl_1961_Tome_RptAn_Part_5.pdf
Error with following file: Bxl_1961_Tome_RptAn_Part_5.pdf
