# Création des fichiers sur lesquels je vais travailler et nettoyage de ces fichiers

## Création de fichiers par décennie

Je vais créer un fichier par décennie, chaque décennie allant de l'année en 0 à l'année en 9.

Exceptions = le premier fichier comprendra plus d'une décennie puisque le corpus commence en 1847, 
et le dernier moins d'une décennie puisqu'il se termine en 1978 (pas de décennie ronde) 

- 1ère décennie (appelée fird pour first decade) = 1847 à 1859
- 2ème = 1860 à 1869
- 3ème = 1870 à 1879
- ...
- 13ème = 1970 à 1978

### Import

In [1]:
import os
import yake

### Lister les fichiers dans 'data/txt/' + vérification du nombre de fichiers

In [2]:
# Raw corpus folder: list its entries, then display how many there are.
data_path = "../data/txt/"
files = os.listdir(data_path)
len(files)

2829

### Création d'un dossier dans 'data' qui comprendra tous les fichiers de décennie

In [3]:
# Create the folder for the per-decade files; exist_ok makes the cell
# safe to re-run and avoids the check-then-create race.
os.makedirs('../data/decade', exist_ok=True)

### Création des fichiers de chaque décennie 

#### Commençons par la première décennie

In [4]:
# First, concatenate the files from 1847 to 1849 (`>` creates or overwrites the target)
!cat ../data/txt/Bxl_184*.txt > ../data/decade/firdall.txt

In [5]:
# Check the size of the file: number of lines, words and bytes.
!wc ../data/decade/firdall.txt

 108825  978391 4712296 ../data/decade/firdall.txt


In [6]:
# Append the files for the years 1850 to 1859.
# NOTE: `>>` appends, so running this cell a second time adds the 1850s again.
!cat ../data/txt/Bxl_185*.txt >> ../data/decade/firdall.txt

In [7]:
# Check the size again to confirm that nothing was overwritten
!wc ../data/decade/firdall.txt

  688939  5565852 25151452 ../data/decade/firdall.txt


#### Même chose pour toutes les autres décennies

In [8]:
# One output file per decade; `>` (re)creates each target from the yearly
# files whose name matches the decade prefix.
!cat ../data/txt/Bxl_186*.txt > ../data/decade/secdall.txt
!cat ../data/txt/Bxl_187*.txt > ../data/decade/thidall.txt
!cat ../data/txt/Bxl_188*.txt > ../data/decade/foudall.txt
!cat ../data/txt/Bxl_189*.txt > ../data/decade/fifdall.txt
!cat ../data/txt/Bxl_190*.txt > ../data/decade/sixdall.txt
!cat ../data/txt/Bxl_191*.txt > ../data/decade/sevdall.txt
!cat ../data/txt/Bxl_192*.txt > ../data/decade/eigdall.txt
!cat ../data/txt/Bxl_193*.txt > ../data/decade/nindall.txt
!cat ../data/txt/Bxl_194*.txt > ../data/decade/tendall.txt
!cat ../data/txt/Bxl_195*.txt > ../data/decade/eledall.txt
!cat ../data/txt/Bxl_196*.txt > ../data/decade/twedall.txt
!cat ../data/txt/Bxl_197*.txt > ../data/decade/thtdall.txt

## Nettoyage de chaque fichier

### Imports

In [4]:
from collections import Counter
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from IPython.display import Image
import shutil

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Création d'une liste de 'mots vides' (= stopwords)

In [5]:
# Start from NLTK's French stopwords, then add corpus-specific words that
# carry no topical information in these council bulletins.
sw = set(stopwords.words("french"))
sw.update([
    "ville", "faire", "commune", "conseil", "est", "une", "pourra", "point",
    "messieurs", "demande", "directeur", "un", "serait", "le", "la", "les", "il",
    "question", "aussi", "deux", "plus", "dit", "ai", "mais", "une", "on", "cas", "moins",
    "sous", "tout", "cette", "fait", "être", "voir", "vue", "vu", "très", "peut", "quelques",
    "cela", "déjà", "celui", "avoir", "elles", "suite", "contre", "hui", "ceux", "nouveau",
    "leurs", "chaque", "alors", "après", "celle", "donc", "van", "toute", "tous", "encore",
    "mot", "sans", "avant", "entre", "fait", "dont", "cet", "jusqu", "dire", "autres", "etc",
    "faites", "faut", "ainsi", "doit", "trois", "peu", "rien", "comme",
])

### Lister les fichiers 

In [6]:
# List the per-decade files
data_path = "../data/decade/"
files = os.listdir(data_path)

### Nettoyage des fichiers

#### Commençons par le premier

In [9]:
# Pick a file (the bare name below just displays the chosen path)
first_file = "../data/decade/firdall.txt"
first_file

'../data/decade/firdall.txt'

In [10]:
# Read the text of the file. The context manager closes the handle, and the
# explicit encoding matches the one used when the cleaned files are written.
with open(first_file, encoding='utf-8') as f:
    text = f.read()
text[:500]

"V I L L E DE\n\nBRUXELLES\n\nbulletin ires 8éanas\nDl!\n\nCONSEIL\n\nCOMMUNAL\n\nANNÉE\n\n1847.\n\n\x0cAU\n\n\x0cVILLE DE B R U X E L L E S .\n\nbulletin\n\nCONSEIL\n\nàes\n\nSéances\n\nCOMMUNAL.\n\nANNÉE\n\n1847.\n\nBRUXELLES,\nIMPRIMERIE\n\nD E J . H. B R I A R D ,\n\nRITE N E U V E , 3 1 , FAUBOURG DE N A M U R ,\n\n1 84 8\n\n\x0cDE!\n\nDU CONSEI\nDîBÏ\n\nE. - Communication\nconclusions de la section des\ndu nouvel hospice pour les av\n\nEnraisonde l'absence &\nmaladie.le Conseil ajourne\nleurs de pierre el marchai\ncles des taxes communale'\nbieniàance e"

In [14]:
# Préparer la demande de nettoyage (avec les précisions de ce qu'on veut qu'il garde)
def clean_text(first_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        first_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(first_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [15]:
clean_text(first_file, folder=data_path)

'Output has been written in ../data/decade//firdall_clean.txt!'

In [16]:
# Check the result. The cleaned file was written as UTF-8, so read it back
# with the same encoding rather than the platform default.
with open(os.path.join(data_path, 'firdall_clean.txt'), encoding='utf-8') as f:
    after = f.read()

after[:500]

'bruxelles bulletin ires communal année bulletin àes séances communal année bruxelles imprimerie rite faubourg consei dîbï communication conclusions section nouvel hospice enraisonde absence maladie ajourne pierre marchai cles taxes communale bieniàance eldeseiànv donne communie mandant gnant envoi etat obligatoire secrétariat dtput proposition dan donné lecture glissement marc royales rue fai phonnenr terrains rèumsderb combinaison devoir dow ans marcs iraocs mètres espourvica lém compte rendu s'

#### Faire pareil avec les 12 autres fichiers

In [13]:
# Paths of the remaining decade files, second (1860s) to thirteenth (1970s)
sec_file = "../data/decade/secdall.txt"
thi_file = "../data/decade/thidall.txt"
fou_file = "../data/decade/foudall.txt"
fif_file = "../data/decade/fifdall.txt"
six_file = "../data/decade/sixdall.txt"
sev_file = "../data/decade/sevdall.txt"
eig_file = "../data/decade/eigdall.txt"
nin_file = "../data/decade/nindall.txt"
ten_file = "../data/decade/tendall.txt"
ele_file = "../data/decade/eledall.txt"
twe_file = "../data/decade/twedall.txt"
tht_file = "../data/decade/thtdall.txt"

In [18]:
def clean_text(sec_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        sec_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(sec_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [19]:
clean_text(sec_file, folder=data_path)

'Output has been written in ../data/decade//secdall_clean.txt!'

In [20]:
def clean_text(thi_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        thi_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(thi_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [21]:
clean_text(thi_file, folder=data_path)

'Output has been written in ../data/decade//thidall_clean.txt!'

In [22]:
def clean_text(fou_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        fou_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(fou_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [23]:
clean_text(fou_file, folder=data_path)

'Output has been written in ../data/decade//foudall_clean.txt!'

In [24]:
def clean_text(fif_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        fif_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(fif_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [25]:
clean_text(fif_file, folder=data_path)

'Output has been written in ../data/decade//fifdall_clean.txt!'

In [26]:
def clean_text(six_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        six_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(six_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [27]:
clean_text(six_file, folder=data_path)

'Output has been written in ../data/decade//sixdall_clean.txt!'

In [28]:
def clean_text(sev_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        sev_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(sev_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [29]:
clean_text(sev_file, folder=data_path)

'Output has been written in ../data/decade//sevdall_clean.txt!'

In [11]:
def clean_text(eig_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        eig_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(eig_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [15]:
clean_text(eig_file, folder=data_path)

'Output has been written in ../data/decade//eigdall_clean.txt!'

In [32]:
def clean_text(nin_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        nin_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(nin_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [33]:
clean_text(nin_file, folder=data_path)

'Output has been written in ../data/decade//nindall_clean.txt!'

In [34]:
def clean_text(ten_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        ten_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(ten_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [35]:
clean_text(ten_file, folder=data_path)

'Output has been written in ../data/decade//tendall_clean.txt!'

In [36]:
def clean_text(ele_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        ele_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(ele_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [37]:
clean_text(ele_file, folder=data_path)

'Output has been written in ../data/decade//eledall_clean.txt!'

In [38]:
def clean_text(twe_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        twe_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(twe_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [39]:
clean_text(twe_file, folder=data_path)

'Output has been written in ../data/decade//twedall_clean.txt!'

In [40]:
def clean_text(tht_file, folder=None):
    """Reduce a decade file to lowercase content words and save the result.

    The text is tokenized with ``nltk.wordpunct_tokenize``. A token is kept
    only if it is longer than two characters, purely alphabetic and, once
    lowercased, absent from the module-level stopword set ``sw``. The kept
    tokens are written, space-separated, to ``<name>_clean<ext>``.

    Args:
        tht_file: Path or file name of the text to clean. Only its base
            name is used; the directory comes from ``folder``.
        folder: Directory holding the input file and receiving the output.
            When ``None``, both names are relative to the working directory.

    Returns:
        A confirmation message naming the output path.
    """
    # Derive both names from the argument instead of hard-coding one decade,
    # so this single function can clean any of the files.
    file_name = os.path.basename(tht_file)
    stem, ext = os.path.splitext(file_name)
    clean_name = f"{stem}_clean{ext}"
    if folder is None:
        input_path = file_name
        output_path = clean_name
    else:
        input_path = f"{folder}/{file_name}"
        output_path = f"{folder}/{clean_name}"
    with open(input_path, encoding='utf-8') as f:
        text = f.read()
    words = nltk.wordpunct_tokenize(text)
    kept = [w.lower() for w in words if len(w) > 2 and w.isalpha() and w.lower() not in sw]
    # Closing the output through `with` guarantees the text is flushed to disk.
    with open(output_path, "w", encoding='utf-8') as output:
        output.write(" ".join(kept))
    return f'Output has been written in {output_path}!'

In [41]:
clean_text(tht_file, folder=data_path)

'Output has been written in ../data/decade//thtdall_clean.txt!'

### Rangement dans le dossier 

Création d'un dossier 'nonclean' dans 'data', dans lequel on déplacera tous les fichiers non nettoyés qu'on n'utilisera plus pour le reste du travail

In [23]:
# Create the folder if it does not exist yet; exist_ok makes the cell safe
# to re-run and avoids the check-then-create race.
os.makedirs('../data/nonclean', exist_ok=True)

In [24]:
# Move every raw (uncleaned) decade file into the 'data/nonclean' folder.
# The paths are generated from the thirteen decade prefixes, in order.
files = [
    f'../data/decade/{prefix}dall.txt'
    for prefix in ('fir', 'sec', 'thi', 'fou', 'fif', 'six', 'sev',
                   'eig', 'nin', 'ten', 'ele', 'twe', 'tht')
]

for file in files:
    shutil.move(file, '../data/nonclean')

In [25]:
len (files)

13

In [26]:
print (files)

['../data/decade/firdall.txt', '../data/decade/secdall.txt', '../data/decade/thidall.txt', '../data/decade/foudall.txt', '../data/decade/fifdall.txt', '../data/decade/sixdall.txt', '../data/decade/sevdall.txt', '../data/decade/eigdall.txt', '../data/decade/nindall.txt', '../data/decade/tendall.txt', '../data/decade/eledall.txt', '../data/decade/twedall.txt', '../data/decade/thtdall.txt']
