# BALO

In [14]:
import os
import re
import zipfile
import tarfile
import pandas as pd
from datasets import Dataset, load_dataset

### Extract archive

.taz files are generally badly handled by this script, so many folders have to be decompressed by hand with appropriate software.

In [1]:
# Function to extract compressed folders and subfolders recursively
def extract_folders_recursively(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            
            # Check if the file is a ZIP archive
            if zipfile.is_zipfile(file_path):
                with zipfile.ZipFile(file_path, 'r') as zip_ref:
                    zip_ref.extractall(root)
                os.remove(file_path)  # Optional: Remove the original ZIP file
                
            elif file_path.endswith('.tar.gz') or file_path.endswith('.tgz') or file_path.endswith('.taz') or file_path.endswith('.tar') :
                try :
                    with tarfile.open(file_path, 'r:gz') as tar_ref:
                        tar_ref.extractall(root)
                    os.remove(file_path)  # Optional: Remove the original TAR.GZ file
                except : print(f"Error : {file_path}")

In [2]:
extract_folders_recursively(r"/Users/nboizard/Downloads/dataset_collection")

Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2016.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2014.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2015.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2011.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2005.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BALO/FluxHistorique/old/BALO_2010.tar
Error : /Users/nboizard/Downloads/dataset_collection/french/dila_opendata/data/echanges.dila.gouv.fr/OPENDATA/BA

---

### Create pandas dataset with all text files

In [1]:
data = []

root_folder = r"/Users/nboizard/Downloads/dataset_collection"

def count_words(text):
    words = text.split()
    return len(words)

for parent_folder, subfolders, files in os.walk(root_folder):
    for file in files:
        if file.endswith(".txt"):
            file_path = os.path.join(parent_folder, file)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
                data.append((file, text, file[0:4], count_words(text)))

df = pd.DataFrame(data, columns=["file_name", "text", "year", "number_words"])

df.to_csv("balo.csv", index=False)
df.head()

             file_name                                               text  \
0  202307282303438.txt   BANQUE POPULAIRE AQUITAINE CENTRE ATLANTIQUE ...   
1  202306282303013.txt   CAISSE DE CREDIT MUNICIPAL DE NICE Etablissem...   
2  202304172300881.txt   CGG  Société anonyme au capital de  7 123  81...   
3  202305122301586.txt   Biophytis Société anonyme Au capital de  3.17...   
4  202305172301636.txt   STREAMWIDE Société  a nonyme  a u capital de ...   

   year  number_words  
0  2023           287  
1  2023            83  
2  2023          2538  
3  2023          3467  
4  2023          9509  


---

### HuggingFace Dataset

In [9]:
df = pd.read_csv("balo.csv")
df = df.dropna()

In [10]:
def clean_text(text):
    text = text.strip()
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"'\s+", "'", text)
    return text

df['text'] = df['text'].apply(clean_text)

In [17]:
df.columns

Index(['file_name', 'text', 'year', 'number_words'], dtype='object')

In [19]:
dataset = Dataset.from_pandas(df)
dataset = dataset.remove_columns("__index_level_0__")
dataset.push_to_hub("Nicolas-BZRD/BALO_fr_gouv")

Creating parquet from Arrow format: 100%|██████████| 13/13 [00:03<00:00,  3.80ba/s]
Creating parquet from Arrow format: 100%|██████████| 13/13 [00:03<00:00,  3.45ba/s]3s/it]
Pushing dataset shards to the dataset hub: 100%|██████████| 2/2 [04:44<00:00, 142.46s/it]
Deleting unused files from dataset repository: 100%|██████████| 2/2 [00:00<00:00,  3.59it/s]
Downloading metadata: 100%|██████████| 452/452 [00:00<00:00, 736kB/s]
