# BALO

In [None]:
import os
import re
import zipfile
import tarfile
import pandas as pd
from datasets import Dataset, load_dataset

### Extract archive

`.taz` archives are often not extracted correctly by the script below, so many of these folders have to be decompressed by hand with appropriate software.

In [None]:
# Function to extract compressed folders and subfolders recursively
def extract_folders_recursively(folder_path):
    """Extract every ZIP/TAR archive found under ``folder_path`` in place.

    Each archive is unpacked into the directory that contains it and the
    archive file is deleted afterwards. The tree is walked repeatedly until
    a pass extracts nothing, so archives that were themselves packed inside
    other archives are unpacked as well.

    ZIP archives are detected by content; TAR archives by extension
    (``.tar.gz``, ``.tgz``, ``.taz``, ``.tar``). TAR archives that cannot be
    processed are reported on stdout and left on disk.

    Args:
        folder_path: Root directory to scan.
    """
    tar_suffixes = ('.tar.gz', '.tgz', '.taz', '.tar')
    # Archives that already failed: report them once and never retry them,
    # otherwise every additional pass would print the same error again.
    failed = set()

    extracted_any = True
    while extracted_any:
        extracted_any = False
        # os.walk lists a directory before we extract into it, so folders
        # created by an extraction are only seen on the next pass.
        for root, _dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                if file_path in failed:
                    continue

                # Check if the file is a ZIP archive
                if zipfile.is_zipfile(file_path):
                    with zipfile.ZipFile(file_path, 'r') as zip_ref:
                        zip_ref.extractall(root)
                    os.remove(file_path)  # Remove the original ZIP file
                    extracted_any = True

                elif file_path.endswith(tar_suffixes):
                    try:
                        # 'r:*' lets tarfile detect the compression itself,
                        # so plain .tar (and bz2/xz) archives are handled
                        # too instead of failing under a forced gzip mode.
                        # NOTE(review): extractall trusts member paths; for
                        # untrusted archives consider an extraction filter
                        # (available on recent Python versions).
                        with tarfile.open(file_path, 'r:*') as tar_ref:
                            tar_ref.extractall(root)
                        os.remove(file_path)  # Remove the original TAR file
                    except Exception:
                        # Best effort: keep going with the other archives,
                        # but let KeyboardInterrupt/SystemExit propagate.
                        failed.add(file_path)
                        print(f"Error : {file_path}")
                    else:
                        extracted_any = True

In [None]:
# Unpack every archive found below the local download folder.
archive_root = r"/Users/nboizard/Downloads/dataset_collection"
extract_folders_recursively(archive_root)

---

### Create pandas dataset with all text files

In [None]:
# Directory tree that is scanned for .txt files.
root_folder = r"/Users/nboizard/Downloads/dataset_collection"

# Accumulates one (file_name, text, year, number_words) tuple per text file.
data = list()

def count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())

# Collect every .txt file below root_folder as one row of the dataset.
for dirpath, _dirnames, filenames in os.walk(root_folder):
    txt_names = (name for name in filenames if name.endswith(".txt"))
    for name in txt_names:
        with open(os.path.join(dirpath, name), "r", encoding="utf-8") as handle:
            content = handle.read()
        # The first four characters of the file name are stored as the year.
        data.append((name, content, name[:4], count_words(content)))

# Column order matches the tuples appended above.
column_names = ["file_name", "text", "year", "number_words"]
df = pd.DataFrame(data, columns=column_names)

# Persist the table so the next section can start from the CSV.
df.to_csv("balo.csv", index=False)
df.head()

---

### HuggingFace Dataset

In [None]:
# Reload the saved table and discard rows containing any missing value.
df = pd.read_csv("balo.csv").dropna()

In [None]:
def clean_text(text):
    """Normalise the whitespace of ``text``.

    Leading and trailing whitespace is removed, every run of whitespace is
    collapsed to a single space, and whitespace directly following an
    apostrophe is dropped.
    """
    collapsed = re.sub(r'\s+', ' ', text.strip())
    return re.sub(r"'\s+", "'", collapsed)

# Normalise the whitespace of every document before publishing.
cleaned_text = df['text'].apply(clean_text)
df['text'] = cleaned_text

In [None]:
# preserve_index=False keeps the pandas index out of the dataset entirely.
# Relying on a later remove_columns("__index_level_0__") is fragile: that
# column is only created when the index is no longer a plain RangeIndex
# (i.e. when dropna actually removed rows); otherwise the removal raises.
dataset = Dataset.from_pandas(df, preserve_index=False)

# Upload the dataset to the Hugging Face Hub repository.
dataset.push_to_hub("Nicolas-BZRD/BALO_opendata")