Load Legal documents

In [1]:
from transformers import AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
print(tokenizer.all_special_tokens)

['<|endoftext|>']


In [6]:
# Add the special token
tokenizer.add_special_tokens({'sep_token': '<|sep|>'})

1

In [7]:
print(tokenizer.sep_token)

<|sep|>


In [5]:
import os
from tqdm import tqdm

corpus_text = ""

with open("dataset/corpus.txt", "w") as out_file:
    for folder in tqdm(os.listdir("dataset")):
        if folder == "corpus.zip" or folder == "corpus_tmp.txt" or folder == "corpus.txt" \
            or folder == "legislacion_boe_es" or folder == "multiun_es":
            print(f"Skipping {folder}")
        else:
            txt_path = os.path.join("dataset", folder, "output.txt")
            with open(txt_path, 'r') as f:
                txt_content = f.read()
                out_file.write(txt_content)
                # add sep token to indicate end of document
                out_file.write(tokenizer.sep_token)
                out_file.write("\n")

 17%|█▋        | 3/18 [00:05<00:29,  1.98s/it]

Skipping legislacion_boe_es


 39%|███▉      | 7/18 [00:06<00:06,  1.58it/s]

Skipping corpus.txt


 72%|███████▏  | 13/18 [00:15<00:06,  1.28s/it]

Skipping corpus_tmp.txt
Skipping corpus.zip


 89%|████████▉ | 16/18 [00:15<00:01,  1.43it/s]

Skipping multiun_es


100%|██████████| 18/18 [00:18<00:00,  1.02s/it]


In [6]:
from tqdm import tqdm

def count_lines(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return sum(1 for _ in f)

# Function to tokenize in batches
def count_tokens(file_path, num_lines, batch_size=1000):
    total_tokens = 0
    batch = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line in tqdm(f, total=num_lines, desc="Processing lines"):
            batch.append(line.strip())
            if len(batch) == batch_size:
                # Tokenize the batch and count tokens
                encoded_batch = tokenizer(batch, truncation=False, add_special_tokens=False)
                total_tokens += sum(len(tokens) for tokens in encoded_batch["input_ids"])
                batch = []

        # Process remaining lines in the batch
        if batch:
            encoded_batch = tokenizer(batch, truncation=False, add_special_tokens=False)
            total_tokens += sum(len(tokens) for tokens in encoded_batch["input_ids"])
    
    return total_tokens


Get number of tokens of each legal document

In [None]:
import os

with open("number_of_tokens.txt", "w") as out_file:
    for folder in os.listdir("dataset"):
        if folder.startswith("corpus"):
            print(f"Skipping {folder}")
        else:
            file_path = "dataset/" + folder + "/output.txt"
            print(f"Processing {file_path}")
            num_lines = count_lines(file_path)
            total_tokens = count_tokens(file_path, num_lines)
            print(f"Tokens in {folder}: {total_tokens}")
            out_file.write(f"{folder}: {total_tokens}\n")

Processing dataset/doctrina_fiscalia_es/output.txt


Processing lines:   0%|          | 0/74544 [00:00<?, ?it/s]

Processing lines: 100%|██████████| 74544/74544 [00:03<00:00, 20491.56it/s]


Tokens in doctrina_fiscalia_es: 4812960
Processing dataset/dictamenes_consejo_estado_es/output.txt


Processing lines: 100%|██████████| 5359576/5359576 [03:28<00:00, 25682.86it/s]


Tokens in dictamenes_consejo_estado_es: 245762157
Processing dataset/jrc_acquis_es/output.txt


Processing lines: 100%|██████████| 6006032/6006032 [02:38<00:00, 37875.86it/s]


Tokens in jrc_acquis_es: 109787641
Processing dataset/legislacion_boe_es/output.txt


Get goal number of tokens of each document

In [2]:
number_of_tokens = {
    "doctrina_fiscalia_es": 4812960,
    "dictamenes_consejo_estado_es": 245762157,
    "jrc_acquis_es": 109787641,
    "legislacion_boe_es": 1128769759,
    "codigos_universitarios_es": 22890142,
    "codigos_electronicos_es": 23896311,
    "un_opus_es": 6705807,
    "consultas_tributarias_es": 139173424,
    "patentes_medicas": 29574697,
    "dogc_ca-es": 253103213,
    "eurlex_es": 140790,
    "spanish_constitution_eu-ca-es": 34589,
    "abogacia_estado_boe_es": 11242268,
    "multiun_es": 644140252,
    "europarl_es": 97908059,
}

total_tokens = sum(number_of_tokens.values())
print(f"Total tokens: {total_tokens}")      # 2.7B

max_tokens = 300_000_000        # 300M

num_documents = len(number_of_tokens.keys())
print(f"Number of documents: {num_documents}")     # 15

# sort number_of_tokens by value
number_of_tokens = dict(sorted(number_of_tokens.items(), key=lambda x: x[1]))

print(number_of_tokens)

corpus_meta_dict = {}

# iterate through dict
for key, value in list(number_of_tokens.items()):
    if value < (max_tokens // num_documents):
        corpus_meta_dict[key] = value
        max_tokens -= value
        num_documents -= 1
        print(f"Adding {key} to corpus")
        # remove key from number_of_tokens
        del number_of_tokens[key]

print(corpus_meta_dict)
print(f"Lenght of corpus_meta_dict: {len(corpus_meta_dict)}")

for key, value in list(number_of_tokens.items()):
    tokens_allowed = max_tokens // num_documents
    # cut the document to only contain the allowed tokens
    corpus_meta_dict[key] = tokens_allowed
    max_tokens -= tokens_allowed
    num_documents -= 1

print(corpus_meta_dict)
total_tokens = sum(corpus_meta_dict.values())
print(f"Total tokens: {total_tokens}")      # 300M

# Save the meta data
with open("corpus_meta.txt", "w") as out_file:
    for key, value in corpus_meta_dict.items():
        out_file.write(f"{key}: {value}\n")


Total tokens: 2717942069
Number of documents: 15
{'spanish_constitution_eu-ca-es': 34589, 'eurlex_es': 140790, 'doctrina_fiscalia_es': 4812960, 'un_opus_es': 6705807, 'abogacia_estado_boe_es': 11242268, 'codigos_universitarios_es': 22890142, 'codigos_electronicos_es': 23896311, 'patentes_medicas': 29574697, 'europarl_es': 97908059, 'jrc_acquis_es': 109787641, 'consultas_tributarias_es': 139173424, 'dictamenes_consejo_estado_es': 245762157, 'dogc_ca-es': 253103213, 'multiun_es': 644140252, 'legislacion_boe_es': 1128769759}
Adding spanish_constitution_eu-ca-es to corpus
Adding eurlex_es to corpus
Adding doctrina_fiscalia_es to corpus
Adding un_opus_es to corpus
Adding abogacia_estado_boe_es to corpus
Adding codigos_universitarios_es to corpus
Adding codigos_electronicos_es to corpus
{'spanish_constitution_eu-ca-es': 34589, 'eurlex_es': 140790, 'doctrina_fiscalia_es': 4812960, 'un_opus_es': 6705807, 'abogacia_estado_boe_es': 11242268, 'codigos_universitarios_es': 22890142, 'codigos_electr

In [2]:
import pickle
import os
from tqdm import tqdm

def chunk_and_tokenize_text(file_path, tokenizer, max_length=2048, overlap=0, buffer_size=10_000_000, output_path="pretokenized_data_300M.pkl"):
    """
    Split a large text file into tokenized chunks of input_ids and attention_masks based on max_length tokens with optional overlap,
    and save them to disk.

    Args:
        file_path (str): Path to the large text file.
        tokenizer: Tokenizer for encoding the text.
        max_length (int): Maximum number of tokens per chunk.
        overlap (int): Number of tokens to overlap between chunks.
        buffer_size (int): Number of characters to read from the file at a time.
        output_path (str): Path to save the pretokenized dataset.

    Returns:
        None
    """
    tokenized_data = []
    leftover_tokens = []

    # Get file size to set up the progress bar
    file_size = os.path.getsize(file_path)

    with open(file_path, "r", encoding="utf-8") as f, tqdm(total=file_size, unit="B", unit_scale=True, desc="Processing file") as pbar:
        while True:
            # Read a portion of the text file
            text_chunk = f.read(buffer_size)
            if not text_chunk:
                break

            # Update progress bar
            pbar.update(len(text_chunk.encode("utf-8")))

            # Combine leftover tokens with the new chunk
            if leftover_tokens:
                text_chunk = tokenizer.decode(leftover_tokens, skip_special_tokens=True) + text_chunk

            # Tokenize the combined text
            tokens = tokenizer.encode(text_chunk, truncation=False)

            # Split tokens into chunks and save tokenized data
            for i in range(0, len(tokens), max_length - overlap):
                token_chunk = tokens[i:i + max_length]
                if len(token_chunk) < max_length:
                    # Save leftover tokens for the next iteration
                    leftover_tokens = token_chunk
                    break

                # Add token chunk with padding to match max_length
                padded_chunk = tokenizer.pad(
                    {"input_ids": [token_chunk]},
                    padding="max_length",
                    max_length=max_length,
                    return_attention_mask=True,
                    return_tensors="pt"
                )

                # Save tokenized chunk
                tokenized_data.append({
                    "input_ids": padded_chunk["input_ids"].squeeze(0).tolist(),
                    "attention_mask": padded_chunk["attention_mask"].squeeze(0).tolist()
                })
            else:
                leftover_tokens = []

    # Handle any leftover tokens
    if leftover_tokens:
        padded_chunk = tokenizer.pad(
            {"input_ids": [leftover_tokens]},
            padding="max_length",
            max_length=max_length,
            return_attention_mask=True,
            return_tensors="pt"
        )
        tokenized_data.append({
            "input_ids": padded_chunk["input_ids"].squeeze(0).tolist(),
            "attention_mask": padded_chunk["attention_mask"].squeeze(0).tolist()
        })

    # Save the tokenized data to disk
    with open(output_path, "wb") as f:
        pickle.dump(tokenized_data, f)

    print(f"Tokenized data saved to {output_path}")


In [3]:
import pickle
import os
from tqdm import tqdm

def chunk_and_tokenize_text_up_to(file_path, tokenizer, max_length=2048, overlap=0, buffer_size=10_000_000, max_tokens=300_000_000, output_path="pretokenized_data_300M.pkl"):
    """
    Split a large text file into tokenized chunks of input_ids and attention_masks based on max_length tokens with optional overlap,
    and save them to disk. Stop when the total number of tokens reaches max_tokens.

    Args:
        file_path (str): Path to the large text file.
        tokenizer: Tokenizer for encoding the text.
        max_length (int): Maximum number of tokens per chunk.
        overlap (int): Number of tokens to overlap between chunks.
        buffer_size (int): Number of characters to read from the file at a time.
        max_tokens (int): Maximum number of tokens to process.
        output_path (str): Path to save the pretokenized dataset.

    Returns:
        None
    """
    tokenized_data = []
    leftover_tokens = []
    total_tokens = 0  # Keep track of the total number of tokens processed

    # Get file size to set up the progress bar
    file_size = os.path.getsize(file_path)

    with open(file_path, "r", encoding="utf-8") as f, tqdm(total=file_size, unit="B", unit_scale=True, desc="Processing file") as pbar:
        while total_tokens < max_tokens:  # Stop if max_tokens is reached
            # Read a portion of the text file
            text_chunk = f.read(buffer_size)
            if not text_chunk:
                print("End of file reached")
                break

            # Update progress bar
            pbar.update(len(text_chunk.encode("utf-8")))

            # Combine leftover tokens with the new chunk
            if leftover_tokens:
                text_chunk = tokenizer.decode(leftover_tokens, skip_special_tokens=True) + text_chunk

            # Tokenize the combined text
            tokens = tokenizer.encode(text_chunk, truncation=False)

            # Split tokens into chunks and save tokenized data
            for i in range(0, len(tokens), max_length - overlap):
                token_chunk = tokens[i:i + max_length]
                if len(token_chunk) < max_length:
                    # Save leftover tokens for the next iteration if we can't fill the max_length
                    # This happens when len(tokens) is not a perfect multiple of max_length - overlap
                    leftover_tokens = token_chunk
                    break

                # Add token chunk with padding to match max_length
                padded_chunk = tokenizer.pad(
                    {"input_ids": [token_chunk]},
                    padding="max_length",
                    max_length=max_length,
                    return_attention_mask=True,
                    return_tensors="pt"
                )

                # Save tokenized chunk
                tokenized_data.append({
                    "input_ids": padded_chunk["input_ids"].squeeze(0).tolist(),
                    "attention_mask": padded_chunk["attention_mask"].squeeze(0).tolist()
                })
                total_tokens += len(token_chunk)  # Update the total token count

                # Check if max_tokens is reached
                if total_tokens >= max_tokens:
                    print(f"Reached max_tokens limit: {max_tokens}")
                    break
            else:
                # we enter into else if the for loop was not broken (with "break")
                # And terminates naturally
                leftover_tokens = []
                continue  # Continue outer loop if inner loop wasn't broken
            if total_tokens >= max_tokens:
                break  # Break outer loop if inner loop was broken

    # Save the tokenized data to disk
    with open(output_path, "wb") as f:
        pickle.dump(tokenized_data, f)

    print(f"Tokenized data saved to {output_path}")


In [None]:
# def chunk_text(file_path, tokenizer, max_length=2048, overlap=0, buffer_size=10_000_000):
#     """
#     Split a large text file into chunks of text based on max_length tokens with optional overlap.

#     Args:
#         file_path (str): Path to the large text file.
#         tokenizer: Tokenizer for encoding the text.
#         max_length (int): Maximum number of tokens per chunk.
#         overlap (int): Number of tokens to overlap between chunks.
#         buffer_size (int): Number of characters to read from the file at a time.

#     Returns:
#         list: List of text chunks corresponding to the token limits.
#     """
#     text_chunks = []
#     leftover_tokens = []

#     # Get file size to set up the progress bar
#     file_size = os.path.getsize(file_path)
#     processed_size = 0

#     with open(file_path, "r", encoding="utf-8") as f, tqdm(total=file_size, unit="B", unit_scale=True, desc="Processing file") as pbar:
#         while True:
#             # Read a portion of the text file
#             text_chunk = f.read(buffer_size)
#             if not text_chunk:
#                 break

#             processed_size += len(text_chunk.encode("utf-8"))
#             pbar.update(len(text_chunk.encode("utf-8")))

#             # Combine leftover tokens with the new chunk
#             if leftover_tokens:
#                 text_chunk = tokenizer.decode(leftover_tokens) + text_chunk

#             # Tokenize the combined text
#             tokens = tokenizer.encode(text_chunk, truncation=False)

#             # Split tokens into chunks and decode back to text
#             for i in range(0, len(tokens), max_length - overlap):
#                 token_chunk = tokens[i:i + max_length]
#                 if len(token_chunk) < max_length:
#                     # Save leftover tokens for the next iteration
#                     leftover_tokens = token_chunk
#                     break

#                 # Decode token chunk back to text and save
#                 text_chunks.append(tokenizer.decode(token_chunk))
#             else:
#                 leftover_tokens = []

#     # Add the remaining tokens as the last text chunk
#     if leftover_tokens:
#         text_chunks.append(tokenizer.decode(leftover_tokens))

#     return text_chunks

In [13]:
import pickle

In [12]:
# File path to the large text
file_path = "dataset/corpus_approx_300M.txt"

# Preprocess text into token chunks
# token_chunks = chunk_text(file_path, tokenizer, max_length=2048, overlap=256)
token_chunks = chunk_and_tokenize_text(file_path, tokenizer, max_length=2048, overlap=256)

Processing file:   0%|          | 0.00/1.10G [00:00<?, ?B/s]You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Processing file: 100%|██████████| 1.10G/1.10G [17:19<00:00, 1.06MB/s]


Tokenized data saved to pretokenized_data_300M.pkl


In [6]:
import os

documents = []
doc_name = []

for folder in os.listdir("dataset"):
    if folder.startswith("corpus"):
        print(f"Skipping {folder}")
    else:
        file_path = "dataset/" + folder + "/output.txt"
        documents.append(file_path)
        doc_name.append(folder)

tokenized_corpus = []

for i, doc in enumerate(documents):
    # Use the chunk_and_tokenize_text function to tokenize the document
    output_path = f"{doc.split('.')[0]}_tokenized.pkl"
    print(f"Tokenizing {doc_name[i]}")
    max_tokens = corpus_meta_dict[doc_name[i]]
    chunk_and_tokenize_text_up_to(doc, tokenizer, max_tokens=max_tokens, output_path=output_path)
    
    # Load the tokenized chunks
    with open(output_path, "rb") as f:
        tokenized_data = pickle.load(f)
    
    tokenized_corpus.extend(tokenized_data)

# Save the final corpus
with open("tokenized_diverse_corpus_300M.pkl", "wb") as f:
    pickle.dump(tokenized_corpus, f)
print(f"Final tokenized corpus saved with {len(tokenized_corpus)} chunks.")

Skipping corpus_half_approx_500M_chunks.pkl
Skipping corpus_half_approx_500M.txt
Skipping corpus_approx_300M.txt
Skipping corpus_tmp.txt
Skipping corpus_945M_tokens.txt
Tokenizing doctrina_fiscalia_es


Processing file:   0%|          | 0.00/17.0M [00:00<?, ?B/s]You're using a GPTNeoXTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Processing file: 100%|██████████| 17.0M/17.0M [00:15<00:00, 1.07MB/s]

Reached max_tokens limit: 4812960
Tokenized data saved to dataset/doctrina_fiscalia_es/output_tokenized.pkl





Tokenizing dictamenes_consejo_estado_es


Processing file:  12%|█▏        | 102M/867M [01:28<11:09, 1.14MB/s] 

Reached max_tokens limit: 28784642





Tokenized data saved to dataset/dictamenes_consejo_estado_es/output_tokenized.pkl
Tokenizing jrc_acquis_es


Processing file:  27%|██▋       | 102M/372M [01:34<04:09, 1.08MB/s] 

Reached max_tokens limit: 28784641





Tokenized data saved to dataset/jrc_acquis_es/output_tokenized.pkl
Tokenizing legislacion_boe_es


Processing file:   2%|▏         | 91.2M/3.77G [01:27<59:03, 1.04MB/s]  

Reached max_tokens limit: 28784642





Tokenized data saved to dataset/legislacion_boe_es/output_tokenized.pkl
Tokenizing codigos_universitarios_es


Processing file: 100%|██████████| 79.6M/79.6M [01:10<00:00, 1.13MB/s]

Reached max_tokens limit: 22890142





Tokenized data saved to dataset/codigos_universitarios_es/output_tokenized.pkl
Tokenizing codigos_electronicos_es


Processing file:  99%|█████████▊| 81.7M/82.8M [01:15<00:01, 1.08MB/s]

Reached max_tokens limit: 23896311





Tokenized data saved to dataset/codigos_electronicos_es/output_tokenized.pkl
Tokenizing un_opus_es


Processing file: 100%|██████████| 23.5M/23.5M [00:21<00:00, 1.11MB/s]

Reached max_tokens limit: 6705807
Tokenized data saved to dataset/un_opus_es/output_tokenized.pkl





Tokenizing consultas_tributarias_es


Processing file:  20%|██        | 102M/497M [01:35<06:10, 1.07MB/s] 

Reached max_tokens limit: 28784642





Tokenized data saved to dataset/consultas_tributarias_es/output_tokenized.pkl
Tokenizing patentes_medicas


Processing file: 100%|██████████| 87.1M/87.1M [01:34<00:00, 921kB/s] 

Reached max_tokens limit: 28784641





Tokenized data saved to dataset/patentes_medicas/output_tokenized.pkl
Tokenizing dogc_ca-es


Processing file:  11%|█         | 91.8M/865M [01:36<13:33, 949kB/s] 

Reached max_tokens limit: 28784642





Tokenized data saved to dataset/dogc_ca-es/output_tokenized.pkl
Tokenizing eurlex_es


Processing file: 100%|██████████| 490k/490k [00:00<00:00, 1.16MB/s]


Reached max_tokens limit: 140790
Tokenized data saved to dataset/eurlex_es/output_tokenized.pkl
Tokenizing spanish_constitution_eu-ca-es


Processing file: 100%|██████████| 123k/123k [00:00<00:00, 1.20MB/s]


Reached max_tokens limit: 34589
Tokenized data saved to dataset/spanish_constitution_eu-ca-es/output_tokenized.pkl
Tokenizing abogacia_estado_boe_es


Processing file: 100%|██████████| 38.5M/38.5M [00:38<00:00, 991kB/s] 

Reached max_tokens limit: 11242268





Tokenized data saved to dataset/abogacia_estado_boe_es/output_tokenized.pkl
Tokenizing multiun_es


Processing file:   4%|▍         | 102M/2.30G [01:50<39:52, 919kB/s]  

Reached max_tokens limit: 28784642





Tokenized data saved to dataset/multiun_es/output_tokenized.pkl
Tokenizing europarl_es


Processing file:  29%|██▊       | 102M/355M [01:50<04:34, 922kB/s]  

Reached max_tokens limit: 28784641





Tokenized data saved to dataset/europarl_es/output_tokenized.pkl
Final tokenized corpus saved with 146496 chunks.
