In [1]:
import os
import re

# Define the function to clean text
def clean_text(text):
    """Strip markup and noise from ``text`` and normalize its whitespace.

    Removed, in this order: HTML tags, URL-like runs (non-whitespace
    beginning at ``http`` or ``www``), e-mail-like tokens (non-whitespace
    around an ``@``), non-ASCII characters, and bracketed numeric
    references such as ``[12]``. Standalone numbers are kept.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string, with every whitespace run collapsed to a single
        space and no leading or trailing whitespace.
    """
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Collapse whitespace up front: this turns Unicode whitespace (e.g. a
    # non-breaking space) into a plain space *before* non-ASCII characters
    # are stripped, so the words it separates are not glued together.
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove non-ASCII characters
    text = re.sub(r'[^\x00-\x7F]+', '', text)

    # Remove numbers in square brackets (citation-style markers)
    text = re.sub(r'\[\d+\]', '', text)

    # The removals above leave gaps (double spaces, trailing spaces) where
    # the deleted tokens used to be, so normalize once more at the end.
    return re.sub(r'\s+', ' ', text).strip()

# Function to combine all text files into one
def combine_txt_files(input_folder):
    """Concatenate the contents of every ``.txt`` file in ``input_folder``.

    Files are read as UTF-8 in sorted filename order so the result is
    reproducible (``os.listdir`` itself returns entries in arbitrary
    order). Each file's content is followed by a blank line (``"\\n\\n"``).
    Entries that are not regular files are skipped, as are names that do
    not end in ``.txt``. Sub-folders are not searched.

    Args:
        input_folder: Path of the directory to scan.

    Returns:
        The combined text, or an empty string if no ``.txt`` file is found.
    """
    parts = []
    for filename in sorted(os.listdir(input_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(input_folder, filename)
        # A directory may also be named "*.txt"; opening it would raise.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            parts.append(file.read() + "\n\n")
    # Join once instead of growing a string with += inside the loop.
    return "".join(parts)

# Function to split text into chunks of max 500 characters
def chunk_text(text, chunk_size=500):
    """Split ``text`` into consecutive pieces of at most ``chunk_size`` characters.

    The text is cut at fixed offsets (0, chunk_size, 2*chunk_size, ...),
    so a cut may fall in the middle of a word. Each piece has surrounding
    whitespace stripped, and pieces that end up empty are dropped.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece (default 500).

    Returns:
        A list of non-empty, stripped pieces in their original order.
    """
    offsets = range(0, len(text), chunk_size)
    stripped = (text[start:start + chunk_size].strip() for start in offsets)
    # An empty string is falsy, so this filters out whitespace-only slices.
    return [piece for piece in stripped if piece]

# Main function to process the text
def process_text(input_folder, output_folder):
    """Combine, clean and chunk the text files, then write one file per chunk.

    The text returned by ``combine_txt_files(input_folder)`` is passed
    through ``clean_text`` and split with ``chunk_text`` (using its default
    chunk size). Each chunk is written as UTF-8 to
    ``<output_folder>/chunk_<n>.txt``, with ``n`` starting at 1, and a
    confirmation line is printed for every file saved.

    Args:
        input_folder: Directory containing the source text files.
        output_folder: Directory for the chunk files; created if missing.
    """
    # Steps 1-3: gather the raw text, clean it, and cut it into chunks.
    raw = combine_txt_files(input_folder)
    pieces = chunk_text(clean_text(raw))

    # Step 4: write each chunk to its own numbered file.
    os.makedirs(output_folder, exist_ok=True)
    for number, piece in enumerate(pieces, start=1):
        destination = os.path.join(output_folder, f"chunk_{number}.txt")
        with open(destination, 'w', encoding='utf-8') as sink:
            sink.write(piece)
        print(f"✅ Saved: {destination}")

# Example usage: read the .txt files from the first folder and write the
# processed chunks into the second one.
input_folder, output_folder = 'Data/txt', 'Data/processed'
process_text(input_folder, output_folder)

✅ Saved: Data/processed\chunk_1.txt
✅ Saved: Data/processed\chunk_2.txt
✅ Saved: Data/processed\chunk_3.txt
✅ Saved: Data/processed\chunk_4.txt
✅ Saved: Data/processed\chunk_5.txt
✅ Saved: Data/processed\chunk_6.txt
✅ Saved: Data/processed\chunk_7.txt
✅ Saved: Data/processed\chunk_8.txt
✅ Saved: Data/processed\chunk_9.txt
✅ Saved: Data/processed\chunk_10.txt
✅ Saved: Data/processed\chunk_11.txt
✅ Saved: Data/processed\chunk_12.txt
✅ Saved: Data/processed\chunk_13.txt
✅ Saved: Data/processed\chunk_14.txt
✅ Saved: Data/processed\chunk_15.txt
✅ Saved: Data/processed\chunk_16.txt
✅ Saved: Data/processed\chunk_17.txt
✅ Saved: Data/processed\chunk_18.txt
✅ Saved: Data/processed\chunk_19.txt
✅ Saved: Data/processed\chunk_20.txt
✅ Saved: Data/processed\chunk_21.txt
✅ Saved: Data/processed\chunk_22.txt
✅ Saved: Data/processed\chunk_23.txt
✅ Saved: Data/processed\chunk_24.txt
✅ Saved: Data/processed\chunk_25.txt
✅ Saved: Data/processed\chunk_26.txt
✅ Saved: Data/processed\chunk_27.txt
✅ Saved: D